diff --git a/.pipelines/build-linux.sh b/.pipelines/build-linux.sh index f4c92fda2..53f6a3a07 100644 --- a/.pipelines/build-linux.sh +++ b/.pipelines/build-linux.sh @@ -14,3 +14,8 @@ cd $DIR/../build/linux echo "----------- Build Docker Provider -------------------------------" make cd $DIR + +echo "------------ Bundle Shell Extension Scripts & HELM chart -------------------------" +cd $DIR/../deployment/arc-k8s-extension/ServiceGroupRoot/Scripts +tar -czvf ../artifacts.tar.gz ../../../../charts/azuremonitor-containers/ pushChartToAcr.sh + diff --git a/.pipelines/get-aad-app-creds-from-kv.sh b/.pipelines/get-aad-app-creds-from-kv.sh index 8ef56cddb..a0ba464cc 100755 --- a/.pipelines/get-aad-app-creds-from-kv.sh +++ b/.pipelines/get-aad-app-creds-from-kv.sh @@ -11,6 +11,8 @@ do KV) KV=$VALUE ;; KVSECRETNAMEAPPID) AppId=$VALUE ;; KVSECRETNAMEAPPSECRET) AppSecret=$VALUE ;; + KVSECRETNAMECDPXAPPID) CdpxAppId=$VALUE ;; + KVSECRETNAMECDPXAPPSECRET) CdpxAppSecret=$VALUE ;; *) esac done @@ -27,4 +29,16 @@ az keyvault secret download --file ~/acrappsecret --vault-name ${KV} --name ${A echo "downloaded the appsecret from KV:${KV} and KV secret:${AppSecret}" +echo "key vault secret name for cdpx appid:${KVSECRETNAMECDPXAPPID}" + +echo "key vault secret name for cdpx appsecret:${KVSECRETNAMECDPXAPPSECRET}" + +az keyvault secret download --file ~/cdpxacrappid --vault-name ${KV} --name ${CdpxAppId} + +echo "downloaded the appid from KV:${KV} and KV secret:${CdpxAppId}" + +az keyvault secret download --file ~/cdpxacrappsecret --vault-name ${KV} --name ${CdpxAppSecret} + +echo "downloaded the appsecret from KV:${KV} and KV secret:${CdpxAppSecret}" + echo "end: get app id and secret from specified key vault" diff --git a/.pipelines/pipeline.user.linux.yml b/.pipelines/pipeline.user.linux.yml index 57273111e..565661d64 100644 --- a/.pipelines/pipeline.user.linux.yml +++ b/.pipelines/pipeline.user.linux.yml @@ -24,10 +24,15 @@ restore: build: commands: - - !!defaultcommand + - 
!!buildcommand name: 'Build Docker Provider Shell Bundle' command: '.pipelines/build-linux.sh' fail_on_stderr: false + artifacts: + - from: 'deployment' + to: 'build' + include: + - '**' package: commands: diff --git a/.pipelines/pull-from-cdpx-and-push-to-ci-acr-linux-image.sh b/.pipelines/pull-from-cdpx-and-push-to-ci-acr-linux-image.sh old mode 100755 new mode 100644 index 638d3a937..e7d26245f --- a/.pipelines/pull-from-cdpx-and-push-to-ci-acr-linux-image.sh +++ b/.pipelines/pull-from-cdpx-and-push-to-ci-acr-linux-image.sh @@ -25,13 +25,32 @@ ACR_APP_ID=$(cat ~/acrappid) ACR_APP_SECRET=$(cat ~/acrappsecret) echo "end: read appid and appsecret" +echo "start: read appid and appsecret for cdpx" +CDPX_ACR_APP_ID=$(cat ~/cdpxacrappid) +CDPX_ACR_APP_SECRET=$(cat ~/cdpxacrappsecret) +echo "end: read appid and appsecret which has read access on cdpx acr" + + +# Name of CDPX_ACR should be in this format :Naming convention: 'cdpx' + service tree id without '-' + two digit suffix like'00'/'01 +# suffix 00 primary and 01 secondary, and we only use primary +# This configured via pipeline variable echo "login to cdpxlinux acr:${CDPX_ACR}" -docker login $CDPX_ACR --username $ACR_APP_ID --password $ACR_APP_SECRET -echo "login to cdpxlinux acr completed: ${CDPX_ACR}" +echo $CDPX_ACR_APP_SECRET | docker login $CDPX_ACR --username $CDPX_ACR_APP_ID --password-stdin +if [ $? -eq 0 ]; then + echo "login to cdpxlinux acr: ${CDPX_ACR} completed successfully." +else + echo "-e error login to cdpxlinux acr: ${CDPX_ACR} failed.Please see release task logs." + exit 1 +fi echo "pull agent image from cdpxlinux acr: ${CDPX_ACR}" -docker pull ${CDPX_ACR}/artifact/3170cdd2-19f0-4027-912b-1027311691a2/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} -echo "pull image from cdpxlinux acr completed: ${CDPX_ACR}" +docker pull ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} +if [ $? -eq 0 ]; then + echo "pulling of agent image from cdpxlinux acr: ${CDPX_ACR} completed successfully." 
+else + echo "-e error pulling of agent image from cdpxlinux acr: ${CDPX_ACR} failed.Please see release task logs." + exit 1 +fi echo "CI Release name is:"$CI_RELEASE imagetag=$CI_RELEASE$CI_IMAGE_TAG_SUFFIX @@ -41,14 +60,30 @@ echo "CI ACR : ${CI_ACR}" echo "CI AGENT REPOSITORY NAME : ${CI_AGENT_REPO}" echo "tag linux agent image" -docker tag ${CDPX_ACR}/artifact/3170cdd2-19f0-4027-912b-1027311691a2/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag} +docker tag ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag} +if [ $? -eq 0 ]; then + echo "tagging of linux agent image completed successfully." +else + echo "-e error tagging of linux agent image failed. Please see release task logs." + exit 1 +fi echo "login ciprod acr":$CI_ACR -docker login $CI_ACR --username $ACR_APP_ID --password $ACR_APP_SECRET -echo "login to ${CI_ACR} acr completed" +echo $ACR_APP_SECRET | docker login $CI_ACR --username $ACR_APP_ID --password-stdin +if [ $? -eq 0 ]; then + echo "login to ciprod acr: ${CI_ACR} completed successfully" +else + echo "-e error login to ciprod acr: ${CI_ACR} failed. Please see release task logs." + exit 1 +fi echo "pushing the image to ciprod acr:${CI_ACR}" docker push ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag} -echo "pushing the image to ciprod acr completed" +if [ $? -eq 0 ]; then + echo "pushing of the image to ciprod acr completed successfully" +else + echo "-e error pushing of image to ciprod acr failed. Please see release task logs." 
+ exit 1 +fi echo "end: pull linux agent image from cdpx and push to ciprod acr" diff --git a/.pipelines/pull-from-cdpx-and-push-to-ci-acr-windows-image.sh b/.pipelines/pull-from-cdpx-and-push-to-ci-acr-windows-image.sh old mode 100755 new mode 100644 index 066410af5..19fe55722 --- a/.pipelines/pull-from-cdpx-and-push-to-ci-acr-windows-image.sh +++ b/.pipelines/pull-from-cdpx-and-push-to-ci-acr-windows-image.sh @@ -25,13 +25,31 @@ ACR_APP_ID=$(cat ~/acrappid ) ACR_APP_SECRET=$(cat ~/acrappsecret) echo "end: read appid and appsecret" +echo "start: read appid and appsecret for cdpx" +CDPX_ACR_APP_ID=$(cat ~/cdpxacrappid) +CDPX_ACR_APP_SECRET=$(cat ~/cdpxacrappsecret) +echo "end: read appid and appsecret which has read access on cdpx acr" + +# Name of CDPX_ACR should be in this format :Naming convention: 'cdpx' + service tree id without '-' + two digit suffix like'00'/'01 +# suffix 00 primary and 01 secondary, and we only use primary +# This configured via pipeline variable echo "login to cdpxwindows acr:${CDPX_ACR}" -docker login $CDPX_ACR --username $ACR_APP_ID --password $ACR_APP_SECRET -echo "login to cdpxwindows acr:${CDPX_ACR} completed" +echo $CDPX_ACR_APP_SECRET | docker login $CDPX_ACR --username $CDPX_ACR_APP_ID --password-stdin +if [ $? -eq 0 ]; then + echo "login to cdpxwindows acr: ${CDPX_ACR} completed successfully." +else + echo "-e error login to cdpxwindows acr: ${CDPX_ACR} failed.Please see release task logs." + exit 1 +fi echo "pull image from cdpxwin acr: ${CDPX_ACR}" -docker pull ${CDPX_ACR}/artifact/3170cdd2-19f0-4027-912b-1027311691a2/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} -echo "pull image from cdpxwin acr completed: ${CDPX_ACR}" +docker pull ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} +if [ $? -eq 0 ]; then + echo "pulling of image from cdpxwin acr: ${CDPX_ACR} completed successfully." +else + echo "pulling of image from cdpxwin acr: ${CDPX_ACR} failed. Please see release task logs." 
+ exit 1 +fi echo "CI Release name:"$CI_RELEASE echo "CI Image Tax suffix:"$CI_IMAGE_TAG_SUFFIX @@ -40,14 +58,31 @@ imagetag="win-"$CI_RELEASE$CI_IMAGE_TAG_SUFFIX echo "agentimagetag="$imagetag echo "tag windows agent image" -docker tag ${CDPX_ACR}/artifact/3170cdd2-19f0-4027-912b-1027311691a2/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag} +docker tag ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag} +if [ $? -eq 0 ]; then + echo "tagging of windows agent image completed successfully." +else + echo "-e error tagging of windows agent image failed. Please see release task logs." + exit 1 +fi echo "login to ${CI_ACR} acr" -docker login $CI_ACR --username $ACR_APP_ID --password $ACR_APP_SECRET -echo "login to ${CI_ACR} acr completed" +echo $ACR_APP_SECRET | docker login $CI_ACR --username $ACR_APP_ID --password-stdin +if [ $? -eq 0 ]; then + echo "login to acr: ${CI_ACR} completed successfully." +else + echo "login to acr: ${CI_ACR} failed. Please see release task logs." + exit 1 +fi + echo "pushing the image to ciprod acr" docker push ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag} -echo "pushing the image to ciprod acr completed" +if [ $? -eq 0 ]; then + echo "pushing the image to ciprod acr completed successfully." +else + echo "pushing the image to ciprod acr failed. 
Please see release task logs" + exit 1 +fi echo "end: pull windows agent image from cdpx and push to ciprod acr" diff --git a/.pipelines/push-helm-chart-as-oci-artifact.sh b/.pipelines/push-helm-chart-to-canary-repos.sh similarity index 54% rename from .pipelines/push-helm-chart-as-oci-artifact.sh rename to .pipelines/push-helm-chart-to-canary-repos.sh index 50e16e3d0..db8bff56e 100644 --- a/.pipelines/push-helm-chart-as-oci-artifact.sh +++ b/.pipelines/push-helm-chart-to-canary-repos.sh @@ -1,8 +1,9 @@ #!/bin/bash -# push the helm chart as an OCI artifact to specified ACR # working directory of this script should be charts/azuremonitor-containers -export REPO_PATH="batch1/test/azure-monitor-containers" +# note: this repo registered in arc k8s extension for canary region +export REPO_PATH="public/azuremonitor/containerinsights/canary/preview/azuremonitor-containers" + export HELM_EXPERIMENTAL_OCI=1 for ARGUMENT in "$@" @@ -11,13 +12,13 @@ do VALUE=$(echo $ARGUMENT | cut -f2 -d=) case "$KEY" in - CIARCACR) CIARCACR=$VALUE ;; + CIACR) CIACR=$VALUE ;; CICHARTVERSION) CHARTVERSION=$VALUE ;; *) esac done -echo "CI ARC K8S ACR: ${CIARCACR}" +echo "CI ARC K8S ACR: ${CIACR}" echo "CI HELM CHART VERSION: ${CHARTVERSION}" echo "start: read appid and appsecret" @@ -25,18 +26,19 @@ ACR_APP_ID=$(cat ~/acrappid) ACR_APP_SECRET=$(cat ~/acrappsecret) echo "end: read appid and appsecret" -ACR=${CIARCACR} +ACR=${CIACR} + +echo "login to acr:${ACR} using helm" +helm registry login $ACR --username $ACR_APP_ID --password $ACR_APP_SECRET -echo "login to acr:${ACR} using oras" -oras login $ACR --username $ACR_APP_ID --password $ACR_APP_SECRET echo "login to acr:${ACR} completed: ${ACR}" echo "start: push the chart version: ${CHARTVERSION} to acr repo: ${ACR}" -echo "generate helm package" -helm package . +echo "save the chart locally with acr full path" +helm chart save . 
${ACR}/${REPO_PATH}:${CHARTVERSION} -echo "pushing the helm chart as an OCI artifact" -oras push ${ACR}/${REPO_PATH}:${CHARTVERSION} --manifest-config /dev/null:application/vnd.unknown.config.v1+json ./azuremonitor-containers-${CHARTVERSION}.tgz:application/tar+gzip +echo "pushing the helm chart to ACR: ${ACR}" +helm chart push ${ACR}/${REPO_PATH}:${CHARTVERSION} echo "end: push the chart version: ${CHARTVERSION} to acr repo: ${ACR}" diff --git a/.pipelines/push-helm-chart-to-prod-repos.sh b/.pipelines/push-helm-chart-to-prod-repos.sh new file mode 100644 index 000000000..71aa989de --- /dev/null +++ b/.pipelines/push-helm-chart-to-prod-repos.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# working directory of this script should be charts/azuremonitor-containers + +# this repo used without extension public preview release +export PROD_REPO_PATH="public/azuremonitor/containerinsights/preview/azuremonitor-containers" + +# note: this repo registered in arc k8s extension for prod group1 regions. +export EXTENSION_PROD_REPO_PATH="public/azuremonitor/containerinsights/prod1/preview/azuremonitor-containers" + +export HELM_EXPERIMENTAL_OCI=1 + +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + VALUE=$(echo $ARGUMENT | cut -f2 -d=) + + case "$KEY" in + CIACR) CIACR=$VALUE ;; + CICHARTVERSION) CHARTVERSION=$VALUE ;; + *) + esac +done + +echo "CI ARC K8S ACR: ${CIACR}" +echo "CI HELM CHART VERSION: ${CHARTVERSION}" + +echo "start: read appid and appsecret" +ACR_APP_ID=$(cat ~/acrappid) +ACR_APP_SECRET=$(cat ~/acrappsecret) +echo "end: read appid and appsecret" + +ACR=${CIACR} + +echo "login to acr:${ACR} using helm" +helm registry login $ACR --username $ACR_APP_ID --password $ACR_APP_SECRET + +echo "login to acr:${ACR} completed: ${ACR}" + +echo "start: push the chart version: ${CHARTVERSION} to acr repo: ${ACR}" + +echo "save the chart locally with acr full path: ${ACR}/${EXTENSION_PROD_REPO_PATH}:${CHARTVERSION}" +helm chart save . 
${ACR}/${EXTENSION_PROD_REPO_PATH}:${CHARTVERSION} + +echo "save the chart locally with acr full path: ${ACR}/${PROD_REPO_PATH}:${CHARTVERSION}" +helm chart save . ${ACR}/${PROD_REPO_PATH}:${CHARTVERSION} + +echo "pushing the helm chart to ACR: ${ACR}/${EXTENSION_PROD_REPO_PATH}:${CHARTVERSION}" +helm chart push ${ACR}/${EXTENSION_PROD_REPO_PATH}:${CHARTVERSION} + +echo "pushing the helm chart to ACR: ${ACR}/${PROD_REPO_PATH}:${CHARTVERSION}" +helm chart push ${ACR}/${PROD_REPO_PATH}:${CHARTVERSION} + +echo "end: push the chart version: ${CHARTVERSION} to acr repo: ${ACR}" diff --git a/.pipelines/update-place-holdres-in-e2e-tests.sh b/.pipelines/update-place-holdres-in-e2e-tests.sh new file mode 100755 index 000000000..5fec73684 --- /dev/null +++ b/.pipelines/update-place-holdres-in-e2e-tests.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +echo "start: update placeholders of e2e-tests.yaml ..." + +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + VALUE=$(echo $ARGUMENT | cut -f2 -d=) + + case "$KEY" in + TENANT_ID) TENANT_ID=$VALUE ;; + *) + esac +done + +echo "start: read appid and appsecret" +# used the same SP which used for acr +CLIENT_ID=$(cat ~/acrappid) +CLIENT_SECRET=$(cat ~/acrappsecret) +echo "end: read appid and appsecret" + +echo "Service Principal CLIENT_ID:$CLIENT_ID" +echo "replace CLIENT_ID value" +sed -i "s=SP_CLIENT_ID_VALUE=$CLIENT_ID=g" e2e-tests.yaml + +# only uncomment for debug purpose +# echo "Service Principal CLIENT_SECRET:$CLIENT_SECRET" +echo "replace CLIENT_SECRET value" +sed -i "s=SP_CLIENT_SECRET_VALUE=$CLIENT_SECRET=g" e2e-tests.yaml + +echo "Service Principal TENANT_ID:$TENANT_ID" +echo "replace TENANT_ID value" +sed -i "s=SP_TENANT_ID_VALUE=$TENANT_ID=g" e2e-tests.yaml + +echo "end: update placeholders of e2e-tests.yaml." 
diff --git a/.pipelines/validate-e2e-tests-results.sh b/.pipelines/validate-e2e-tests-results.sh new file mode 100644 index 000000000..c38fa0f50 --- /dev/null +++ b/.pipelines/validate-e2e-tests-results.sh @@ -0,0 +1,71 @@ +#!/bin/bash +echo "start: validating results of e2e-tests ..." +DEFAULT_SONOBUOY_VERSION="0.20.0" +DEFAULT_TIME_OUT_IN_MINS=60 +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + VALUE=$(echo $ARGUMENT | cut -f2 -d=) + + case "$KEY" in + SONOBUOY_VERSION) SONOBUOY_VERSION=$VALUE ;; + *) + esac +done + +if [ -z $SONOBUOY_VERSION ]; then + SONOBUOY_VERSION=$DEFAULT_SONOBUOY_VERSION +fi + +echo "sonobuoy version: ${SONOBUOY_VERSION}" + +echo "start: downloading sonobuoy" +curl -LO https://github.com/vmware-tanzu/sonobuoy/releases/download/v${SONOBUOY_VERSION}/sonobuoy_${SONOBUOY_VERSION}_linux_amd64.tar.gz +echo "end: downloading sonobuoy" + +echo "start: extract sonobuoy tar file" +mkdir -p sonobuoy-install/ +tar -zxf sonobuoy_${SONOBUOY_VERSION}_*.tar.gz -C sonobuoy-install/ +echo "end: extract sonobuoy tar file" + +echo "start: move sonobuoy binaries to /usr/local/bin/" +mv -f sonobuoy-install/sonobuoy /usr/local/bin/ +echo "end: move sonobuoy binaries to /usr/local/bin/" + +rm -rf sonobuoy_${SONOBUOY_VERSION}_*.tar.gz sonobuoy-install/ + +results=$(sonobuoy retrieve) +mins=0 +IsSucceeded=true +while [ $mins -le $DEFAULT_TIME_OUT_IN_MINS ] +do + # check the status + echo "checking test status" + status=$(sonobuoy status) + status=$(echo $status | sed 's/`//g') + if [[ $status == *"completed"* ]]; then + echo "test run completed" + mins=$DEFAULT_TIME_OUT_IN_MINS + if [[ $status == *"failed"* ]]; then + IsSucceeded=false + fi + else + echo "sleep for 1m to check the status again" + sleep 1m + fi + mins=$(( $mins + 1 )) +done +echo "status:${IsSucceeded}" + +results=$(sonobuoy retrieve) +sonobuoy results $results + +if $IsSucceeded == true; then + echo "all test passed" + exit 0 +else + echo "tests are failed. 
please review the results by downloading tar file via sonobuoy retrieve command" + exit 1 +fi + +echo "end: validating results of e2e-tests ..." diff --git a/Documentation/OSMPrivatePreview/Image1.jpg b/Documentation/OSMPrivatePreview/Image1.jpg new file mode 100644 index 000000000..04cd03ab1 Binary files /dev/null and b/Documentation/OSMPrivatePreview/Image1.jpg differ diff --git a/Documentation/OSMPrivatePreview/ReadMe.md b/Documentation/OSMPrivatePreview/ReadMe.md new file mode 100644 index 000000000..aa90c7413 --- /dev/null +++ b/Documentation/OSMPrivatePreview/ReadMe.md @@ -0,0 +1,71 @@ +Note - This is private preview. For any support issues, please reach out to us at [askcoin@microsoft.com](mailto:askcoin@microsoft.com). Please don't open a support ticket. + +# Azure Monitor Container Insights Open Service Mesh Monitoring + +Azure Monitor container insights now supporting preview of [Open Service Mesh(OSM)](https://docs.microsoft.com/azure/aks/servicemesh-osm-about) Monitoring. As part of this support, customer can: +1. Filter & view inventory of all the services that are part of your service mesh. +2. Visualize and monitor requests between services in your service mesh, with request latency, error rate & resource utilization by services. +3. Provides connection summary for OSM infrastructure running on AKS. + +## How to onboard Container Insights OSM monitoring? +OSM exposes Prometheus metrics which Container Insights can collect, for container insights agent to collect OSM metrics follow the following steps. + +1. Follow this [link](https://docs.microsoft.com/en-us/azure/aks/servicemesh-osm-about?pivots=client-operating-system-linux#register-the-aks-openservicemesh-preview-feature) as a prereq before enabling the addon. + +2. 
Enable AKS OSM addon on your + - [New AKS cluster](https://docs.microsoft.com/en-us/azure/aks/servicemesh-osm-about?pivots=client-operating-system-linux#install-open-service-mesh-osm-azure-kubernetes-service-aks-add-on-for-a-new-aks-cluster) + - [Existing AKS cluster](https://docs.microsoft.com/en-us/azure/aks/servicemesh-osm-about?pivots=client-operating-system-linux#enable-open-service-mesh-osm-azure-kubernetes-service-aks-add-on-for-an-existing-aks-cluster) +2. Configure OSM to allow Prometheus scraping, follow steps from [here](https://docs.microsoft.com/en-us/azure/aks/servicemesh-osm-about?pivots=client-operating-system-linux#configure-osm-to-allow-prometheus-scraping) +3. To enable namespace(s), download the osm client library [here](https://docs.microsoft.com/en-us/azure/aks/servicemesh-osm-about?pivots=client-operating-system-linux#osm-service-quotas-and-limits-preview) & then enable metrics on namespaces +```bash +# With osm +osm metrics enable --namespace test +osm metrics enable --namespace "test1, test2" + +``` +3. If you are using Azure Monitor Container Insights follow steps below, if not on-board [here.](https://docs.microsoft.com/azure/azure-monitor/containers/container-insights-overview) + * Download the configmap from [here](https://github.com/microsoft/Docker-Provider/blob/ci_prod/kubernetes/container-azm-ms-osmconfig.yaml) + * Add the namespaces you want to monitor in configmap `monitor_namespaces = ["namespace1", "namespace2"]` + * Run the following kubectl command: kubectl apply -f + * Example: `kubectl apply -f container-azm-ms-osmconfig.yaml` +4. The configuration change can take up to 15 minutes to finish before taking effect, and all omsagent pods in the cluster will restart. The restart is a rolling restart for all omsagent pods, so not all pods restart at the same time. + + +## Validate the metrics flow +1. 
Query cluster's Log Analytics workspace InsightsMetrics table to see whether metrics are flowing +``` +InsightsMetrics +| where Name contains "envoy" +| summarize count() by Name +``` + +## How to consume OSM monitoring dashboard? +1. Access your AKS cluster & Container Insights through this [link.](https://aka.ms/azmon/osmux) +2. Go to reports tab and access Open Service Mesh (OSM) workbook. +3. Select the time-range & namespace to scope your services. By default, we only show services deployed by customers and we exclude internal service communication. If you want to view it, select Show All in the filter. Please note OSM is a managed service mesh, so we show all internal connections for transparency. + +![alt text](https://github.com/microsoft/Docker-Provider/blob/saarorOSMdoc/Documentation/OSMPrivatePreview/Image1.jpg) +### Requests Tab +1. This tab provides you the summary of all the http requests sent via service to service in OSM. +2. You can view all the services and all the services it is communicating to by selecting the service in grid. +3. You can view total requests, request error rate & P90 latency. +4. You can drill-down to destination and view trends for HTTP error/success code, success rate, Pods resource utilization, latencies at different percentiles. + +### Connections Tab +1. This tab provides you a summary of all the connections between your services in Open Service Mesh. +2. Outbound connections: Total number of connections between Source and destination services. +3. Outbound active connections: Last count of active connections between source and destination in selected time range. +4. Outbound failed connections: Total number of failed connections between source and destination service + +### Troubleshooting guidance when Outbound active connections is 0 or failed connection count is >10k. +1. Please check your connection policy in OSM configuration. +2. If connection policy is fine, please refer to the OSM documentation. 
https://aka.ms/osm/tsg +3. From this view as well, you can drill-down to destination and view trends for HTTP error/success code, success rate, Pods resource utilization, latencies at different percentiles. + + +### Known Issues +1. The workbook has scale limits of 50 pods per namespace. If you have more than 50 pods in mesh you can have workbook loading issues. +2. When source or destination is osmcontroller we show no latency & for internal services we show no resource utilization. +3. When both prometheus scraping using pod annotations and OSM monitoring are enabled on the same set of namespaces, the default set of metrics (envoy_cluster_upstream_cx_total, envoy_cluster_upstream_cx_connect_fail, envoy_cluster_upstream_rq, envoy_cluster_upstream_rq_xx, envoy_cluster_upstream_rq_total, envoy_cluster_upstream_rq_time_bucket, envoy_cluster_upstream_cx_rx_bytes_total, envoy_cluster_upstream_cx_tx_bytes_total, envoy_cluster_upstream_cx_active) will be collected twice. You can follow [this](https://docs.microsoft.com/en-us/azure/azure-monitor/containers/container-insights-prometheus-integration#prometheus-scraping-settings) documentation to exclude these namespaces from pod annotation scraping using the setting monitor_kubernetes_pods_namespaces to work around this issue. + +This is private preview, the goal for us is to get feedback. Please feel free to reach out to us at [askcoin@microsoft.com](mailto:askcoin@microsoft.com) for any feedback and questions! diff --git a/Health/onboarding_instructions.md b/Health/onboarding_instructions.md index 9c07b2167..4c83577b5 100644 --- a/Health/onboarding_instructions.md +++ b/Health/onboarding_instructions.md @@ -6,12 +6,28 @@ For on-boarding to Health(Tab), you would need to complete two steps ## Configure agent through ConfigMap -1. Include the following section in ConfigMap yaml file -```cmd:agent-settings: |- - [agent_settings.health_model] +1. 
If you are configuring your existing ConfigMap, append the following section in your existing ConfigMap yaml file +``` +#Append this section in your existing configmap +agent-settings: |- + # agent health model feature settings + [agent_settings.health_model] + # In the absence of this configmap, default value for enabled is false + enabled = true +``` +2. Else if you don't have ConfigMap, download the new ConfigMap from [here.](https://github.com/microsoft/Docker-Provider/blob/ci_prod/kubernetes/container-azm-ms-agentconfig.yaml) & then set `enabled =true` + +``` +#For new downloaded configmap enabled this default setting to true +agent-settings: |- + # agent health model feature settings + [agent_settings.health_model] + # In the absence of this configmap, default value for enabled is false enabled = true ``` -2. Run the following kubectl command: + + +3. Run the following kubectl command: `kubectl apply -f ` Example: `kubectl apply -f container-azm-ms-agentconfig.yaml`. diff --git a/README.md b/README.md index 3eec1f344..555234c61 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,7 @@ The general directory structure is: │ │ ├── acrworkflows/ - acr work flows for the Linux Agent container image │ │ ├── defaultpromenvvariables - default environment variables for Prometheus scraping │ │ ├── defaultpromenvvariables-rs - cluster level default environment variables for Prometheus scraping +│ │ ├── defaultpromenvvariables-sidecar - cluster level default environment variables for Prometheus scraping in sidecar │ ├── windows/ - scripts to build the Docker image for Windows Agent │ │ ├── dockerbuild - script to build the code and docker imag, and publish docker image │ │ ├── acrworkflows/ - acr work flows for the Windows Agent container image @@ -91,6 +92,7 @@ The general directory structure is: │ │ | ... 
- plugins in, out and filters code in ruby │ ├── toml-parser/ - code for parsing of toml configuration files ├── test/ - source code for tests +│ ├── e2e/ - e2e tests to validate agent and e2e workflow(s) │ ├── unit-tests/ - unit tests code │ ├── scenario/ - scenario tests code ├── !_README.md - this file @@ -271,6 +273,36 @@ For DEV and PROD branches, automatically deployed latest yaml with latest agent # E2E Tests +## For executing tests + +1. Deploy the omsagent.yaml with your agent image. In the yaml, make sure `ISTEST` environment variable is set to `true` if it's not set already +2. Update the Service Principal CLIENT_ID, CLIENT_SECRET and TENANT_ID placeholder values and apply e2e-tests.yaml to execute the tests + > Note: Service Principal requires reader role on log analytics workspace and cluster resource to query LA and metrics + ``` + cd ~/Docker-Provider/test/e2e # based on your repo path + kubectl apply -f e2e-tests.yaml # this will trigger job to run the tests in sonobuoy namespace + kubectl get po -n sonobuoy # to check the pods and jobs associated to tests + ``` +3. Download [sonobuoy](https://github.com/vmware-tanzu/sonobuoy/releases) on your dev box to view the results of the tests + ``` + results=$(sonobuoy retrieve) # downloads tar file which has logs and test results + sonobuoy results $results # get the summary of the results + tar -xzvf # extract downloaded tar file and look for pod logs, results and other k8s resources if there are any failures + ``` + +## For adding new tests + +1. Add the test python file with your test code under `tests` directory +2. Build the docker image, recommended to use ACR & MCR + ``` + cd ~/Docker-Provider/test/e2e/src # based on your repo path + docker login -u -p # login to acr + docker build -f ./core/Dockerfile -t /: . + docker push /: + ``` +3. 
update existing agentest image tag in e2e-tests.yaml with newly built image tag with MCR repo + +# Scenario Tests Clusters are used in release pipeline already has the yamls under test\scenario deployed. Make sure to validate these scenarios. If you have new interesting scenarios, please add/update them. diff --git a/ReleaseNotes.md b/ReleaseNotes.md index d7e4c99a9..78881e2d1 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,12 +11,122 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) -### 09/25/2020 - -> Note: This is hotfix agent release targetted ONLY for non-AKS clusters via Azure Monitor for containers HELM chart update -##### Version microsoft/oms:ciprod09252020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod09252020 (linux) -##### Version microsoft/oms:win-ciprod09252020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod09252020 (windows) +### 05/20/2021 hotfix - +##### Version microsoft/oms:ciprod05202021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021-hotfix (linux) +##### No Windows changes with this release, win-ciprod04222021 still current. ##### Code change log -- Fix azure json file crash in MDM path for azure arc k8s connected clusters +- Fixing a bug where the input plugin for NodeInventory dies if any windows nodes are removed from the cluster. + +### 05/20/2021 - +##### Version microsoft/oms:ciprod05202021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021 (linux) +##### No Windows changes with this release, win-ciprod04222021 still current. 
+##### Code change log +- Telegraf now waits 30 seconds on startup for network connections to complete (Linux only) +- Change adding telegraf to the liveness probe reverted (Linux only) + +### 05/12/2021 - +##### Version microsoft/oms:ciprod05122021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05122021 (linux) +##### No Windows changes with this release, win-ciprod04222021 still current. +##### Code change log +- Upgrading oneagent to version 1.8 (only for Linux) +- Enabling oneagent for container logs for East US 2 + +### 04/22/2021 - +##### Version microsoft/oms:ciprod04222021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod04222021 (linux) +##### Version microsoft/oms:win-ciprod04222021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod04222021 (windows) +##### Code change log +- Bug fixes for metrics cpuUsagePercentage and memoryWorkingSetPercentage for windows nodes +- Added metrics for threshold violation +- Made Job completion metric configurable +- Updated default buffer sizes in fluent-bit +- Updated recommended alerts +- Fixed bug where logs written before agent starts up were not collected +- Fixed bug which kept agent logs from being rotated +- Bug fix for Windows Containerd container log collection +- Bug fixes +- Doc updates +- Minor telemetry changes + +### 03/26/2021 - +##### Version microsoft/oms:ciprod03262021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021 (linux) +##### Version microsoft/oms:win-ciprod03262021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod03262021 (windows) +##### Code change log +- Started collecting new metric - kubelet running pods count +- Onboarding script fixes to add explicit json output +- Proxy and token updates for ARC +- Doc updates for Microsoft charts repo release +- Bug fixes for trailing whitespaces in enable-monitoring.sh script +- Support for higher volume of prometheus metrics by 
scraping metrics from sidecar +- Update to get new version of telegraf - 1.18 +- Add label and field selectors for prometheus scraping using annotations +- Support for OSM integration +- Removed wireserver calls to get CA certs since access is removed +- Added liveness timeout for exec for linux containers + +### 02/23/2021 - +##### Version microsoft/oms:ciprod02232021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod02232021 (linux) +##### Version microsoft/oms:win-ciprod02232021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod02232021 (windows) +##### Code change log +- ContainerLogV2 schema support for LogAnalytics & ADX (not usable externally yet) +- Fix nodemetrics (cpuusageprecentage & memoryusagepercentage) metrics not flowing. This is fixed upstream for k8s versions >= 1.19.7 and >=1.20.2. +- Fix cpu & memory usage exceeded threshold container metrics not flowing when requests and/or limits were not set +- Mute some unused exceptions from going to telemetry +- Collect containerimage (repository, image & imagetag) from spec (instead of runtime) +- Add support for extension MSI for k8s arc +- Use cloud specific instrumentation keys for telemetry +- Picked up newer version for apt +- Add priority class to daemonset (in our chart only) + +### 01/11/2021 - +##### Version microsoft/oms:ciprod01112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01112021 (linux) +##### Version microsoft/oms:win-ciprod01112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod01112021 (windows) +##### Code change log +- Fixes for Linux Agent Replicaset Pod OOMing issue +- Update fluentbit (1.14.2 to 1.6.8) for the Linux Daemonset +- Make Fluentbit settings: log_flush_interval_secs, tail_buf_chunksize_megabytes and tail_buf_maxsize_megabytes configurable via configmap +- Support for PV inventory collection +- Removal of Custom metric region check for Public cloud regions and update to use 
cloud environment variable to determine the custom metric support +- For daemonset pods, add the dnsconfig to use ndots: 3 from ndots:5 to optimize the number of DNS API calls made +- Fix for inconsistency in the collection container environment variables for the pods which has high number of containers +- Fix for disabling of std{out;err} log_collection_settings via configmap issue in windows daemonset +- Update to use workspace key from mount file rather than environment variable for windows daemonset agent +- Remove per container info logs in the container inventory +- Enable ADX route for windows container logs +- Remove logging to termination log in windows agent liveness probe + +### 11/09/2020 - +##### Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020 (linux) +##### Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod11092020 (windows) +##### Code change log +- Fix for duplicate windows metrics + +### 10/27/2020 - +##### Version microsoft/oms:ciprod10272020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10272020 (linux) +##### Version microsoft/oms:win-ciprod10272020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10272020 (windows) +##### Code change log +- Activate oneagent in few AKS regions (koreacentral,norwayeast) +- Disable syslog +- Fix timeout for Windows daemonset liveness probe +- Make request == limit for Windows daemonset resources (cpu & memory) +- Schema v2 for container log (ADX only - applicable only for select customers for piloting) + +### 10/05/2020 - +##### Version microsoft/oms:ciprod10052020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10052020 (linux) +##### Version microsoft/oms:win-ciprod10052020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10052020 (windows) +##### Code change log +- Health CRD to version v1 (from v1beta1) for k8s versions >= 1.19.0 +- Collection of PV usage 
metrics for PVs mounted by pods (kube-system pods excluded by default)(doc-link-needed) +- Zero fill few custom metrics under a timer, also add zero filling for new PV usage metrics +- Collection of additional Kubelet metrics ('kubelet_running_pod_count','volume_manager_total_volumes','kubelet_node_config_error','process_resident_memory_bytes','process_cpu_seconds_total','kubelet_runtime_operations_total','kubelet_runtime_operations_errors_total'). This also includes updates to 'kubelet' workbook to include these new metrics +- Collection of Azure NPM (Network Policy Manager) metrics (basic & advanced. By default, NPM metrics collection is turned OFF)(doc-link-needed) +- Support log collection when docker root is changed with knode. Tracked by [this](https://github.com/Azure/AKS/issues/1373) issue +- Support for Pods in 'Terminating' state for nodelost scenarios +- Fix for reduction in telemetry for custom metrics ingestion failures +- Fix CPU capacity/limits metrics being 0 for Virtual nodes (VK) +- Add new custom metric regions (eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth) +- Enable strict SSL validation for AppInsights Ruby SDK +- Turn off custom metrics upload for unsupported cluster types +- Install CA certs from wire server for windows (in certain clouds) ### 09/16/2020 - > Note: This agent release targetted ONLY for non-AKS clusters via Azure Monitor for containers HELM chart update diff --git a/ReleaseProcess.md b/ReleaseProcess.md index 19802e22c..8ec91546c 100644 --- a/ReleaseProcess.md +++ b/ReleaseProcess.md @@ -35,17 +35,55 @@ Image automatically synched to MCR CN from Public cloud MCR. - Refer to internal docs for the release process and instructions. -## ARO v3 - -This needs to be co-ordinated with Red hat and ARO-RP team for the release and Red hat team will pick up the changes for the release. - ## AKS-Engine Make PR against [AKS-Engine](https://github.com/Azure/aks-engine). 
Refer PR https://github.com/Azure/aks-engine/pull/2318 -## ARO v4, On-prem K8s, Azure Arc K8s and OpenShift v4 clusters - -Make PR against [HELM-charts](https://github.com/helm/charts) with Azure Monitor for containers chart update. +## Arc for Kubernetes + +An Ev2 pipeline is used to deploy the chart of the Arc K8s Container Insights Extension as per the Safe Deployment Process. +Here is the high-level process: +``` + 1. Specify the chart version of the release candidate and trigger [container-insights-arc-k8s-extension-ci_prod-release](https://github-private.visualstudio.com/microsoft/_release?_a=releases&view=all) + 2. Get approval for the release from one of the team members + 3. Once approved, the release should be triggered automatically + 4. Use `cimon-arck8s-eastus2euap` for validating the latest release in the canary region + 5. TBD - Notify the vendor team for validation on all Arc K8s supported platforms +``` + +## Microsoft Charts Repo release for On-prem K8s + +Since the HELM charts repo is being deprecated, the Microsoft charts repo is used for the HELM chart release for on-prem K8s clusters. +To make a chart release PR, fork [Microsoft-charts-repo](https://github.com/microsoft/charts/tree/gh-pages) and make the PR against the `gh-pages` branch of the upstream repo. + +Refer PR - https://github.com/microsoft/charts/pull/23 for example. +Once the PR is merged, the latest version of the HELM chart should be available within a couple of minutes in https://microsoft.github.io/charts/repo and https://artifacthub.io/. + +Instructions to create PR +``` +# 1. create helm package for the release candidate + git clone git@github.com:microsoft/Docker-Provider.git + git checkout ci_prod + cd ~/Docker-Provider/charts/azuremonitor-containers # this path is based on where you have cloned the repo + helm package . + +# 2. clone your fork repo and checkout gh-pages branch # gh-pages branch is used as the release branch + cd ~ + git clone YOUR_FORK_CLONE_URL # replace YOUR_FORK_CLONE_URL with the clone URL of your fork of microsoft/charts + cd ~/charts # assumed the root dir of the clone is charts + git checkout gh-pages + +# 3.
copy release candidate helm package + cd ~/charts/repo/azuremonitor-containers + # replace CHART_VERSION with the version of the chart being released + cp ~/Docker-Provider/charts/azuremonitor-containers/azuremonitor-containers-CHART_VERSION.tgz . + cd ~/charts/repo + # update repo index file + helm repo index . + +# 4. Review the changes and make PR. Please note, you may need to revert unrelated changes automatically added by `helm repo index .` command + +``` # 4. Monitor agent roll-out status diff --git a/alerts/recommended_alerts_ARM/PVUsagePercentage.json b/alerts/recommended_alerts_ARM/PVUsagePercentage.json new file mode 100644 index 000000000..e6cdbee15 --- /dev/null +++ b/alerts/recommended_alerts_ARM/PVUsagePercentage.json @@ -0,0 +1,174 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "alertName": { + "type": "string", + "minLength": 1, + "metadata": { + "description": "Name of the alert" + } + }, + "alertDescription": { + "type": "string", + "defaultValue": "This is a metric alert", + "metadata": { + "description": "Description of alert" + } + }, + "alertSeverity": { + "type": "int", + "defaultValue": 3, + "allowedValues": [ + 0, + 1, + 2, + 3, + 4 + ], + "metadata": { + "description": "Severity of alert {0,1,2,3,4}" + } + }, + "isEnabled": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Specifies whether the alert is enabled" + } + }, + "clusterResourceId": { + "type": "string", + "minLength": 1, + "metadata": { + "description": "Full Resource ID of the kubernetes cluster emitting the metric that will be used for the comparison. 
For example /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroups/ResourceGroupName/providers/Microsoft.ContainerService/managedClusters/cluster-xyz" + } + }, + "operator": { + "type": "string", + "defaultValue": "GreaterThan", + "allowedValues": [ + "Equals", + "NotEquals", + "GreaterThan", + "GreaterThanOrEqual", + "LessThan", + "LessThanOrEqual" + ], + "metadata": { + "description": "Operator comparing the current value with the threshold value." + } + }, + "threshold": { + "type": "int", + "defaultValue": 80, + "metadata": { + "description": "The threshold value at which the alert is activated." + }, + "minValue": 1, + "maxValue": 100 + }, + "timeAggregation": { + "type": "string", + "defaultValue": "Average", + "allowedValues": [ + "Average", + "Minimum", + "Maximum", + "Count" + ], + "metadata": { + "description": "How the data that is collected should be combined over time." + } + }, + "windowSize": { + "type": "string", + "defaultValue": "PT5M", + "allowedValues": [ + "PT1M", + "PT5M", + "PT15M", + "PT30M", + "PT1H", + "PT6H", + "PT12H", + "PT24H" + ], + "metadata": { + "description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format." 
+ } + }, + "evaluationFrequency": { + "type": "string", + "defaultValue": "PT1M", + "allowedValues": [ + "PT1M", + "PT5M", + "PT15M", + "PT30M", + "PT1H" + ], + "metadata": { + "description": "how often the metric alert is evaluated represented in ISO 8601 duration format" + } + }, + "actionGroupId": { + "type": "string", + "defaultValue": "", + "metadata": { + "description": "The ID of the action group that is triggered when the alert is activated or deactivated" + } + } + }, + "variables": {}, + "resources": [ + { + "name": "[parameters('alertName')]", + "type": "Microsoft.Insights/metricAlerts", + "location": "global", + "apiVersion": "2018-03-01", + "tags": {}, + "properties": { + "description": "[parameters('alertDescription')]", + "severity": "[parameters('alertSeverity')]", + "enabled": "[parameters('isEnabled')]", + "scopes": [ + "[parameters('clusterResourceId')]" + ], + "evaluationFrequency": "[parameters('evaluationFrequency')]", + "windowSize": "[parameters('windowSize')]", + "criteria": { + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria", + "allOf": [ + { + "name": "1st criterion", + "metricName": "pvUsageExceededPercentage", + "metricNamespace": "Insights.Container/persistentvolumes", + "dimensions": [ + { + "name": "kubernetesNamespace", + "operator": "Include", + "values": [ + "*" + ] + }, + { + "name": "podName", + "operator": "Include", + "values": [ + "*" + ] + } + ], + "operator": "[parameters('operator')]", + "threshold": "[parameters('threshold')]", + "timeAggregation": "[parameters('timeAggregation')]", + "skipMetricValidation": true + } + ] + }, + "actions": "[if(empty(parameters('actionGroupId')), json('null'), json(concat('[{\"actionGroupId\": \"',parameters('actionGroupId'),'\"}]')))]" + } + } + ] +} diff --git a/build/common/installer/scripts/td-agent-bit-conf-customizer.rb b/build/common/installer/scripts/td-agent-bit-conf-customizer.rb index fae3acb36..ea1536866 100644 --- 
a/build/common/installer/scripts/td-agent-bit-conf-customizer.rb +++ b/build/common/installer/scripts/td-agent-bit-conf-customizer.rb @@ -3,7 +3,9 @@ @td_agent_bit_conf_path = "/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf" -@default_service_interval = "15" +@default_service_interval = "1" +@default_buffer_chunk_size = "1" +@default_buffer_max_size = "1" def is_number?(value) true if Integer(value) rescue false @@ -18,12 +20,17 @@ def substituteFluentBitPlaceHolders bufferChunkSize = ENV["FBIT_TAIL_BUFFER_CHUNK_SIZE"] bufferMaxSize = ENV["FBIT_TAIL_BUFFER_MAX_SIZE"] - serviceInterval = (!interval.nil? && is_number?(interval)) ? interval : @default_service_interval + serviceInterval = (!interval.nil? && is_number?(interval) && interval.to_i > 0 ) ? interval : @default_service_interval serviceIntervalSetting = "Flush " + serviceInterval - tailBufferChunkSize = (!bufferChunkSize.nil? && is_number?(bufferChunkSize)) ? bufferChunkSize : nil + tailBufferChunkSize = (!bufferChunkSize.nil? && is_number?(bufferChunkSize) && bufferChunkSize.to_i > 0) ? bufferChunkSize : @default_buffer_chunk_size - tailBufferMaxSize = (!bufferMaxSize.nil? && is_number?(bufferMaxSize)) ? bufferMaxSize : nil + tailBufferMaxSize = (!bufferMaxSize.nil? && is_number?(bufferMaxSize) && bufferMaxSize.to_i > 0) ? bufferMaxSize : @default_buffer_max_size = "1" + + if ((!tailBufferChunkSize.nil? && tailBufferMaxSize.nil?) || (!tailBufferChunkSize.nil? && !tailBufferMaxSize.nil? 
&& tailBufferChunkSize.to_i > tailBufferMaxSize.to_i)) + puts "config:warn buffer max size must be greater or equal to chunk size" + tailBufferMaxSize = tailBufferChunkSize + end text = File.read(@td_agent_bit_conf_path) new_contents = text.gsub("${SERVICE_FLUSH_INTERVAL}", serviceIntervalSetting) diff --git a/build/common/installer/scripts/tomlparser-prom-customconfig.rb b/build/common/installer/scripts/tomlparser-prom-customconfig.rb new file mode 100644 index 000000000..819c1956f --- /dev/null +++ b/build/common/installer/scripts/tomlparser-prom-customconfig.rb @@ -0,0 +1,423 @@ +#!/usr/local/bin/ruby + +#this should be require relative in Linux and require in windows, since it is a gem install on windows +@os_type = ENV["OS_TYPE"] +if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + require "tomlrb" +else + require_relative "tomlrb" +end +# require_relative "tomlrb" +require_relative "ConfigParseErrorLogger" +require "fileutils" + +@promConfigMapMountPath = "/etc/config/settings/prometheus-data-collection-settings" +@replicaset = "replicaset" +@daemonset = "daemonset" +@promSideCar = "prometheussidecar" +@windows = "windows" +@configSchemaVersion = "" +@defaultDsInterval = "1m" +@defaultDsPromUrls = [] +@defaultDsFieldPass = [] +@defaultDsFieldDrop = [] +@defaultRsInterval = "1m" +@defaultRsPromUrls = [] +@defaultRsFieldPass = [] +@defaultRsFieldDrop = [] +@defaultRsK8sServices = [] +# @defaultRsMonitorPods = false +@defaultCustomPrometheusInterval = "1m" +@defaultCustomPrometheusFieldPass = [] +@defaultCustomPrometheusFieldDrop = [] +@defaultCustomPrometheusMonitorPods = false +@defaultCustomPrometheusLabelSelectors = "" +@defaultCustomPrometheusFieldSelectors = "" + +#Configurations to be used for the auto-generated input prometheus plugins for namespace filtering +@metricVersion = 2 +@monitorKubernetesPodsVersion = 2 +@urlTag = "scrapeUrl" +@bearerToken = "/var/run/secrets/kubernetes.io/serviceaccount/token" +@responseTimeout = 
"15s" +@tlsCa = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" +@insecureSkipVerify = true + +# Checking to see if this is the daemonset or replicaset to parse config accordingly +@controller = ENV["CONTROLLER_TYPE"] +@containerType = ENV["CONTAINER_TYPE"] +@sidecarScrapingEnabled = ENV["SIDECAR_SCRAPING_ENABLED"] + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@promConfigMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values for prometheus config map" + parsedConfig = Tomlrb.load_file(@promConfigMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted prometheus config map" + return parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults for prometheus scraping" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for prometheus config: #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +def checkForTypeArray(arrayValue, arrayType) + if (arrayValue.nil? || (arrayValue.kind_of?(Array) && ((arrayValue.length == 0) || (arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType))))) + return true + else + return false + end +end + +def checkForType(variable, varType) + if variable.nil? 
|| variable.kind_of?(varType) + return true + else + return false + end +end + +def replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + begin + puts "config::Starting to substitute the placeholders in telegraf conf copy file with no namespace filters" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS", ("monitor_kubernetes_pods = #{monitorKubernetesPods}")) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE", ("pod_scrape_scope = \"#{(@controller.casecmp(@replicaset) == 0) ? "cluster" : "node"}\"")) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER", "") + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR", ("kubernetes_label_selector = \"#{kubernetesLabelSelectors}\"")) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR", ("kubernetes_field_selector = \"#{kubernetesFieldSelectors}\"")) + rescue => errorStr + puts "Exception while replacing default pod monitor settings for custom prometheus scraping: #{errorStr}" + end + return new_contents +end + +def createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting, kubernetesLabelSelectors, kubernetesFieldSelectors) + begin + puts "config::Starting to substitute the placeholders in telegraf conf copy file with namespace filters" + + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS") + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR") + new_contents = 
new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR") + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE") + + pluginConfigsWithNamespaces = "" + monitorKubernetesPodsNamespaces.each do |namespace| + if !namespace.nil? + #Stripping namespaces to remove leading and trailing whitespaces + namespace.strip! + if namespace.length > 0 + pluginConfigsWithNamespaces += "\n[[inputs.prometheus]] + interval = \"#{interval}\" + monitor_kubernetes_pods = true + pod_scrape_scope = \"#{(@controller.casecmp(@replicaset) == 0) ? "cluster" : "node"}\" + monitor_kubernetes_pods_namespace = \"#{namespace}\" + kubernetes_label_selector = \"#{kubernetesLabelSelectors}\" + kubernetes_field_selector = \"#{kubernetesFieldSelectors}\" + fieldpass = #{fieldPassSetting} + fielddrop = #{fieldDropSetting} + metric_version = #{@metricVersion} + url_tag = \"#{@urlTag}\" + bearer_token = \"#{@bearerToken}\" + response_timeout = \"#{@responseTimeout}\" + tls_ca = \"#{@tlsCa}\" + insecure_skip_verify = #{@insecureSkipVerify}\n" + end + end + end + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER", pluginConfigsWithNamespaces) + return new_contents + rescue => errorStr + puts "Exception while creating prometheus input plugins to filter namespaces for custom prometheus: #{errorStr}, using defaults" + replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + end +end + +# Use the ruby structure created after config parsing to set the right values to be used as environment variables +def populateSettingValuesFromConfigMap(parsedConfig) + if !@controller.nil? + if !parsedConfig.nil? 
&& !parsedConfig[:prometheus_data_collection_settings].nil? + if @controller.casecmp(@replicaset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:cluster].nil? + #Get prometheus replicaset custom config settings + begin + interval = parsedConfig[:prometheus_data_collection_settings][:cluster][:interval] + fieldPass = parsedConfig[:prometheus_data_collection_settings][:cluster][:fieldpass] + fieldDrop = parsedConfig[:prometheus_data_collection_settings][:cluster][:fielddrop] + urls = parsedConfig[:prometheus_data_collection_settings][:cluster][:urls] + kubernetesServices = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_services] + + # Remove below 4 lines after phased rollout + monitorKubernetesPods = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods] + monitorKubernetesPodsNamespaces = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods_namespaces] + kubernetesLabelSelectors = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_label_selector] + kubernetesFieldSelectors = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_field_selector] + + # Check for the right datatypes to enforce right setting values + if checkForType(interval, String) && + checkForTypeArray(fieldPass, String) && + checkForTypeArray(fieldDrop, String) && + checkForTypeArray(kubernetesServices, String) && + checkForTypeArray(urls, String) && + # Remove below check after phased rollout + checkForType(kubernetesLabelSelectors, String) && + checkForType(kubernetesFieldSelectors, String) && + (monitorKubernetesPods.nil? || (!monitorKubernetesPods.nil? && (!!monitorKubernetesPods == monitorKubernetesPods))) # Checking for Boolean type, since 'Boolean' is not defined as a type in ruby + puts "config::Successfully passed typecheck for config settings for replicaset" + #if setting is nil assign default values + interval = (interval.nil?) ? 
@defaultRsInterval : interval + fieldPass = (fieldPass.nil?) ? @defaultRsFieldPass : fieldPass + fieldDrop = (fieldDrop.nil?) ? @defaultRsFieldDrop : fieldDrop + kubernetesServices = (kubernetesServices.nil?) ? @defaultRsK8sServices : kubernetesServices + urls = (urls.nil?) ? @defaultRsPromUrls : urls + # Remove below lines after phased rollout + monitorKubernetesPods = (monitorKubernetesPods.nil?) ? @defaultRsMonitorPods : monitorKubernetesPods + kubernetesLabelSelectors = (kubernetesLabelSelectors.nil?) ? @defaultCustomPrometheusLabelSelectors : kubernetesLabelSelectors + kubernetesFieldSelectors = (kubernetesFieldSelectors.nil?) ? @defaultCustomPrometheusFieldSelectors : kubernetesFieldSelectors + + file_name = "/opt/telegraf-test-rs.conf" + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf", file_name) + + puts "config::Starting to substitute the placeholders in telegraf conf copy file for replicaset" + #Replace the placeholder config values with values from custom config + text = File.read(file_name) + new_contents = text.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL", interval) + fieldPassSetting = (fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS", fieldPassSetting) + fieldDropSetting = (fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP", fieldDropSetting) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_K8S_SERVICES", ((kubernetesServices.length > 0) ? 
("[\"" + kubernetesServices.join("\",\"") + "\"]") : "[]")) + + # Check to see if monitor_kubernetes_pods is set to true with a valid setting for monitor_kubernetes_namespaces to enable scraping for specific namespaces + # Adding nil check here as well since checkForTypeArray returns true even if setting is nil to accomodate for other settings to be able - + # - to use defaults in case of nil settings + # Remove below block after phased rollout + if (@sidecarScrapingEnabled.nil? || (!@sidecarScrapingEnabled.nil? && (@sidecarScrapingEnabled.casecmp("false") == 0))) + monitorKubernetesPodsNSConfig = [] + if monitorKubernetesPods && !monitorKubernetesPodsNamespaces.nil? && checkForTypeArray(monitorKubernetesPodsNamespaces, String) + # Adding a check to see if an empty array is passed for kubernetes namespaces + if (monitorKubernetesPodsNamespaces.length > 0) + new_contents = createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = monitorKubernetesPodsNamespaces.length + monitorKubernetesPodsNSConfig = monitorKubernetesPodsNamespaces + else + new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = 0 + end + else + new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = 0 + end + # Label and field selectors are passed as strings. For field selectors, split by commas to get the number of key-value pairs. + # Label selectors can be formatted as "app in (app1, app2, app3)", so split by commas only outside parentheses to get the number of key-value pairs. 
+ kubernetesLabelSelectorsLength = kubernetesLabelSelectors.split(/,\s*(?=[^()]*(?:\(|$))/).length + kubernetesFieldSelectorsLength = kubernetesFieldSelectors.split(",").length + end + + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in telegraf conf file for replicaset" + #Set environment variables for telemetry + file = File.open("telemetry_prom_config_env_var", "w") + if !file.nil? + file.write("export TELEMETRY_RS_PROM_INTERVAL=\"#{interval}\"\n") + #Setting array lengths as environment variables for telemetry purposes + file.write("export TELEMETRY_RS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") + file.write("export TELEMETRY_RS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") + file.write("export TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH=#{kubernetesServices.length}\n") + file.write("export TELEMETRY_RS_PROM_URLS_LENGTH=#{urls.length}\n") + # Remove below block after phased rollout + if (@sidecarScrapingEnabled.nil? || (!@sidecarScrapingEnabled.nil? 
&& (@sidecarScrapingEnabled.casecmp("false") == 0))) + file.write("export TELEMETRY_RS_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") + file.write("export TELEMETRY_RS_PROM_MONITOR_PODS_NS_LENGTH=\"#{monitorKubernetesPodsNamespacesLength}\"\n") + file.write("export TELEMETRY_RS_PROM_LABEL_SELECTOR_LENGTH=\"#{kubernetesLabelSelectorsLength}\"\n") + file.write("export TELEMETRY_RS_PROM_FIELD_SELECTOR_LENGTH=\"#{kubernetesFieldSelectorsLength}\"\n") + end + + # Close file after writing all environment variables + file.close + puts "config::Successfully created telemetry file for replicaset" + end + else + ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for replicaset, using defaults, please use right types for all settings") + end # end of type check condition + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for replicaset: #{errorStr}, using defaults") + setRsPromDefaults + puts "****************End Prometheus Config Processing********************" + end + elsif @controller.casecmp(@daemonset) == 0 && + ((!@containerType.nil? && @containerType.casecmp(@promSideCar) == 0) || + (!@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0) && @sidecarScrapingEnabled.strip.casecmp("true") == 0) && + !parsedConfig[:prometheus_data_collection_settings][:cluster].nil? 
+ #Get prometheus custom config settings for monitor kubernetes pods + begin + interval = parsedConfig[:prometheus_data_collection_settings][:cluster][:interval] + fieldPass = parsedConfig[:prometheus_data_collection_settings][:cluster][:fieldpass] + fieldDrop = parsedConfig[:prometheus_data_collection_settings][:cluster][:fielddrop] + monitorKubernetesPods = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods] + monitorKubernetesPodsNamespaces = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods_namespaces] + kubernetesLabelSelectors = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_label_selector] + kubernetesFieldSelectors = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_field_selector] + + # Check for the right datattypes to enforce right setting values + if checkForType(interval, String) && + checkForType(kubernetesLabelSelectors, String) && + checkForType(kubernetesFieldSelectors, String) && + checkForTypeArray(fieldPass, String) && + checkForTypeArray(fieldDrop, String) && + (monitorKubernetesPods.nil? || (!monitorKubernetesPods.nil? && (!!monitorKubernetesPods == monitorKubernetesPods))) #Checking for Boolean type, since 'Boolean' is not defined as a type in ruby + puts "config::Successfully passed typecheck for config settings for custom prometheus scraping" + #if setting is nil assign default values + interval = (interval.nil?) ? @defaultCustomPrometheusInterval : interval + fieldPass = (fieldPass.nil?) ? @defaultCustomPrometheusFieldPass : fieldPass + fieldDrop = (fieldDrop.nil?) ? @defaultCustomPrometheusFieldDrop : fieldDrop + monitorKubernetesPods = (monitorKubernetesPods.nil?) ? @defaultCustomPrometheusMonitorPods : monitorKubernetesPods + kubernetesLabelSelectors = (kubernetesLabelSelectors.nil?) ? @defaultCustomPrometheusLabelSelectors : kubernetesLabelSelectors + kubernetesFieldSelectors = (kubernetesFieldSelectors.nil?) ? 
@defaultCustomPrometheusFieldSelectors : kubernetesFieldSelectors + + if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + file_name = "/etc/telegraf/telegraf.conf" + else + file_name = "/opt/telegraf-test-prom-side-car.conf" + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf", file_name) + end + puts "config::Starting to substitute the placeholders in telegraf conf copy file for linux or conf file for windows for custom prometheus scraping" + #Replace the placeholder config values with values from custom config + text = File.read(file_name) + new_contents = text.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL", interval) + fieldPassSetting = (fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS", fieldPassSetting) + fieldDropSetting = (fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP", fieldDropSetting) + + # Check to see if monitor_kubernetes_pods is set to true with a valid setting for monitor_kubernetes_namespaces to enable scraping for specific namespaces + # Adding nil check here as well since checkForTypeArray returns true even if setting is nil to accomodate for other settings to be able - + # - to use defaults in case of nil settings + monitorKubernetesPodsNSConfig = [] + if monitorKubernetesPods && !monitorKubernetesPodsNamespaces.nil? 
&& checkForTypeArray(monitorKubernetesPodsNamespaces, String) + # Adding a check to see if an empty array is passed for kubernetes namespaces + if (monitorKubernetesPodsNamespaces.length > 0) + new_contents = createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = monitorKubernetesPodsNamespaces.length + monitorKubernetesPodsNSConfig = monitorKubernetesPodsNamespaces + else + new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = 0 + end + else + new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = 0 + end + + # Label and field selectors are passed as strings. For field selectors, split by commas to get the number of key-value pairs. + # Label selectors can be formatted as "app in (app1, app2, app3)", so split by commas only outside parentheses to get the number of key-value pairs. + kubernetesLabelSelectorsLength = kubernetesLabelSelectors.split(/,\s*(?=[^()]*(?:\(|$))/).length + kubernetesFieldSelectorsLength = kubernetesFieldSelectors.split(",").length + + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in telegraf conf file for custom prometheus scraping" + #Set environment variables for telemetry in the sidecar container + if (!@containerType.nil? && @containerType.casecmp(@promSideCar) == 0) + file = File.open("telemetry_prom_config_env_var", "w") + if !file.nil? 
+ #Setting array lengths as environment variables for telemetry purposes + file.write("export TELEMETRY_CUSTOM_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") + file.write("export TELEMETRY_CUSTOM_PROM_MONITOR_PODS_NS_LENGTH=\"#{monitorKubernetesPodsNamespacesLength}\"\n") + file.write("export TELEMETRY_CUSTOM_PROM_LABEL_SELECTOR_LENGTH=\"#{kubernetesLabelSelectorsLength}\"\n") + file.write("export TELEMETRY_CUSTOM_PROM_FIELD_SELECTOR_LENGTH=\"#{kubernetesFieldSelectorsLength}\"\n") + + # Close file after writing all environment variables + file.close + puts "config::Successfully created telemetry file for prometheus sidecar" + end + end + else + ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for prometheus side car, using defaults, please use right types for all settings") + end # end of type check condition + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for promethues side car: #{errorStr}, using defaults") + puts "****************End Prometheus Config Processing********************" + end + elsif @controller.casecmp(@daemonset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:node].nil? + #Get prometheus daemonset custom config settings + begin + interval = parsedConfig[:prometheus_data_collection_settings][:node][:interval] + fieldPass = parsedConfig[:prometheus_data_collection_settings][:node][:fieldpass] + fieldDrop = parsedConfig[:prometheus_data_collection_settings][:node][:fielddrop] + urls = parsedConfig[:prometheus_data_collection_settings][:node][:urls] + + # Check for the right datattypes to enforce right setting values + if checkForType(interval, String) && + checkForTypeArray(fieldPass, String) && + checkForTypeArray(fieldDrop, String) && + checkForTypeArray(urls, String) + puts "config::Successfully passed typecheck for config settings for daemonset" + + #if setting is nil assign default values + interval = (interval.nil?) ? 
@defaultDsInterval : interval + fieldPass = (fieldPass.nil?) ? @defaultDsFieldPass : fieldPass + fieldDrop = (fieldDrop.nil?) ? @defaultDsFieldDrop : fieldDrop + urls = (urls.nil?) ? @defaultDsPromUrls : urls + + file_name = "/opt/telegraf-test.conf" + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf.conf", file_name) + + puts "config::Starting to substitute the placeholders in telegraf conf copy file for daemonset" + #Replace the placeholder config values with values from custom config + text = File.read(file_name) + new_contents = text.gsub("$AZMON_DS_PROM_INTERVAL", interval) + new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDPASS", ((fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDDROP", ((fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_DS_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in telegraf conf file for daemonset" + + #Set environment variables for telemetry + file = File.open("telemetry_prom_config_env_var", "w") + if !file.nil? 
+ file.write("export TELEMETRY_DS_PROM_INTERVAL=\"#{interval}\"\n") + #Setting array lengths as environment variables for telemetry purposes + file.write("export TELEMETRY_DS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") + file.write("export TELEMETRY_DS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") + file.write("export TELEMETRY_DS_PROM_URLS_LENGTH=#{urls.length}\n") + # Close file after writing all environment variables + file.close + puts "config::Successfully created telemetry file for daemonset" + end + else + ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for daemonset, using defaults, please use right types for all settings") + end # end of type check condition + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for daemonset: #{errorStr}, using defaults, please check correctness of configmap") + puts "****************End Prometheus Config Processing********************" + end + end # end of controller type check + end + else + ConfigParseErrorLogger.logError("Controller undefined while processing prometheus config, using defaults") + end +end + +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Prometheus Config Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? 
+ populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@promConfigMapMountPath)) + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported version") + else + puts "config::No configmap mounted for prometheus custom config, using defaults" + end +end +puts "****************End Prometheus Config Processing********************" diff --git a/build/common/installer/scripts/tomlparser.rb b/build/common/installer/scripts/tomlparser.rb index 7235ee0c3..a0f3c2f0a 100644 --- a/build/common/installer/scripts/tomlparser.rb +++ b/build/common/installer/scripts/tomlparser.rb @@ -23,6 +23,7 @@ @logExclusionRegexPattern = "(^((?!stdout|stderr).)*$)" @excludePath = "*.csv2" #some invalid path @enrichContainerLogs = false +@containerLogSchemaVersion = "" @collectAllKubeEvents = false @containerLogsRoute = "" @@ -138,6 +139,16 @@ def populateSettingValuesFromConfigMap(parsedConfig) ConfigParseErrorLogger.logError("Exception while reading config map settings for cluster level container log enrichment - #{errorStr}, using defaults, please check config map for errors") end + #Get container log schema version setting + begin + if !parsedConfig[:log_collection_settings][:schema].nil? && !parsedConfig[:log_collection_settings][:schema][:containerlog_schema_version].nil? + @containerLogSchemaVersion = parsedConfig[:log_collection_settings][:schema][:containerlog_schema_version] + puts "config::Using config map setting for container log schema version" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for container log schema version - #{errorStr}, using defaults, please check config map for errors") + end + #Get kube events enrichment setting begin if !parsedConfig[:log_collection_settings][:collect_all_kube_events].nil? && !parsedConfig[:log_collection_settings][:collect_all_kube_events][:enabled].nil? 
@@ -200,6 +211,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_CLUSTER_CONTAINER_LOG_ENRICH=#{@enrichContainerLogs}\n") file.write("export AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS=#{@collectAllKubeEvents}\n") file.write("export AZMON_CONTAINER_LOGS_ROUTE=#{@containerLogsRoute}\n") + file.write("export AZMON_CONTAINER_LOG_SCHEMA_VERSION=#{@containerLogSchemaVersion}\n") # Close file after writing all environment variables file.close puts "Both stdout & stderr log collection are turned off for namespaces: '#{@excludePath}' " @@ -228,7 +240,7 @@ def get_command_windows(env_variable_name, env_variable_value) file.write(commands) commands = get_command_windows('AZMON_LOG_TAIL_PATH', @logTailPath) file.write(commands) - commands = get_command_windows('AZMON_LOG_EXCLUSION_REGEX_PATTERN', @stdoutExcludeNamespaces) + commands = get_command_windows('AZMON_LOG_EXCLUSION_REGEX_PATTERN', @logExclusionRegexPattern) file.write(commands) commands = get_command_windows('AZMON_STDOUT_EXCLUDED_NAMESPACES', @stdoutExcludeNamespaces) file.write(commands) @@ -244,7 +256,9 @@ def get_command_windows(env_variable_name, env_variable_value) file.write(commands) commands = get_command_windows('AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS', @collectAllKubeEvents) file.write(commands) - commands = get_command_windows('AZMON_CONTAINER_LOGS_ROUTE', @containerLogsRoute) + commands = get_command_windows('AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE', @containerLogsRoute) + file.write(commands) + commands = get_command_windows('AZMON_CONTAINER_LOG_SCHEMA_VERSION', @containerLogSchemaVersion) file.write(commands) # Close file after writing all environment variables diff --git a/build/linux/installer/conf/container.conf b/build/linux/installer/conf/container.conf index f02ec0131..958a85eb6 100644 --- a/build/linux/installer/conf/container.conf +++ b/build/linux/installer/conf/container.conf @@ -45,14 +45,12 @@ #custom_metrics_mdm filter plugin type filter_cadvisor2mdm - 
custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pvUsedBytes log_level info type filter_telegraf2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast log_level debug diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 9ada8425f..fb566c360 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -13,7 +13,14 @@ tag oms.containerinsights.KubePodInventory run_interval 60 log_level debug - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast + + + #Kubernetes Persistent Volume inventory + + type kubepvinventory + tag oms.containerinsights.KubePVInventory + run_interval 60 + log_level debug #Kubernetes events @@ -66,15 +73,13 @@ type filter_inventory2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast log_level info #custom_metrics_mdm filter plugin for perf data from windows nodes type filter_cadvisor2mdm - custom_metrics_azure_regions 
eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes log_level info @@ -98,6 +103,21 @@ max_retry_wait 5m + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/state/out_oms_kubepv*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + + type out_oms log_level debug diff --git a/build/linux/installer/conf/prometheus-side-car.conf b/build/linux/installer/conf/prometheus-side-car.conf new file mode 100644 index 000000000..fd40910d9 --- /dev/null +++ b/build/linux/installer/conf/prometheus-side-car.conf @@ -0,0 +1,4 @@ + + + + diff --git a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf new file mode 100644 index 000000000..8a69f7995 --- /dev/null +++ b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf @@ -0,0 +1,41 @@ +[SERVICE] + #Default service flush interval is 15 seconds + Flush 15 + HTTP_Server Off + Daemon Off + storage.path /var/opt/microsoft/docker-cimprov/state/flbstore/ + storage.sync normal + storage.checksum off + storage.backlog.mem_limit 10M + Log_Level info + Parsers_File /etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf + Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log + +[INPUT] + Name tail + Tag oms.container.log.flbplugin.terminationlog.* + Path /dev/write-to-traces + Read_from_Head true + DB /var/opt/microsoft/docker-cimprov/state/terminationlog-ai.db + DB.Sync Off + Parser docker + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + +[INPUT] + Name 
tcp + Tag oms.container.perf.telegraf.* + Listen 0.0.0.0 + Port 25229 + Chunk_Size 10m + Buffer_Size 10m + Mem_Buf_Limit 200m + +[OUTPUT] + Name oms + EnableTelemetry true + Retry_Limit 10 + TelemetryPushIntervalSeconds 300 + Match oms.container.* \ No newline at end of file diff --git a/build/linux/installer/conf/td-agent-bit-rs.conf b/build/linux/installer/conf/td-agent-bit-rs.conf index 696ac80e6..9613c270d 100644 --- a/build/linux/installer/conf/td-agent-bit-rs.conf +++ b/build/linux/installer/conf/td-agent-bit-rs.conf @@ -10,6 +10,19 @@ Parsers_File /etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log +[INPUT] + Name tail + Tag oms.container.log.flbplugin.terminationlog.* + Path /dev/write-to-traces + Read_from_Head true + DB /var/opt/microsoft/docker-cimprov/state/terminationlog-ai.db + DB.Sync Off + Parser docker + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + [INPUT] Name tcp Tag oms.container.perf.telegraf.* diff --git a/build/linux/installer/conf/td-agent-bit.conf b/build/linux/installer/conf/td-agent-bit.conf index 484a4bbbf..045aefcaf 100644 --- a/build/linux/installer/conf/td-agent-bit.conf +++ b/build/linux/installer/conf/td-agent-bit.conf @@ -15,6 +15,7 @@ Name tail Tag oms.container.log.la.* Path ${AZMON_LOG_TAIL_PATH} + Read_from_Head true DB /var/log/omsagent-fblogs.db DB.Sync Off Parser docker @@ -32,6 +33,7 @@ Name tail Tag oms.container.log.flbplugin.* Path /var/log/containers/omsagent*.log + Read_from_Head true DB /var/opt/microsoft/docker-cimprov/state/omsagent-ai.db DB.Sync Off Parser docker @@ -44,6 +46,7 @@ Name tail Tag oms.container.log.flbplugin.mdsd.* Path /var/opt/microsoft/linuxmonagent/log/mdsd.err + Read_from_Head true DB /var/opt/microsoft/docker-cimprov/state/mdsd-ai.db DB.Sync Off Parser docker @@ -52,6 +55,19 @@ Skip_Long_Lines On Ignore_Older 2m +[INPUT] + Name tail + Tag oms.container.log.flbplugin.terminationlog.* + Path 
/dev/write-to-traces + Read_from_Head true + DB /var/opt/microsoft/docker-cimprov/state/terminationlog-ai.db + DB.Sync Off + Parser docker + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + [INPUT] Name tcp Tag oms.container.perf.telegraf.* diff --git a/build/linux/installer/conf/telegraf-prom-side-car.conf b/build/linux/installer/conf/telegraf-prom-side-car.conf new file mode 100644 index 000000000..b3b4ba1d3 --- /dev/null +++ b/build/linux/installer/conf/telegraf-prom-side-car.conf @@ -0,0 +1,162 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply prepend +# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), +# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) + + +# Global tags can be specified here in key="value" format. +[global_tags] + hostName = "placeholder_hostname" + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "60s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 3000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. 
+ ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 60000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "15s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = false + ## Run telegraf in quiet mode (error log messages only). + quiet = true + ## Specify the log file name. The empty string means to log to stderr. + logfile = "" + ## Override default hostname, if empty use os.Hostname() + #hostname = "placeholder_hostname" + ## If set to true, do not set the "host" tag in the telegraf agent.
+ omit_hostname = true + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Generic socket writer capable of handling multiple socket types. +[[outputs.socket_writer]] + ## URL to connect to + address = "tcp://0.0.0.0:25229" + # address = "tcp://example.com:http" + # address = "tcp4://127.0.0.1:8094" + # address = "tcp6://127.0.0.1:8094" + # address = "tcp6://[2001:db8::1]:8094" + # address = "udp://127.0.0.1:8094" + # address = "udp4://127.0.0.1:8094" + # address = "udp6://127.0.0.1:8094" + # address = "unix:///tmp/telegraf.sock" + # address = "unixgram:///tmp/telegraf.sock" + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + + ## Period between keep alive probes. + ## Only applies to TCP sockets. + ## 0 disables keep alive probes. + ## Defaults to the OS configuration. + # keep_alive_period = "5m" + + ## Data format to generate. 
+ ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "json" + namedrop = ["agent_telemetry"] + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + +[[processors.converter]] + [processors.converter.fields] + float = ["*"] + +#Prometheus Custom Metrics +[[inputs.prometheus]] + interval = "$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL" + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. + ## - prometheus.io/port: If port is not 9102 use this annotation + $AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS + $AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE + + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR + + fieldpass = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS + fielddrop = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP + + metric_version = 2 + url_tag = "scrapeUrl" + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Use bearer token for authorization. 
('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + ## Use TLS but skip chain & host verification + insecure_skip_verify = true + +$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER + +## OSM Prometheus configuration +$AZMON_TELEGRAF_OSM_PROM_PLUGINS diff --git a/build/linux/installer/conf/telegraf-rs.conf b/build/linux/installer/conf/telegraf-rs.conf index d81196330..ee1cf8819 100644 --- a/build/linux/installer/conf/telegraf-rs.conf +++ b/build/linux/installer/conf/telegraf-rs.conf @@ -540,13 +540,13 @@ #Prometheus Custom Metrics [[inputs.prometheus]] - interval = "$AZMON_RS_PROM_INTERVAL" + interval = "$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL" ## An array of urls to scrape metrics from. - urls = $AZMON_RS_PROM_URLS + urls = $AZMON_TELEGRAF_CUSTOM_PROM_URLS ## An array of Kubernetes services to scrape metrics from. - kubernetes_services = $AZMON_RS_PROM_K8S_SERVICES + kubernetes_services = $AZMON_TELEGRAF_CUSTOM_PROM_K8S_SERVICES ## Scrape Kubernetes pods for the following prometheus annotations: ## - prometheus.io/scrape: Enable scraping for this pod @@ -554,10 +554,15 @@ ## set this to `https` & most likely set the tls config. ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. 
## - prometheus.io/port: If port is not 9102 use this annotation - $AZMON_RS_PROM_MONITOR_PODS + $AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS - fieldpass = $AZMON_RS_PROM_FIELDPASS - fielddrop = $AZMON_RS_PROM_FIELDDROP + $AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE + + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR + + fieldpass = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS + fielddrop = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP metric_version = 2 url_tag = "scrapeUrl" @@ -581,7 +586,11 @@ insecure_skip_verify = true #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] -$AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER +$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER + +## OSM Prometheus configuration +$AZMON_TELEGRAF_OSM_PROM_PLUGINS + # [[inputs.exec]] # ## Commands array # interval = "15m" diff --git a/build/linux/installer/conf/telegraf.conf b/build/linux/installer/conf/telegraf.conf index 013aa1af2..5a5bb2d8c 100644 --- a/build/linux/installer/conf/telegraf.conf +++ b/build/linux/installer/conf/telegraf.conf @@ -632,8 +632,7 @@ name_prefix="container.azm.ms/" ## An array of urls to scrape metrics from. urls = ["$CADVISOR_METRICS_URL"] - ## Include "$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC", "$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC" when we add for support for 1.18 - fieldpass = ["$KUBELET_RUNTIME_OPERATIONS_METRIC", "$KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC"] + fieldpass = ["$KUBELET_RUNTIME_OPERATIONS_METRIC", "$KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC", "$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC", "$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC"] metric_version = 2 url_tag = "scrapeUrl" @@ -675,8 +674,10 @@ name_prefix="container.azm.ms/" ## An array of urls to scrape metrics from. 
urls = ["$CADVISOR_METRICS_URL"] - - fieldpass = ["kubelet_running_pod_count","volume_manager_total_volumes", "kubelet_node_config_error", "process_resident_memory_bytes", "process_cpu_seconds_total"] + + # <= 1.18: metric name is kubelet_running_pod_count + # >= 1.19: metric name changed to kubelet_running_pods + fieldpass = ["kubelet_running_pod_count","kubelet_running_pods","volume_manager_total_volumes", "kubelet_node_config_error", "process_resident_memory_bytes", "process_cpu_seconds_total"] metric_version = 2 url_tag = "scrapeUrl" @@ -690,7 +691,7 @@ ## Optional TLS Config tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" insecure_skip_verify = true - + ## prometheus custom metrics [[inputs.prometheus]] @@ -731,7 +732,7 @@ #name_prefix="container.azm.ms/" ## An array of urls to scrape metrics from. urls = $AZMON_INTEGRATION_NPM_METRICS_URL_LIST_NODE - + metric_version = 2 url_tag = "scrapeUrl" diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index 87b89b14c..df8fbc3da 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -22,6 +22,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/filter_container.rb; source/plugins/ruby/filter_container.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_podinventory.rb; source/plugins/ruby/in_kube_podinventory.rb; 644; root; root +/opt/microsoft/omsagent/plugin/in_kube_pvinventory.rb; source/plugins/ruby/in_kube_pvinventory.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_events.rb; source/plugins/ruby/in_kube_events.rb; 644; root; root /opt/microsoft/omsagent/plugin/KubernetesApiClient.rb; source/plugins/ruby/KubernetesApiClient.rb; 644; root; root @@ -109,23 +110,28 @@ MAINTAINER: 'Microsoft Corporation' /opt/tomlrb/string_utils.rb; source/toml-parser/tomlrb/string_utils.rb; 644; root; root /opt/tomlrb/version.rb; 
source/toml-parser/tomlrb/version.rb; 644; root; root -/opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root -/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; build/linux/installer/conf/td-agent-bit.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf; build/linux/installer/conf/td-agent-bit-rs.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf; build/linux/installer/conf/azm-containers-parser.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/out_oms.conf; build/linux/installer/conf/out_oms.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/telegraf.conf; build/linux/installer/conf/telegraf.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; build/linux/installer/conf/telegraf-rs.conf; 644; root; root -/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; build/linux/installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root -/opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; root -/opt/tomlparser-prom-customconfig.rb; build/linux/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root -/opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root - -/opt/tomlparser-health-config.rb; build/linux/installer/scripts/tomlparser-health-config.rb; 755; root; root +/opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root +/etc/opt/microsoft/docker-cimprov/prometheus-side-car.conf; build/linux/installer/conf/prometheus-side-car.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; build/linux/installer/conf/td-agent-bit.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/td-agent-bit-prom-side-car.conf; build/linux/installer/conf/td-agent-bit-prom-side-car.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf; 
build/linux/installer/conf/td-agent-bit-rs.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf; build/linux/installer/conf/azm-containers-parser.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/out_oms.conf; build/linux/installer/conf/out_oms.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf.conf; build/linux/installer/conf/telegraf.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf; build/linux/installer/conf/telegraf-prom-side-car.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; build/linux/installer/conf/telegraf-rs.conf; 644; root; root +/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; build/linux/installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root +/opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; root +/opt/tomlparser-prom-customconfig.rb; build/common/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root +/opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root +/opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root + +/opt/tomlparser-agent-config.rb; build/linux/installer/scripts/tomlparser-agent-config.rb; 755; root; root /opt/tomlparser.rb; build/common/installer/scripts/tomlparser.rb; 755; root; root /opt/td-agent-bit-conf-customizer.rb; build/common/installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root /opt/ConfigParseErrorLogger.rb; build/common/installer/scripts/ConfigParseErrorLogger.rb; 755; root; root /opt/tomlparser-npm-config.rb; build/linux/installer/scripts/tomlparser-npm-config.rb; 755; root; root +/opt/tomlparser-osm-config.rb; build/linux/installer/scripts/tomlparser-osm-config.rb; 755; root; root /opt/microsoft/omsagent/plugin/filter_cadvisor_health_container.rb; 
source/plugins/ruby/filter_cadvisor_health_container.rb; 644; root; root diff --git a/build/linux/installer/scripts/livenessprobe.sh b/build/linux/installer/scripts/livenessprobe.sh index 87f68a560..5e1261e7e 100644 --- a/build/linux/installer/scripts/livenessprobe.sh +++ b/build/linux/installer/scripts/livenessprobe.sh @@ -4,27 +4,53 @@ (ps -ef | grep omsagent- | grep -v "grep") if [ $? -ne 0 ] then - echo "Agent is NOT running" > /dev/termination-log + echo " omsagent is not running" > /dev/termination-log exit 1 fi +#optionally test to exit non zero value if oneagent is not running +if [ -e "/opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2" ]; then + (ps -ef | grep "mdsd" | grep -v "grep") + if [ $? -ne 0 ] + then + echo "oneagent is not running" > /dev/termination-log + exit 1 + fi +fi + #test to exit non zero value if fluentbit is not running (ps -ef | grep td-agent-bit | grep -v "grep") if [ $? -ne 0 ] then - echo "Fluentbit is NOT running" > /dev/termination-log + echo "Fluentbit is not running" > /dev/termination-log exit 1 fi -if [ ! -s "inotifyoutput.txt" ] +#test to exit non zero value if telegraf is not running +(ps -ef | grep telegraf | grep -v "grep") +if [ $? 
-ne 0 ] then - # inotifyoutput file is empty and the grep commands for omsagent and td-agent-bit succeeded - exit 0 -else - if [ -s "inotifyoutput.txt" ] - then - # inotifyoutput file has data(config map was applied) - echo "inotifyoutput.txt has been updated - config changed" > /dev/termination-log - exit 1 - fi + # echo "Telegraf is not running" > /dev/termination-log + echo "Telegraf is not running (controller: ${CONTROLLER_TYPE}, container type: ${CONTAINER_TYPE})" > /dev/write-to-traces # this file is tailed and sent to traces + # exit 1 fi + +if [ -s "inotifyoutput.txt" ] +then + # inotifyoutput file has data(config map was applied) + echo "inotifyoutput.txt has been updated - config changed" > /dev/termination-log + exit 1 +fi + +# Perform the following check only for prometheus sidecar that does OSM scraping or for replicaset when sidecar scraping is disabled +if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || + ( ( -e "/etc/config/kube.conf" ) && ( ( ! -z "${SIDECAR_SCRAPING_ENABLED}" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ) ]]; then + if [ -s "inotifyoutput-osm.txt" ] + then + # inotifyoutput-osm file has data(config map was applied) + echo "inotifyoutput-osm.txt has been updated - config changed" > /dev/termination-log + exit 1 + fi +fi + +exit 0 diff --git a/build/linux/installer/scripts/tomlparser-agent-config.rb b/build/linux/installer/scripts/tomlparser-agent-config.rb new file mode 100644 index 000000000..e587909e5 --- /dev/null +++ b/build/linux/installer/scripts/tomlparser-agent-config.rb @@ -0,0 +1,220 @@ +#!/usr/local/bin/ruby + +#this should be require relative in Linux and require in windows, since it is a gem install on windows +@os_type = ENV["OS_TYPE"] +if !@os_type.nil? && !@os_type.empty? 
&& @os_type.strip.casecmp("windows") == 0 + require "tomlrb" +else + require_relative "tomlrb" +end + +require_relative "ConfigParseErrorLogger" + +@configMapMountPath = "/etc/config/settings/agent-settings" +@configSchemaVersion = "" +@enable_health_model = false + +# 250 Node items (15KB per node) amount to approximately 4MB +@nodesChunkSize = 250 +# 1000 pods (10KB per pod) amount to approximately 10MB +@podsChunkSize = 1000 +# 4000 events (1KB per event) amount to approximately 4MB +@eventsChunkSize = 4000 +# roughly each deployment is 8k +# 500 deployments amount to approximately 4MB +@deploymentsChunkSize = 500 +# roughly each HPA is 3k +# 2000 HPAs amount to approximately 6-7MB +@hpaChunkSize = 2000 +# stream batch sizes to avoid large file writes +# too low will consume higher disk iops +@podsEmitStreamBatchSize = 200 +@nodesEmitStreamBatchSize = 100 + +# the higher the chunk size, the higher the rs pod memory consumption and the lower the api latency +# similarly, a lower value helps with memory consumption but incurs additional round trip latency +# these need to be tuned based on the workload +# nodes +@nodesChunkSizeMin = 100 +@nodesChunkSizeMax = 400 +# pods +@podsChunkSizeMin = 250 +@podsChunkSizeMax = 1500 +# events +@eventsChunkSizeMin = 2000 +@eventsChunkSizeMax = 10000 +# deployments +@deploymentsChunkSizeMin = 500 +@deploymentsChunkSizeMax = 1000 +# hpa +@hpaChunkSizeMin = 500 +@hpaChunkSizeMax = 2000 + +# minimum emit stream sizes, to prevent low values which cost disk i/o +# max will be up to the chunk size +@podsEmitStreamBatchSizeMin = 50 +@nodesEmitStreamBatchSizeMin = 50 + +# configmap settings related fbit config +@fbitFlushIntervalSecs = 0 +@fbitTailBufferChunkSizeMBs = 0 +@fbitTailBufferMaxSizeMBs = 0 + + +def is_number?(value) + true if Integer(value) rescue false +end + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@configMapMountPath)) + puts
"config::configmap container-azm-ms-agentconfig for agent settings mounted, parsing values" + parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted config map" + return parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for agent settings not mounted, using defaults" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for agent settings : #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +# Use the ruby structure created after config parsing to set the right values to be used as environment variables +def populateSettingValuesFromConfigMap(parsedConfig) + begin + if !parsedConfig.nil? && !parsedConfig[:agent_settings].nil? + if !parsedConfig[:agent_settings][:health_model].nil? && !parsedConfig[:agent_settings][:health_model][:enabled].nil? + @enable_health_model = parsedConfig[:agent_settings][:health_model][:enabled] + puts "enable_health_model = #{@enable_health_model}" + end + chunk_config = parsedConfig[:agent_settings][:chunk_config] + if !chunk_config.nil? + nodesChunkSize = chunk_config[:NODES_CHUNK_SIZE] + if !nodesChunkSize.nil? && is_number?(nodesChunkSize) && (@nodesChunkSizeMin..@nodesChunkSizeMax) === nodesChunkSize.to_i + @nodesChunkSize = nodesChunkSize.to_i + puts "Using config map value: NODES_CHUNK_SIZE = #{@nodesChunkSize}" + end + + podsChunkSize = chunk_config[:PODS_CHUNK_SIZE] + if !podsChunkSize.nil? && is_number?(podsChunkSize) && (@podsChunkSizeMin..@podsChunkSizeMax) === podsChunkSize.to_i + @podsChunkSize = podsChunkSize.to_i + puts "Using config map value: PODS_CHUNK_SIZE = #{@podsChunkSize}" + end + + eventsChunkSize = chunk_config[:EVENTS_CHUNK_SIZE] + if !eventsChunkSize.nil? 
&& is_number?(eventsChunkSize) && (@eventsChunkSizeMin..@eventsChunkSizeMax) === eventsChunkSize.to_i + @eventsChunkSize = eventsChunkSize.to_i + puts "Using config map value: EVENTS_CHUNK_SIZE = #{@eventsChunkSize}" + end + + deploymentsChunkSize = chunk_config[:DEPLOYMENTS_CHUNK_SIZE] + if !deploymentsChunkSize.nil? && is_number?(deploymentsChunkSize) && (@deploymentsChunkSizeMin..@deploymentsChunkSizeMax) === deploymentsChunkSize.to_i + @deploymentsChunkSize = deploymentsChunkSize.to_i + puts "Using config map value: DEPLOYMENTS_CHUNK_SIZE = #{@deploymentsChunkSize}" + end + + hpaChunkSize = chunk_config[:HPA_CHUNK_SIZE] + if !hpaChunkSize.nil? && is_number?(hpaChunkSize) && (@hpaChunkSizeMin..@hpaChunkSizeMax) === hpaChunkSize.to_i + @hpaChunkSize = hpaChunkSize.to_i + puts "Using config map value: HPA_CHUNK_SIZE = #{@hpaChunkSize}" + end + + podsEmitStreamBatchSize = chunk_config[:PODS_EMIT_STREAM_BATCH_SIZE] + if !podsEmitStreamBatchSize.nil? && is_number?(podsEmitStreamBatchSize) && + podsEmitStreamBatchSize.to_i <= @podsChunkSize && podsEmitStreamBatchSize.to_i >= @podsEmitStreamBatchSizeMin + @podsEmitStreamBatchSize = podsEmitStreamBatchSize.to_i + puts "Using config map value: PODS_EMIT_STREAM_BATCH_SIZE = #{@podsEmitStreamBatchSize}" + end + nodesEmitStreamBatchSize = chunk_config[:NODES_EMIT_STREAM_BATCH_SIZE] + if !nodesEmitStreamBatchSize.nil? && is_number?(nodesEmitStreamBatchSize) && + nodesEmitStreamBatchSize.to_i <= @nodesChunkSize && nodesEmitStreamBatchSize.to_i >= @nodesEmitStreamBatchSizeMin + @nodesEmitStreamBatchSize = nodesEmitStreamBatchSize.to_i + puts "Using config map value: NODES_EMIT_STREAM_BATCH_SIZE = #{@nodesEmitStreamBatchSize}" + end + end + # fbit config settings + fbit_config = parsedConfig[:agent_settings][:fbit_config] + if !fbit_config.nil? + fbitFlushIntervalSecs = fbit_config[:log_flush_interval_secs] + if !fbitFlushIntervalSecs.nil? 
&& is_number?(fbitFlushIntervalSecs) && fbitFlushIntervalSecs.to_i > 0 + @fbitFlushIntervalSecs = fbitFlushIntervalSecs.to_i + puts "Using config map value: log_flush_interval_secs = #{@fbitFlushIntervalSecs}" + end + + fbitTailBufferChunkSizeMBs = fbit_config[:tail_buf_chunksize_megabytes] + if !fbitTailBufferChunkSizeMBs.nil? && is_number?(fbitTailBufferChunkSizeMBs) && fbitTailBufferChunkSizeMBs.to_i > 0 + @fbitTailBufferChunkSizeMBs = fbitTailBufferChunkSizeMBs.to_i + puts "Using config map value: tail_buf_chunksize_megabytes = #{@fbitTailBufferChunkSizeMBs}" + end + + fbitTailBufferMaxSizeMBs = fbit_config[:tail_buf_maxsize_megabytes] + if !fbitTailBufferMaxSizeMBs.nil? && is_number?(fbitTailBufferMaxSizeMBs) && fbitTailBufferMaxSizeMBs.to_i > 0 + if fbitTailBufferMaxSizeMBs.to_i >= @fbitTailBufferChunkSizeMBs + @fbitTailBufferMaxSizeMBs = fbitTailBufferMaxSizeMBs.to_i + puts "Using config map value: tail_buf_maxsize_megabytes = #{@fbitTailBufferMaxSizeMBs}" + else + # tail_buf_maxsize_megabytes has to be greater or equal to tail_buf_chunksize_megabytes + @fbitTailBufferMaxSizeMBs = @fbitTailBufferChunkSizeMBs + puts "config::warn: tail_buf_maxsize_megabytes must be greater or equal to value of tail_buf_chunksize_megabytes. 
Using tail_buf_maxsize_megabytes = #{@fbitTailBufferMaxSizeMBs} since provided config value not valid" + end + end + # in scenario - tail_buf_chunksize_megabytes provided but not tail_buf_maxsize_megabytes to prevent fbit crash + if @fbitTailBufferChunkSizeMBs > 0 && @fbitTailBufferMaxSizeMBs == 0 + @fbitTailBufferMaxSizeMBs = @fbitTailBufferChunkSizeMBs + puts "config::warn: since tail_buf_maxsize_megabytes not provided hence using tail_buf_maxsize_megabytes=#{@fbitTailBufferMaxSizeMBs} which is same as the value of tail_buf_chunksize_megabytes" + end + end + end + rescue => errorStr + puts "config::error:Exception while reading config settings for agent configuration setting - #{errorStr}, using defaults" + @enable_health_model = false + end +end + +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Config Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@configMapMountPath)) + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") + end + @enable_health_model = false +end + +# Write the settings to file, so that they can be set as environment variables +file = File.open("agent_config_env_var", "w") + +if !file.nil? 
+ file.write("export AZMON_CLUSTER_ENABLE_HEALTH_MODEL=#{@enable_health_model}\n") + file.write("export NODES_CHUNK_SIZE=#{@nodesChunkSize}\n") + file.write("export PODS_CHUNK_SIZE=#{@podsChunkSize}\n") + file.write("export EVENTS_CHUNK_SIZE=#{@eventsChunkSize}\n") + file.write("export DEPLOYMENTS_CHUNK_SIZE=#{@deploymentsChunkSize}\n") + file.write("export HPA_CHUNK_SIZE=#{@hpaChunkSize}\n") + file.write("export PODS_EMIT_STREAM_BATCH_SIZE=#{@podsEmitStreamBatchSize}\n") + file.write("export NODES_EMIT_STREAM_BATCH_SIZE=#{@nodesEmitStreamBatchSize}\n") + # fbit settings + if @fbitFlushIntervalSecs > 0 + file.write("export FBIT_SERVICE_FLUSH_INTERVAL=#{@fbitFlushIntervalSecs}\n") + end + if @fbitTailBufferChunkSizeMBs > 0 + file.write("export FBIT_TAIL_BUFFER_CHUNK_SIZE=#{@fbitTailBufferChunkSizeMBs}\n") + end + if @fbitTailBufferMaxSizeMBs > 0 + file.write("export FBIT_TAIL_BUFFER_MAX_SIZE=#{@fbitTailBufferMaxSizeMBs}\n") + end + # Close file after writing all environment variables + file.close +else + puts "Exception while opening file for writing config environment variables" + puts "****************End Config Processing********************" +end diff --git a/build/linux/installer/scripts/tomlparser-health-config.rb b/build/linux/installer/scripts/tomlparser-health-config.rb deleted file mode 100644 index 14c8bdb44..000000000 --- a/build/linux/installer/scripts/tomlparser-health-config.rb +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/local/bin/ruby - -#this should be require relative in Linux and require in windows, since it is a gem install on windows -@os_type = ENV["OS_TYPE"] -if !@os_type.nil? && !@os_type.empty? 
&& @os_type.strip.casecmp("windows") == 0 - require "tomlrb" -else - require_relative "tomlrb" -end - -require_relative "ConfigParseErrorLogger" - -@configMapMountPath = "/etc/config/settings/agent-settings" -@configSchemaVersion = "" -@enable_health_model = false - -# Use parser to parse the configmap toml file to a ruby structure -def parseConfigMap - begin - # Check to see if config map is created - if (File.file?(@configMapMountPath)) - puts "config::configmap container-azm-ms-agentconfig for agent health settings mounted, parsing values" - parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) - puts "config::Successfully parsed mounted config map" - return parsedConfig - else - puts "config::configmap container-azm-ms-agentconfig for agent health settings not mounted, using defaults" - return nil - end - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while parsing config map for enabling health: #{errorStr}, using defaults, please check config map for errors") - return nil - end -end - -# Use the ruby structure created after config parsing to set the right values to be used as environment variables -def populateSettingValuesFromConfigMap(parsedConfig) - begin - if !parsedConfig.nil? && !parsedConfig[:agent_settings].nil? && !parsedConfig[:agent_settings][:health_model].nil? && !parsedConfig[:agent_settings][:health_model][:enabled].nil? - @enable_health_model = parsedConfig[:agent_settings][:health_model][:enabled] - puts "enable_health_model = #{@enable_health_model}" - end - rescue => errorStr - puts "config::error:Exception while reading config settings for health_model enabled setting - #{errorStr}, using defaults" - @enable_health_model = false - end -end - -@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] -puts "****************Start Config Processing********************" -if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? 
&& @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it - configMapSettings = parseConfigMap - if !configMapSettings.nil? - populateSettingValuesFromConfigMap(configMapSettings) - end -else - if (File.file?(@configMapMountPath)) - ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") - end - @enable_health_model = false -end - -# Write the settings to file, so that they can be set as environment variables -file = File.open("health_config_env_var", "w") - -if !file.nil? - file.write("export AZMON_CLUSTER_ENABLE_HEALTH_MODEL=#{@enable_health_model}\n") - # Close file after writing all environment variables - file.close -else - puts "Exception while opening file for writing config environment variables" - puts "****************End Config Processing********************" -end \ No newline at end of file diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 1c01dd8c6..5ce5d79d2 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -12,6 +12,8 @@ @percentageCpuUsageThreshold = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD +@percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD +@jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap @@ -35,7 +37,7 @@ def parseConfigMap # Use the ruby structure created after config parsing to set the right values to be used for MDM metric configuration settings def 
populateSettingValuesFromConfigMap(parsedConfig) if !parsedConfig.nil? && !parsedConfig[:alertable_metrics_configuration_settings].nil? - # Get mdm metrics config settings for resource utilization + # Get mdm metrics config settings for container resource utilization begin resourceUtilization = parsedConfig[:alertable_metrics_configuration_settings][:container_resource_utilization_thresholds] if !resourceUtilization.nil? @@ -66,7 +68,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Non floating point value or value not convertible to float specified for Memory Working Set threshold, using default " @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD end - puts "config::Using config map settings for MDM metric configuration settings for resource utilization" + puts "config::Using config map settings for MDM metric configuration settings for container resource utilization" end rescue => errorStr ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for resource utilization - #{errorStr}, using defaults, please check config map for errors") @@ -74,6 +76,51 @@ def populateSettingValuesFromConfigMap(parsedConfig) @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD end + + # Get mdm metrics config settings for PV utilization + begin + isUsingPVThresholdConfig = false + pvUtilizationThresholds = parsedConfig[:alertable_metrics_configuration_settings][:pv_utilization_thresholds] + if !pvUtilizationThresholds.nil? + pvUsageThreshold = pvUtilizationThresholds[:pv_usage_threshold_percentage] + if !pvUsageThreshold.nil? + pvUsageThresholdFloat = pvUsageThreshold.to_f + if pvUsageThresholdFloat.kind_of? 
Float + @percentagePVUsageThreshold = pvUsageThresholdFloat + isUsingPVThresholdConfig = true + end + end + end + + if isUsingPVThresholdConfig + puts "config::Using config map settings for MDM metric configuration settings for PV utilization" + else + puts "config::Non floating point value or value not convertible to float specified for PV threshold, using default " + @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for PV utilization - #{errorStr}, using defaults, please check config map for errors") + @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD + end + + # Get mdm metrics config settings for job completion + begin + jobCompletion = parsedConfig[:alertable_metrics_configuration_settings][:job_completion_threshold] + if !jobCompletion.nil? + jobCompletionThreshold = jobCompletion[:job_completion_threshold_time_minutes] + jobCompletionThresholdInt = jobCompletionThreshold.to_i + if jobCompletionThresholdInt.kind_of? 
Integer + @jobCompletionThresholdMinutes = jobCompletionThresholdInt + else + puts "config::Non interger value or value not convertible to integer specified for job completion threshold, using default " + @jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES + end + puts "config::Using config map settings for MDM metric configuration settings for job completion" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for job completion - #{errorStr}, using defaults, please check config map for errors") + @jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES + end end end @@ -97,6 +144,8 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_ALERT_CONTAINER_CPU_THRESHOLD=#{@percentageCpuUsageThreshold}\n") file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n") file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n") + file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n") + file.write("export AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD=#{@jobCompletionThresholdMinutes}\n") # Close file after writing all MDM setting environment variables file.close puts "****************End MDM Metrics Config Processing********************" diff --git a/build/linux/installer/scripts/tomlparser-metric-collection-config.rb b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb new file mode 100644 index 000000000..40d87b7f1 --- /dev/null +++ b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb @@ -0,0 +1,71 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require_relative "tomlrb" +require_relative "ConfigParseErrorLogger" +require_relative "microsoft/omsagent/plugin/constants" + +@configMapMountPath = 
"/etc/config/settings/metric_collection_settings" +@configVersion = "" +@configSchemaVersion = "" + +# Setting default values which will be used in case they are not set in the configmap or if configmap doesnt exist +@collectPVKubeSystemMetrics = false + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@configMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for metric collection settings mounted, parsing values" + parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted config map" + return parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for metric collection settings not mounted, using defaults" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for metric collection settings: #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +# Use the ruby structure created after config parsing to set the right values to be used for metric collection settings +def populateSettingValuesFromConfigMap(parsedConfig) + # Get metric collection settings for including or excluding kube-system namespace in PV metrics + begin + if !parsedConfig.nil? && !parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics].nil? && !parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics][:enabled].nil? 
+ @collectPVKubeSystemMetrics = parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics][:enabled] + puts "config::Using config map setting for PV kube-system collection" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for PV kube-system collection - #{errorStr}, using defaults, please check config map for errors") + end +end + +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Metric Collection Settings Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version, so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@configMapMountPath)) + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") + end +end + +# Write the settings to file, so that they can be set as environment variables +file = File.open("config_metric_collection_env_var", "w") + +if !file.nil? 
+ file.write("export AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS=#{@collectPVKubeSystemMetrics}\n") + # Close file after writing all metric collection setting environment variables + file.close + puts "****************End Metric Collection Settings Processing********************" +else + puts "Exception while opening file for writing MDM metric config environment variables" + puts "****************End Metric Collection Settings Processing********************" +end diff --git a/build/linux/installer/scripts/tomlparser-osm-config.rb b/build/linux/installer/scripts/tomlparser-osm-config.rb new file mode 100644 index 000000000..096064db8 --- /dev/null +++ b/build/linux/installer/scripts/tomlparser-osm-config.rb @@ -0,0 +1,168 @@ +#!/usr/local/bin/ruby + +require_relative "tomlrb" +require "fileutils" +require_relative "ConfigParseErrorLogger" + +@controllerType = ENV["CONTROLLER_TYPE"] +@containerType = ENV["CONTAINER_TYPE"] +@sidecarScrapingEnabled = ENV["SIDECAR_SCRAPING_ENABLED"] + +@replicaset = "replicaset" +@prometheusSidecar = "prometheussidecar" + +if !@controllerType.nil? && !@controllerType.empty? && @controllerType.strip.casecmp(@replicaset) == 0 && + (@sidecarScrapingEnabled.nil? || (!@sidecarScrapingEnabled.nil? && !@sidecarScrapingEnabled.empty? && @sidecarScrapingEnabled.strip.casecmp("false") == 0)) + @tgfConfigFile = "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" + @tgfTestConfigFile = "/opt/telegraf-test-rs.conf" +elsif !@containerType.nil? && !@containerType.empty? 
&& @containerType.strip.casecmp(@prometheusSidecar) == 0 + @tgfConfigFile = "/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" + @tgfTestConfigFile = "/opt/telegraf-test-prom-side-car.conf" +end + +@configMapMountPath = "/etc/config/osm-settings/osm-metric-collection-configuration" +@configSchemaVersion = "" +# @tgfConfigFileSidecar = "/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" +# @tgfTestConfigFile = "/opt/telegraf-test-prom-side-car.conf" +@osmMetricNamespaces = [] + +#Configurations to be used for the auto-generated input prometheus plugins for namespace filtering +@metricVersion = 2 +@monitorKubernetesPodsVersion = 2 +#@fieldPassSetting = "[\"envoy_cluster_upstream_rq_xx\", \"envoy_cluster_upstream_rq\"]" +@fieldPassSetting = "[\"envoy_cluster_upstream_cx_total\", \"envoy_cluster_upstream_cx_connect_fail\", \"envoy_cluster_upstream_rq\", \"envoy_cluster_upstream_rq_xx\", \"envoy_cluster_upstream_rq_total\", \"envoy_cluster_upstream_rq_time_bucket\", \"envoy_cluster_upstream_cx_rx_bytes_total\", \"envoy_cluster_upstream_cx_tx_bytes_total\", \"envoy_cluster_upstream_cx_active\"]" +@scrapeInterval = "1m" +@urlTag = "scrapeUrl" +@bearerToken = "/var/run/secrets/kubernetes.io/serviceaccount/token" +@responseTimeout = "15s" +@tlsCa = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" +@insecureSkipVerify = true + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@configMapMountPath)) + puts "config::configmap container-azm-ms-osmconfig for osm metrics found, parsing values" + parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted config map for osm metrics" + return parsedConfig + else + puts "config::configmap container-azm-ms-osmconfig for osm metrics not mounted, using defaults" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception 
while parsing config map for osm metrics: #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +def checkForTypeArray(arrayValue, arrayType) + if (arrayValue.nil? || (arrayValue.kind_of?(Array) && ((arrayValue.length == 0) || (arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType))))) + return true + else + return false + end +end + +# Use the ruby structure created after config parsing to set the right values to be used as environment variables +def populateSettingValuesFromConfigMap(parsedConfig) + begin + if !parsedConfig.nil? && + !parsedConfig[:osm_metric_collection_configuration].nil? && + !parsedConfig[:osm_metric_collection_configuration][:settings].nil? + osmPromMetricNamespaces = parsedConfig[:osm_metric_collection_configuration][:settings][:monitor_namespaces] + puts "config::osm::got:osm_metric_collection_configuration.settings.monitor_namespaces='#{osmPromMetricNamespaces}'" + + # Check to see if osm_metric_collection_configuration.settings has a valid setting for monitor_namespaces to enable scraping for specific namespaces + # Adding nil check here as well since checkForTypeArray returns true even if setting is nil to accomodate for other settings to be able - + # - to use defaults in case of nil settings + if !osmPromMetricNamespaces.nil? 
&& checkForTypeArray(osmPromMetricNamespaces, String) + # Adding a check to see if an empty array is passed for kubernetes namespaces + if (osmPromMetricNamespaces.length > 0) + @osmMetricNamespaces = osmPromMetricNamespaces + end + end + end + rescue => errorStr + puts "config::osm::error:Exception while reading config settings for osm configuration settings - #{errorStr}, using defaults" + @osmMetricNamespaces = [] + end +end + +def replaceOsmTelegrafConfigPlaceHolders + begin + #replace place holders in configuration file + tgfConfig = File.read(@tgfTestConfigFile) #read returns only after closing the file + + if @osmMetricNamespaces.length > 0 + osmPluginConfigsWithNamespaces = "" + @osmMetricNamespaces.each do |namespace| + if !namespace.nil? + #Stripping namespaces to remove leading and trailing whitespaces + namespace.strip! + if namespace.length > 0 + osmPluginConfigsWithNamespaces += "\n[[inputs.prometheus]] + name_prefix=\"container.azm.ms.osm/\" + interval = \"#{@scrapeInterval}\" + monitor_kubernetes_pods = true + pod_scrape_scope = \"#{(@controllerType.casecmp(@replicaset) == 0) ? 
"cluster" : "node"}\" + monitor_kubernetes_pods_namespace = \"#{namespace}\" + fieldpass = #{@fieldPassSetting} + metric_version = #{@metricVersion} + url_tag = \"#{@urlTag}\" + bearer_token = \"#{@bearerToken}\" + response_timeout = \"#{@responseTimeout}\" + tls_ca = \"#{@tlsCa}\" + insecure_skip_verify = #{@insecureSkipVerify}\n" + end + end + end + tgfConfig = tgfConfig.gsub("$AZMON_TELEGRAF_OSM_PROM_PLUGINS", osmPluginConfigsWithNamespaces) + else + puts "Using defaults for OSM configuration since there was an error in OSM config map or no namespaces were set" + tgfConfig = tgfConfig.gsub("$AZMON_TELEGRAF_OSM_PROM_PLUGINS", "") + end + File.open(@tgfTestConfigFile, "w") { |file| file.puts tgfConfig } # 'file' will be closed here after it goes out of scope + puts "config::osm::Successfully substituted the OSM placeholders in #{@tgfTestConfigFile} file in sidecar container" + rescue => errorStr + # TODO: test this scenario out + puts "config::osm::error:Exception while replacing telegraf configuration settings for osm - #{errorStr}, using defaults" + end +end + +@osmConfigSchemaVersion = ENV["AZMON_OSM_CFG_SCHEMA_VERSION"] +puts "****************Start OSM Config Processing********************" +if !@osmConfigSchemaVersion.nil? && !@osmConfigSchemaVersion.empty? && @osmConfigSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + # Check to see if the prometheus custom config parser has created a test config file so that we can replace the settings in the test file and run it, If not create + # a test config file by copying contents of the actual telegraf config file. 
+ if (!File.exist?(@tgfTestConfigFile)) + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + puts "test telegraf config file #{@tgfTestConfigFile} does not exist, creating new one" + FileUtils.cp(@tgfConfigFile, @tgfTestConfigFile) + end + + replaceOsmTelegrafConfigPlaceHolders() + + # Write the telemetry to file, so that they can be set as environment variables + telemetryFile = File.open("integration_osm_config_env_var", "w") + + if !telemetryFile.nil? + telemetryFile.write("export TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT=#{@osmMetricNamespaces.length}\n") + # Close file after writing all environment variables + telemetryFile.close + else + puts "config::osm::Exception while opening file for writing OSM telemetry environment variables" + end + end +else + if (File.file?(@configMapMountPath)) + ConfigParseErrorLogger.logError("config::osm::unsupported/missing config schema version - '#{@osmConfigSchemaVersion}' , using defaults, please use supported schema version") + else + puts "config::No configmap mounted for OSM config, using defaults" + end +end +puts "****************End OSM Config Processing********************" diff --git a/build/linux/installer/scripts/tomlparser-prom-customconfig.rb b/build/linux/installer/scripts/tomlparser-prom-customconfig.rb deleted file mode 100644 index 7aad580ee..000000000 --- a/build/linux/installer/scripts/tomlparser-prom-customconfig.rb +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/local/bin/ruby - -require_relative "tomlrb" -require_relative "ConfigParseErrorLogger" -require "fileutils" - -@promConfigMapMountPath = "/etc/config/settings/prometheus-data-collection-settings" -@replicaset = "replicaset" -@daemonset = "daemonset" -@configSchemaVersion = "" -@defaultDsInterval = "1m" -@defaultDsPromUrls = [] -@defaultDsFieldPass = [] -@defaultDsFieldDrop = [] -@defaultRsInterval = "1m" -@defaultRsPromUrls = [] -@defaultRsFieldPass = [] -@defaultRsFieldDrop = [] -@defaultRsK8sServices = [] 
-@defaultRsMonitorPods = false - -#Configurations to be used for the auto-generated input prometheus plugins for namespace filtering -@metricVersion = 2 -@urlTag = "scrapeUrl" -@bearerToken = "/var/run/secrets/kubernetes.io/serviceaccount/token" -@responseTimeout = "15s" -@tlsCa = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" -@insecureSkipVerify = true - -# Use parser to parse the configmap toml file to a ruby structure -def parseConfigMap - begin - # Check to see if config map is created - if (File.file?(@promConfigMapMountPath)) - puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values for prometheus config map" - parsedConfig = Tomlrb.load_file(@promConfigMapMountPath, symbolize_keys: true) - puts "config::Successfully parsed mounted prometheus config map" - return parsedConfig - else - puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults for prometheus scraping" - return nil - end - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while parsing config map for prometheus config: #{errorStr}, using defaults, please check config map for errors") - return nil - end -end - -def checkForTypeArray(arrayValue, arrayType) - if (arrayValue.nil? || (arrayValue.kind_of?(Array) && ((arrayValue.length == 0) || (arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType))))) - return true - else - return false - end -end - -def checkForType(variable, varType) - if variable.nil? 
|| variable.kind_of?(varType) - return true - else - return false - end -end - -def replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods) - begin - new_contents = new_contents.gsub("$AZMON_RS_PROM_MONITOR_PODS", ("monitor_kubernetes_pods = #{monitorKubernetesPods}")) - new_contents = new_contents.gsub("$AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER", "") - rescue => errorStr - puts "Exception while replacing default pod monitor settings: #{errorStr}" - end - return new_contents -end - -def createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting) - begin - new_contents = new_contents.gsub("$AZMON_RS_PROM_MONITOR_PODS", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_RS_PROM_MONITOR_PODS") - pluginConfigsWithNamespaces = "" - monitorKubernetesPodsNamespaces.each do |namespace| - if !namespace.nil? - #Stripping namespaces to remove leading and trailing whitespaces - namespace.strip! 
- if namespace.length > 0 - pluginConfigsWithNamespaces += "\n[[inputs.prometheus]] - interval = \"#{interval}\" - monitor_kubernetes_pods = true - monitor_kubernetes_pods_namespace = \"#{namespace}\" - fieldpass = #{fieldPassSetting} - fielddrop = #{fieldDropSetting} - metric_version = #{@metricVersion} - url_tag = \"#{@urlTag}\" - bearer_token = \"#{@bearerToken}\" - response_timeout = \"#{@responseTimeout}\" - tls_ca = \"#{@tlsCa}\" - insecure_skip_verify = #{@insecureSkipVerify}\n" - end - end - end - new_contents = new_contents.gsub("$AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER", pluginConfigsWithNamespaces) - return new_contents - rescue => errorStr - puts "Exception while creating prometheus input plugins to filter namespaces: #{errorStr}, using defaults" - replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods) - end -end - -# Use the ruby structure created after config parsing to set the right values to be used as environment variables -def populateSettingValuesFromConfigMap(parsedConfig) - # Checking to see if this is the daemonset or replicaset to parse config accordingly - controller = ENV["CONTROLLER_TYPE"] - if !controller.nil? - if !parsedConfig.nil? && !parsedConfig[:prometheus_data_collection_settings].nil? - if controller.casecmp(@replicaset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:cluster].nil? 
- #Get prometheus replicaset custom config settings - begin - interval = parsedConfig[:prometheus_data_collection_settings][:cluster][:interval] - fieldPass = parsedConfig[:prometheus_data_collection_settings][:cluster][:fieldpass] - fieldDrop = parsedConfig[:prometheus_data_collection_settings][:cluster][:fielddrop] - urls = parsedConfig[:prometheus_data_collection_settings][:cluster][:urls] - kubernetesServices = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_services] - monitorKubernetesPods = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods] - monitorKubernetesPodsNamespaces = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods_namespaces] - - # Check for the right datattypes to enforce right setting values - if checkForType(interval, String) && - checkForTypeArray(fieldPass, String) && - checkForTypeArray(fieldDrop, String) && - checkForTypeArray(kubernetesServices, String) && - checkForTypeArray(urls, String) && - (monitorKubernetesPods.nil? || (!monitorKubernetesPods.nil? && (!!monitorKubernetesPods == monitorKubernetesPods))) #Checking for Boolean type, since 'Boolean' is not defined as a type in ruby - puts "config::Successfully passed typecheck for config settings for replicaset" - #if setting is nil assign default values - interval = (interval.nil?) ? @defaultRsInterval : interval - fieldPass = (fieldPass.nil?) ? @defaultRsFieldPass : fieldPass - fieldDrop = (fieldDrop.nil?) ? @defaultRsFieldDrop : fieldDrop - kubernetesServices = (kubernetesServices.nil?) ? @defaultRsK8sServices : kubernetesServices - urls = (urls.nil?) ? @defaultRsPromUrls : urls - monitorKubernetesPods = (monitorKubernetesPods.nil?) ? 
@defaultRsMonitorPods : monitorKubernetesPods - - file_name = "/opt/telegraf-test-rs.conf" - # Copy the telegraf config file to a temp file to run telegraf in test mode with this config - FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf", file_name) - - puts "config::Starting to substitute the placeholders in telegraf conf copy file for replicaset" - #Replace the placeholder config values with values from custom config - text = File.read(file_name) - new_contents = text.gsub("$AZMON_RS_PROM_INTERVAL", interval) - fieldPassSetting = (fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]" - new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDPASS", fieldPassSetting) - fieldDropSetting = (fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]" - new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDDROP", fieldDropSetting) - new_contents = new_contents.gsub("$AZMON_RS_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) - new_contents = new_contents.gsub("$AZMON_RS_PROM_K8S_SERVICES", ((kubernetesServices.length > 0) ? ("[\"" + kubernetesServices.join("\",\"") + "\"]") : "[]")) - - # Check to see if monitor_kubernetes_pods is set to true with a valid setting for monitor_kubernetes_namespaces to enable scraping for specific namespaces - # Adding nil check here as well since checkForTypeArray returns true even if setting is nil to accomodate for other settings to be able - - # - to use defaults in case of nil settings - if monitorKubernetesPods && !monitorKubernetesPodsNamespaces.nil? 
&& checkForTypeArray(monitorKubernetesPodsNamespaces, String) - new_contents = createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting) - monitorKubernetesPodsNamespacesLength = monitorKubernetesPodsNamespaces.length - else - new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods) - monitorKubernetesPodsNamespacesLength = 0 - end - - File.open(file_name, "w") { |file| file.puts new_contents } - puts "config::Successfully substituted the placeholders in telegraf conf file for replicaset" - #Set environment variables for telemetry - file = File.open("telemetry_prom_config_env_var", "w") - if !file.nil? - file.write("export TELEMETRY_RS_PROM_INTERVAL=\"#{interval}\"\n") - #Setting array lengths as environment variables for telemetry purposes - file.write("export TELEMETRY_RS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") - file.write("export TELEMETRY_RS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") - file.write("export TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH=#{kubernetesServices.length}\n") - file.write("export TELEMETRY_RS_PROM_URLS_LENGTH=#{urls.length}\n") - file.write("export TELEMETRY_RS_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") - file.write("export TELEMETRY_RS_PROM_MONITOR_PODS_NS_LENGTH=\"#{monitorKubernetesPodsNamespacesLength}\"\n") - - # Close file after writing all environment variables - file.close - puts "config::Successfully created telemetry file for replicaset" - end - else - ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for replicaset, using defaults, please use right types for all settings") - end # end of type check condition - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for replicaset: #{errorStr}, using defaults") - setRsPromDefaults - puts "****************End Prometheus Config 
Processing********************" - end - elsif controller.casecmp(@daemonset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:node].nil? - #Get prometheus daemonset custom config settings - begin - interval = parsedConfig[:prometheus_data_collection_settings][:node][:interval] - fieldPass = parsedConfig[:prometheus_data_collection_settings][:node][:fieldpass] - fieldDrop = parsedConfig[:prometheus_data_collection_settings][:node][:fielddrop] - urls = parsedConfig[:prometheus_data_collection_settings][:node][:urls] - - # Check for the right datattypes to enforce right setting values - if checkForType(interval, String) && - checkForTypeArray(fieldPass, String) && - checkForTypeArray(fieldDrop, String) && - checkForTypeArray(urls, String) - puts "config::Successfully passed typecheck for config settings for daemonset" - - #if setting is nil assign default values - interval = (interval.nil?) ? @defaultDsInterval : interval - fieldPass = (fieldPass.nil?) ? @defaultDsFieldPass : fieldPass - fieldDrop = (fieldDrop.nil?) ? @defaultDsFieldDrop : fieldDrop - urls = (urls.nil?) ? @defaultDsPromUrls : urls - - file_name = "/opt/telegraf-test.conf" - # Copy the telegraf config file to a temp file to run telegraf in test mode with this config - FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf.conf", file_name) - - puts "config::Starting to substitute the placeholders in telegraf conf copy file for daemonset" - #Replace the placeholder config values with values from custom config - text = File.read(file_name) - new_contents = text.gsub("$AZMON_DS_PROM_INTERVAL", interval) - new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDPASS", ((fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]")) - new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDDROP", ((fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]")) - new_contents = new_contents.gsub("$AZMON_DS_PROM_URLS", ((urls.length > 0) ? 
("[\"" + urls.join("\",\"") + "\"]") : "[]")) - File.open(file_name, "w") { |file| file.puts new_contents } - puts "config::Successfully substituted the placeholders in telegraf conf file for daemonset" - - #Set environment variables for telemetry - file = File.open("telemetry_prom_config_env_var", "w") - if !file.nil? - file.write("export TELEMETRY_DS_PROM_INTERVAL=\"#{interval}\"\n") - #Setting array lengths as environment variables for telemetry purposes - file.write("export TELEMETRY_DS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") - file.write("export TELEMETRY_DS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") - file.write("export TELEMETRY_DS_PROM_URLS_LENGTH=#{urls.length}\n") - # Close file after writing all environment variables - file.close - puts "config::Successfully created telemetry file for daemonset" - end - else - ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for daemonset, using defaults, please use right types for all settings") - end # end of type check condition - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for daemonset: #{errorStr}, using defaults, please check correctness of configmap") - puts "****************End Prometheus Config Processing********************" - end - end # end of controller type check - end - else - ConfigParseErrorLogger.logError("Controller undefined while processing prometheus config, using defaults") - end -end - -@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] -puts "****************Start Prometheus Config Processing********************" -if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it - configMapSettings = parseConfigMap - if !configMapSettings.nil? 
- populateSettingValuesFromConfigMap(configMapSettings) - end -else - if (File.file?(@promConfigMapMountPath)) - ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported version") - else - puts "config::No configmap mounted for prometheus custom config, using defaults" - end -end -puts "****************End Prometheus Config Processing********************" diff --git a/build/version b/build/version index a8dc5b084..d70d1f9bc 100644 --- a/build/version +++ b/build/version @@ -2,11 +2,11 @@ # Build Version Information -CONTAINER_BUILDVERSION_MAJOR=10 -CONTAINER_BUILDVERSION_MINOR=0 +CONTAINER_BUILDVERSION_MAJOR=15 +CONTAINER_BUILDVERSION_MINOR=2 CONTAINER_BUILDVERSION_PATCH=0 -CONTAINER_BUILDVERSION_BUILDNR=6 -CONTAINER_BUILDVERSION_DATE=20200925 +CONTAINER_BUILDVERSION_BUILDNR=0 +CONTAINER_BUILDVERSION_DATE=20210512 CONTAINER_BUILDVERSION_STATUS=Developer_Build #-------------------------------- End of File ----------------------------------- diff --git a/build/windows/installer/certificategenerator/Program.cs b/build/windows/installer/certificategenerator/Program.cs index 43063c4be..e24d0e303 100644 --- a/build/windows/installer/certificategenerator/Program.cs +++ b/build/windows/installer/certificategenerator/Program.cs @@ -414,14 +414,12 @@ static void Main(string[] args) try { - if (!String.IsNullOrEmpty(Environment.GetEnvironmentVariable("WSKEY"))) - { - logAnalyticsWorkspaceSharedKey = Environment.GetEnvironmentVariable("WSKEY"); - } + // WSKEY isn't stored as an environment variable + logAnalyticsWorkspaceSharedKey = File.ReadAllText("C:/etc/omsagent-secret/KEY").Trim(); } catch (Exception ex) { - Console.WriteLine("Failed to read env variables (WSKEY)" + ex.Message); + Console.WriteLine("Failed to read secret (WSKEY)" + ex.Message); } try diff --git a/build/windows/installer/conf/fluent-bit.conf b/build/windows/installer/conf/fluent-bit.conf index 879ee4810..1eebe5fd6 
100644 --- a/build/windows/installer/conf/fluent-bit.conf +++ b/build/windows/installer/conf/fluent-bit.conf @@ -12,6 +12,15 @@ Chunk_Size 32 Buffer_Size 64 +[INPUT] + Name tcp + Tag oms.container.perf.telegraf.* + Listen 0.0.0.0 + Port 25229 + Chunk_Size 32 + Buffer_Size 64 + Mem_Buf_Limit 5m + [OUTPUT] Name oms EnableTelemetry true diff --git a/build/windows/installer/conf/fluent.conf b/build/windows/installer/conf/fluent.conf index c96300b1e..d5eb475ca 100644 --- a/build/windows/installer/conf/fluent.conf +++ b/build/windows/installer/conf/fluent.conf @@ -6,7 +6,8 @@ @type tail - path /var/log/containers/*.log + path "#{ENV['AZMON_LOG_TAIL_PATH']}" + exclude_path "#{ENV['AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH']}" pos_file /var/opt/microsoft/fluent/fluentd-containers.log.pos tag oms.container.log.la @log_level trace @@ -28,6 +29,14 @@ @include fluent-docker-parser.conf + + @type grep + + key stream + pattern "#{ENV['AZMON_LOG_EXCLUSION_REGEX_PATTERN']}" + + + @type record_transformer # fluent-plugin-record-modifier more light-weight but needs to be installed (dependency worth it?) @@ -37,7 +46,6 @@ - @type forward send_timeout 60s diff --git a/build/windows/installer/conf/telegraf.conf b/build/windows/installer/conf/telegraf.conf new file mode 100644 index 000000000..5f4d2364e --- /dev/null +++ b/build/windows/installer/conf/telegraf.conf @@ -0,0 +1,162 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply prepend +# them with $. 
For strings the variable must be within quotes (ie, "$STR_VAR"), +# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) + + +# Global tags can be specified here in key="value" format. +[global_tags] + hostName = "placeholder_hostname" + + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "60s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "15s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. 
+ ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = false + ## Run telegraf in quiet mode (error log messages only). + quiet = true + ## Specify the log file name. The empty string means to log to stderr. + logfile = "" + ## Override default hostname, if empty use os.Hostname() + #hostname = "placeholder_hostname" + ## If set to true, do not set the "host" tag in the telegraf agent. + omit_hostname = true + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Generic socket writer capable of handling multiple socket types. +[[outputs.socket_writer]] + ## URL to connect to + address = "tcp://0.0.0.0:25229" + # address = "tcp://example.com:http" + # address = "tcp4://127.0.0.1:8094" + # address = "tcp6://127.0.0.1:8094" + # address = "tcp6://[2001:db8::1]:8094" + # address = "udp://127.0.0.1:8094" + # address = "udp4://127.0.0.1:8094" + # address = "udp6://127.0.0.1:8094" + # address = "unix:///tmp/telegraf.sock" + # address = "unixgram:///tmp/telegraf.sock" + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + + ## Period between keep alive probes. + ## Only applies to TCP sockets. + ## 0 disables keep alive probes. + ## Defaults to the OS configuration. + # keep_alive_period = "5m" + + ## Data format to generate. 
+ ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md + data_format = "json" + namedrop = ["agent_telemetry"] + #tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", "Computer", "ControllerType"] + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + +[[processors.converter]] + [processors.converter.fields] + float = ["*"] + +#Prometheus Custom Metrics +[[inputs.prometheus]] + interval = "$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL" + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. + ## - prometheus.io/port: If port is not 9102 use this annotation + $AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS + $AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR + + fieldpass = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS + fielddrop = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP + + metric_version = 2 + url_tag = "scrapeUrl" + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Use bearer token for authorization. 
('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + #tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification + insecure_skip_verify = true + +$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER diff --git a/build/windows/installer/scripts/livenessprobe.cmd b/build/windows/installer/scripts/livenessprobe.cmd index 06d577f31..19d0b69d7 100644 --- a/build/windows/installer/scripts/livenessprobe.cmd +++ b/build/windows/installer/scripts/livenessprobe.cmd @@ -1,40 +1,32 @@ -echo "Checking if fluent-bit is running" +REM "Checking if fluent-bit is running" tasklist /fi "imagename eq fluent-bit.exe" /fo "table" | findstr fluent-bit IF ERRORLEVEL 1 ( - echo "Fluent-Bit is not running" > /dev/termination-log + echo "Fluent-Bit is not running" exit /b 1 -) ELSE ( - echo "Fluent-Bit is running" ) -echo "Checking if config map has been updated since agent start" +REM "Checking if config map has been updated since agent start" IF EXIST C:\etc\omsagentwindows\filesystemwatcher.txt ( - echo "Config Map Updated since agent started" > /dev/termination-log + echo "Config Map Updated since agent started" exit /b 1 -) ELSE ( - echo "Config Map not Updated since agent start" ) -echo "Checking if certificate needs to be renewed (aka agent restart required)" +REM "Checking if certificate needs to be renewed (aka agent restart required)" IF EXIST C:\etc\omsagentwindows\renewcertificate.txt ( - echo "Certificate needs to be renewed" > /dev/termination-log + echo "Certificate needs to be renewed" exit /b 1 -) ELSE ( - echo "Certificate does NOT need to be renewd" ) -echo "Checking if fluentd service is running" +REM "Checking if fluentd service 
is running" sc query fluentdwinaks | findstr /i STATE | findstr RUNNING IF ERRORLEVEL 1 ( - echo "Fluentd Service is NOT Running" > /dev/termination-log + echo "Fluentd Service is NOT Running" exit /b 1 -) ELSE ( - echo "Fluentd Service is Running" ) exit /b 0 diff --git a/charts/azuremonitor-containers/Chart.yaml b/charts/azuremonitor-containers/Chart.yaml index 8976b5561..00f3f49ed 100644 --- a/charts/azuremonitor-containers/Chart.yaml +++ b/charts/azuremonitor-containers/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v1 appVersion: 7.0.0-1 description: Helm chart for deploying Azure Monitor container monitoring agent in Kubernetes name: azuremonitor-containers -version: 2.7.4 +version: 2.8.3 kubeVersion: "^1.10.0-0" keywords: - monitoring diff --git a/charts/azuremonitor-containers/README.md b/charts/azuremonitor-containers/README.md index 3b357ffd5..a3f17b509 100644 --- a/charts/azuremonitor-containers/README.md +++ b/charts/azuremonitor-containers/README.md @@ -29,6 +29,8 @@ Monitoring your Kubernetes cluster and containers is critical, especially when r ## Installing the Chart +> Note: If you want to customize the chart, fork the chart code in https://github.com/microsoft/Docker-Provider/tree/ci_prod/charts/azuremonitor-containers + > Note: `--name` flag not required in Helm3 since this flag is deprecated > Note: use `omsagent.proxy` parameter to set the proxy endpoint if your K8s cluster configured behind the proxy. Refer to [configure proxy](#Configuring-Proxy-Endpoint) for more details about proxy. 
@@ -36,25 +38,25 @@ Monitoring your Kubernetes cluster and containers is critical, especially when r ### To Use Azure Log Analytics Workspace in Public Cloud ```bash -$ helm repo add incubator https://kubernetes-charts-incubator.storage.googleapis.com/ +$ helm repo add microsoft https://microsoft.github.io/charts/repo $ helm install --name azmon-containers-release-1 \ ---set omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName= incubator/azuremonitor-containers +--set omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName= microsoft/azuremonitor-containers ``` ### To Use Azure Log Analytics Workspace in Azure China Cloud ```bash -$ helm repo add incubator https://kubernetes-charts-incubator.storage.googleapis.com/ +$ helm repo add microsoft https://microsoft.github.io/charts/repo $ helm install --name azmon-containers-release-1 \ ---set omsagent.domain=opinsights.azure.cn,omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName= incubator/azuremonitor-containers +--set omsagent.domain=opinsights.azure.cn,omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName= microsoft/azuremonitor-containers ``` ### To Use Azure Log Analytics Workspace in Azure US Government Cloud ```bash -$ helm repo add incubator https://kubernetes-charts-incubator.storage.googleapis.com/ +$ helm repo add microsoft https://microsoft.github.io/charts/repo $ helm install --name azmon-containers-release-1 \ ---set omsagent.domain=opinsights.azure.us,omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName= incubator/azuremonitor-containers +--set omsagent.domain=opinsights.azure.us,omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName= microsoft/azuremonitor-containers ``` ## Upgrading an existing Release to a new version @@ -91,6 +93,7 @@ The following table lists the configurable parameters of the MSOMS chart and the | `omsagent.env.clusterName` | Name of your cluster | Does not have a default value, needs to be provided 
| | `omsagent.rbac` | rbac enabled/disabled | true (i.e.enabled) | | `omsagent.proxy` | Proxy endpoint | Doesnt have default value. Refer to [configure proxy](#Configuring-Proxy-Endpoint) | +| `omsagent.priority` | DaemonSet Pod Priority | This is the [priority](https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/) to use for the daemonsets such that they get scheduled onto the node ahead of "normal" pods - must be an integer, defaults to 10 | > Note: For Azure Manage K8s clusters such as Azure Arc K8s and ARO v4, `omsagent.env.clusterId` with fully qualified azure resource id of the cluster should be used instead of `omsagent.env.clusterName` @@ -98,6 +101,7 @@ The following table lists the configurable parameters of the MSOMS chart and the - Parameter `omsagent.env.doNotCollectKubeSystemLogs` has been removed starting chart version 1.0.0. Refer to 'Agent data collection settings' section below to configure it using configmap. - onboarding of multiple clusters with the same cluster name to same log analytics workspace not supported. If need this configuration, use the cluster FQDN name rather than cluster dns prefix to avoid collision with clusterName +- The `omsagent.priority` parameter sets the priority of the omsagent daemonset priority class. This pod priority class is used for daemonsets to allow them to have priority over pods that can be scheduled elsewhere. Without a priority class, it is possible for a node to fill up with "normal" pods before the daemonset pods get to be created for the node or get scheduled. Note that pods are not "daemonset" pods - they are just pods created by the daemonset controller but they have a specific affinity set during creation to the specific node each pod was created to run on. You want this value to be greater than 0 (default is 10) and generally greater than pods that have the flexibility to run on different nodes such that they do not block the node specific pods. 
## Agent data collection settings @@ -112,13 +116,13 @@ Specify each parameter using the `--set key=value[,key=value]` argument to `helm $ helm install --name myrelease-1 \ --set omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName= - incubator/azuremonitor-containers + microsoft/azuremonitor-containers ``` Alternatively, a YAML file that specifies the values for the parameters can be provided while installing the chart. For example, ```bash -$ helm install --name myrelease-1 -f values.yaml incubator/azuremonitor-containers +$ helm install --name myrelease-1 -f values.yaml microsoft/azuremonitor-containers ``` diff --git a/charts/azuremonitor-containers/templates/NOTES.txt b/charts/azuremonitor-containers/templates/NOTES.txt index 372cecb95..48ebf33fc 100644 --- a/charts/azuremonitor-containers/templates/NOTES.txt +++ b/charts/azuremonitor-containers/templates/NOTES.txt @@ -29,7 +29,7 @@ This deployment will not complete. To proceed, run --set omsagent.secret.wsid= \ --set omsagent.secret.key= \ --set omsagent.env.clusterName= \ - incubator/azuremonitor-containers + microsoft/azuremonitor-containers {{- else -}} diff --git a/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml b/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml index ebdd5ea3f..b7482b8b5 100644 --- a/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml @@ -1,4 +1,18 @@ {{- if or ( contains "microsoft.kubernetes/connectedclusters" (.Values.Azure.Cluster.ResourceId | lower) ) ( contains "microsoft.kubernetes/connectedclusters" (.Values.omsagent.env.clusterId | lower)) }} +#extension model +{{- if not (empty .Values.Azure.Extension.Name) }} +apiVersion: clusterconfig.azure.com/v1beta1 +kind: AzureExtensionIdentity +metadata: + name: {{ .Values.Azure.Extension.Name }} + namespace: azure-arc +spec: + serviceAccounts: + - name: omsagent + namespace: kube-system + 
tokenNamespace: azure-arc +--- +{{- end }} apiVersion: clusterconfig.azure.com/v1beta1 kind: AzureClusterIdentityRequest metadata: @@ -6,4 +20,7 @@ metadata: namespace: azure-arc spec: audience: https://monitoring.azure.com/ + {{- if not (empty .Values.Azure.Extension.Name) }} + resourceId: {{ .Values.Azure.Extension.Name }} + {{- end }} {{- end }} diff --git a/charts/azuremonitor-containers/templates/omsagent-crd.yaml b/charts/azuremonitor-containers/templates/omsagent-crd.yaml index f4a028bd3..bbaf89a52 100644 --- a/charts/azuremonitor-containers/templates/omsagent-crd.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-crd.yaml @@ -1,3 +1,4 @@ +{{- if semverCompare "<1.19-0" .Capabilities.KubeVersion.GitVersion }} apiVersion: apiextensions.k8s.io/v1beta1 kind: CustomResourceDefinition metadata: @@ -10,3 +11,26 @@ spec: names: plural: healthstates kind: HealthState +{{- else }} +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: healthstates.azmon.container.insights + namespace: kube-system +spec: + group: azmon.container.insights + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + properties: + state: + type: string + scope: Namespaced + names: + plural: healthstates + kind: HealthState +{{- end }} \ No newline at end of file diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml index 7acd46c37..580ef9d15 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml @@ -24,9 +24,21 @@ spec: agentVersion: {{ .Values.omsagent.image.tagWindows }} dockerProviderVersion: {{ .Values.omsagent.image.dockerProviderVersion }} schema-versions: "v1" + checksum/secret: {{ include (print $.Template.BasePath "/omsagent-secret.yaml") . 
| sha256sum }} + checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }} spec: + priorityClassName: omsagent + dnsConfig: + options: + - name: ndots + value: "3" +{{- if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion }} + nodeSelector: + kubernetes.io/os: windows +{{- else }} nodeSelector: beta.kubernetes.io/os: windows +{{- end }} {{- if .Values.omsagent.rbac }} serviceAccountName: omsagent {{- end }} @@ -39,7 +51,7 @@ spec: {{- end }} imagePullPolicy: IfNotPresent resources: -{{ toYaml .Values.omsagent.resources.daemonset | indent 9 }} +{{ toYaml .Values.omsagent.resources.daemonsetwindows | indent 9 }} env: {{- if ne .Values.omsagent.env.clusterId "" }} - name: AKS_RESOURCE_ID @@ -69,6 +81,12 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: PODNAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SIDECAR_SCRAPING_ENABLED + value: "false" volumeMounts: - mountPath: C:\ProgramData\docker\containers name: docker-windows-containers @@ -89,6 +107,7 @@ spec: - C:\opt\omsagentwindows\scripts\cmd\livenessProbe.cmd periodSeconds: 60 initialDelaySeconds: 180 + timeoutSeconds: 15 {{- with .Values.omsagent.tolerations }} tolerations: {{- toYaml . 
| nindent 8 }} {{- end }} @@ -99,6 +118,7 @@ spec: - name: docker-windows-containers hostPath: path: C:\ProgramData\docker\containers + type: DirectoryOrCreate - name: settings-vol-config configMap: name: container-azm-ms-agentconfig diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml index 7514247a0..7201ee6ae 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml @@ -24,7 +24,15 @@ spec: agentVersion: {{ .Values.omsagent.image.tag }} dockerProviderVersion: {{ .Values.omsagent.image.dockerProviderVersion }} schema-versions: "v1" + checksum/secret: {{ include (print $.Template.BasePath "/omsagent-secret.yaml") . | sha256sum }} + checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }} + checksum/logsettings: {{ toYaml .Values.omsagent.logsettings | sha256sum }} spec: + priorityClassName: omsagent + dnsConfig: + options: + - name: ndots + value: "3" {{- if .Values.omsagent.rbac }} serviceAccountName: omsagent {{- end }} @@ -37,7 +45,7 @@ spec: {{- end }} imagePullPolicy: IfNotPresent resources: -{{ toYaml .Values.omsagent.resources.daemonset | indent 9 }} +{{ toYaml .Values.omsagent.resources.daemonsetlinux | indent 9 }} env: {{- if ne .Values.omsagent.env.clusterId "" }} - name: AKS_RESOURCE_ID @@ -63,6 +71,10 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + {{- if not (empty .Values.Azure.Extension.Name) }} + - name: ARC_K8S_EXTENSION_NAME + value: {{ .Values.Azure.Extension.Name | quote }} + {{- end }} - name: USER_ASSIGNED_IDENTITY_CLIENT_ID value: "" {{- if .Values.omsagent.logsettings.logflushintervalsecs }} @@ -77,6 +89,8 @@ spec: - name: FBIT_TAIL_BUFFER_MAX_SIZE value: {{ .Values.omsagent.logsettings.tailbufmaxsizemegabytes | quote }} {{- end }} + - name: ISTEST + value: {{ .Values.omsagent.ISTEST | quote }} securityContext: privileged: true ports: @@ 
-117,6 +131,7 @@ spec: - "/opt/livenessprobe.sh" initialDelaySeconds: 60 periodSeconds: 60 + timeoutSeconds: 15 {{- with .Values.omsagent.daemonset.affinity }} affinity: {{- toYaml . | nindent 8 }} {{- end }} diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml index 7d7ac7040..fdc520cba 100644 --- a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml @@ -25,6 +25,9 @@ spec: agentVersion: {{ .Values.omsagent.image.tag }} dockerProviderVersion: {{ .Values.omsagent.image.dockerProviderVersion }} schema-versions: "v1" + checksum/secret: {{ include (print $.Template.BasePath "/omsagent-secret.yaml") . | sha256sum }} + checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }} + checksum/logsettings: {{ toYaml .Values.omsagent.logsettings | sha256sum }} spec: {{- if .Values.omsagent.rbac }} serviceAccountName: omsagent @@ -64,8 +67,16 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + {{- if not (empty .Values.Azure.Extension.Name) }} + - name: ARC_K8S_EXTENSION_NAME + value: {{ .Values.Azure.Extension.Name | quote }} + {{- end }} - name: USER_ASSIGNED_IDENTITY_CLIENT_ID value: "" + - name: SIDECAR_SCRAPING_ENABLED + value: "false" + - name: ISTEST + value: {{ .Values.omsagent.ISTEST | quote }} securityContext: privileged: true ports: @@ -100,6 +111,9 @@ spec: - mountPath: /etc/config/settings/adx name: omsagent-adx-secret readOnly: true + - mountPath: /etc/config/osm-settings + name: osm-settings-vol-config + readOnly: true livenessProbe: exec: command: @@ -108,6 +122,7 @@ spec: - "/opt/livenessprobe.sh" initialDelaySeconds: 60 periodSeconds: 60 + timeoutSeconds: 15 {{- with .Values.omsagent.deployment.affinity }} affinity: {{- toYaml . 
| nindent 8 }} {{- end }} @@ -149,4 +164,8 @@ spec: secret: secretName: omsagent-adx-secret optional: true + - name: osm-settings-vol-config + configMap: + name: container-azm-ms-osmconfig + optional: true {{- end }} diff --git a/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml b/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml new file mode 100644 index 000000000..4d9980ab3 --- /dev/null +++ b/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml @@ -0,0 +1,22 @@ +{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId "") (ne .Values.Azure.Cluster.ResourceId "") )}} +# This pod priority class is used for daemonsets to allow them to have priority +# over pods that can be scheduled elsewhere. Without a priority class, it is +# possible for a node to fill up with pods before the daemonset pods get to be +# created for the node or get scheduled. 
Note that pods are not "daemonset" +# pods - they are just pods created by the daemonset controller but they have +# a specific affinity set during creation to the specific node each pod was +# created to run on (daemonset controller takes care of that) +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: omsagent + # Priority classes don't have labels :-) + annotations: + chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} + component: oms-agent +value: {{ .Values.omsagent.priority }} +globalDefault: false +description: "This is the daemonset priority class for omsagent" +{{- end }} diff --git a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml index 4f7408e7c..c0a6e3722 100644 --- a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml @@ -19,7 +19,7 @@ metadata: heritage: {{ .Release.Service }} rules: - apiGroups: [""] - resources: ["pods", "events", "nodes", "nodes/stats", "nodes/metrics", "nodes/spec", "nodes/proxy", "namespaces", "services"] + resources: ["pods", "events", "nodes", "nodes/stats", "nodes/metrics", "nodes/spec", "nodes/proxy", "namespaces", "services", "persistentvolumes"] verbs: ["list", "get", "watch"] - apiGroups: ["apps", "extensions", "autoscaling"] resources: ["replicasets", "deployments", "horizontalpodautoscalers"] @@ -28,15 +28,19 @@ rules: resources: ["healthstates"] verbs: ["get", "create", "patch"] - apiGroups: ["clusterconfig.azure.com"] - resources: ["azureclusteridentityrequests"] + resources: ["azureclusteridentityrequests", "azureclusteridentityrequests/status"] resourceNames: ["container-insights-clusteridentityrequest"] verbs: ["get", "create", "patch"] - nonResourceURLs: ["/metrics"] verbs: ["get"] +#arc k8s extension model grants access as part of the extension msi +#remove this 
explicit permission once the extension available in public preview +{{- if (empty .Values.Azure.Extension.Name) }} - apiGroups: [""] resources: ["secrets"] resourceNames: ["container-insights-clusteridentityrequest-token"] verbs: ["get"] +{{- end }} --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1beta1 diff --git a/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml b/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml index ee0664495..fc7c471f8 100644 --- a/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml @@ -18,7 +18,14 @@ data: tag oms.containerinsights.KubePodInventory run_interval 60 log_level debug - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast + + + #Kubernetes Persistent Volume inventory + + type kubepvinventory + tag oms.containerinsights.KubePVInventory + run_interval 60 + log_level debug #Kubernetes events @@ -70,14 +77,12 @@ data: type filter_inventory2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast log_level info # custom_metrics_mdm filter plugin for perf data from windows nodes type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes log_level info @@ -90,7 +95,7 @@ data: type out_oms log_level debug - num_threads 
5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer @@ -102,12 +107,27 @@ data: max_retry_wait 5m - + type out_oms log_level debug num_threads 5 buffer_chunk_limit 4m buffer_type file + buffer_path %STATE_DIR_WS%/state/out_oms_kubepv*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + + + + type out_oms + log_level debug + num_threads 2 + buffer_chunk_limit 4m + buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer buffer_queue_limit 20 buffer_queue_full_action drop_oldest_chunk @@ -135,7 +155,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer @@ -164,7 +184,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer diff --git a/charts/azuremonitor-containers/templates/omsagent-secret.yaml b/charts/azuremonitor-containers/templates/omsagent-secret.yaml index 1a7f087ed..8c245338c 100644 --- a/charts/azuremonitor-containers/templates/omsagent-secret.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-secret.yaml @@ -13,7 +13,19 @@ data: WSID: {{ required "A valid workspace id is required!" .Values.omsagent.secret.wsid | b64enc | quote }} KEY: {{ required "A valid workspace key is required!" 
.Values.omsagent.secret.key | b64enc | quote }} DOMAIN: {{ .Values.omsagent.domain | b64enc | quote }} - {{- if ne .Values.omsagent.proxy "" }} + {{- $httpsProxyDict := urlParse .Values.Azure.proxySettings.httpsProxy -}} + {{- $httpProxyDict := urlParse .Values.Azure.proxySettings.httpProxy -}} + {{- if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpsProxy)) ($httpsProxyDict.userinfo) }} + PROXY: {{ .Values.Azure.proxySettings.httpsProxy | b64enc | quote }} + {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpsProxy)) (empty $httpsProxyDict.userinfo) }} + # adding arbitrary creds since omsagent expects arbitrary creds in case of no auth + PROXY: {{ urlJoin (dict "scheme" $httpsProxyDict.scheme "userinfo" "admin:secret" "host" $httpsProxyDict.host) | b64enc | quote }} + {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpProxy)) ($httpProxyDict.userinfo) }} + PROXY: {{ .Values.Azure.proxySettings.httpProxy | b64enc | quote }} + {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpProxy)) (empty $httpProxyDict.userinfo) }} + # adding arbitrary creds since omsagent expects arbitrary creds in case of no auth + PROXY: {{ urlJoin (dict "scheme" $httpProxyDict.scheme "userinfo" "admin:secret" "host" $httpProxyDict.host) | b64enc | quote }} + {{- else if ne .Values.omsagent.proxy "" }} PROXY: {{ .Values.omsagent.proxy | b64enc | quote }} {{- end }} {{- end }} diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 5d519bd9a..2691e9950 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -4,19 +4,46 @@ ## Microsoft OMS Agent image for kubernetes cluster monitoring ## ref: https://github.com/microsoft/Docker-Provider/tree/ci_prod -## Values of ResourceId and Region under Azure->Cluster being 
populated by Azure Arc K8s RP during the installation of the extension +## Values under Azure are being populated by Azure Arc K8s RP during the installation of the extension Azure: Cluster: Region: ResourceId: + Extension: + Name: "" + ResourceId: "" + proxySettings: + isProxyEnabled: false + httpProxy: "" + httpsProxy: "" + noProxy: "" + proxyCert: "" omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod09252020" - tagWindows: "win-ciprod09252020" + tag: "ciprod05202021-hotfix" + tagWindows: "win-ciprod04222021" pullPolicy: IfNotPresent - dockerProviderVersion: "10.0.0-6" + dockerProviderVersion: "15.0.0-0" agentVersion: "1.10.0.1" + + # The priority used by the omsagent priority class for the daemonset pods + # Note that this is not execution priority - it is scheduling priority, as + # in getting scheduled to the node. This needs to be greater than 0 such + # that the daemonset pods, which can not schedule onto different nodes as + # they are defined to run on specific nodes, are not accidentally frozen + # out of a node due to other pods showing up earlier in scheduling. + # (DaemonSet pods by definition only are created once the node exists for + # them to be created for and thus it is possible to have "normal" pods + # already in line to run on the node before the DaemonSet controller got a + # chance to build pod for the node and give it to the scheduler) + # Should be some number greater than default (0) + priority: 10 + + # This is used for running agent pods in test mode. + # if set to true additional agent workflow logs will be emitted which are used for e2e and arc k8s conformance testing + ISTEST: false + ## To get your workspace id and key do the following ## You can create a Azure Loganalytics workspace from portal.azure.com and get its ID & PRIMARY KEY from 'Advanced Settings' tab in the Ux.
@@ -56,6 +83,21 @@ omsagent: affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - labelSelector: + matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + - key: type + operator: NotIn + values: + - virtual-kubelet + - key: kubernetes.io/arch + operator: In + values: + - amd64 nodeSelectorTerms: - labelSelector: matchExpressions: @@ -67,10 +109,42 @@ omsagent: operator: NotIn values: - virtual-kubelet + - key: beta.kubernetes.io/arch + operator: In + values: + - amd64 deployment: affinity: nodeAffinity: + # affinity to schedule on to ephemeral os node if its available + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: storageprofile + operator: NotIn + values: + - managed requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - labelSelector: + matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + - key: type + operator: NotIn + values: + - virtual-kubelet + - key: kubernetes.io/role + operator: NotIn + values: + - master + - key: kubernetes.io/arch + operator: In + values: + - amd64 nodeSelectorTerms: - labelSelector: matchExpressions: @@ -86,21 +160,29 @@ omsagent: operator: NotIn values: - master + - key: beta.kubernetes.io/arch + operator: In + values: + - amd64 ## Configure resource requests and limits ## ref: http://kubernetes.io/docs/user-guide/compute-resources/ ## resources: - daemonset: + daemonsetlinux: requests: cpu: 75m memory: 225Mi limits: cpu: 150m memory: 600Mi + daemonsetwindows: + limits: + cpu: 200m + memory: 600Mi deployment: requests: cpu: 150m memory: 250Mi limits: cpu: 1 - memory: 750Mi + memory: 1Gi diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/Parameters/ContainerInsightsExtension.Parameters.json b/deployment/arc-k8s-extension/ServiceGroupRoot/Parameters/ContainerInsightsExtension.Parameters.json new file mode 100644 index 000000000..a8a99e9f6 --- /dev/null +++ 
b/deployment/arc-k8s-extension/ServiceGroupRoot/Parameters/ContainerInsightsExtension.Parameters.json @@ -0,0 +1,66 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutParameters.json", + "contentVersion": "1.0.0.0", + "wait": [ + { + "name": "waitSdpBakeTime", + "properties": { + "duration": "PT24H" + } + } + ], + "shellExtensions": [ + { + "name": "PushChartToACR", + "type": "ShellExtensionType", + "properties": { + "maxexecutiontime": "PT1H" + }, + "package": { + "reference": { + "path": "artifacts.tar.gz" + } + }, + "launch": { + "command": [ + "/bin/bash", + "pushChartToAcr.sh" + ], + "environmentVariables": [ + { + "name": "RELEASE_STAGE", + "value": "__RELEASE_STAGE__" + }, + { + "name": "ACR_APP_ID", + "reference": { + "provider": "AzureKeyVault", + "parameters": { + "secretId": "https://cibuildandreleasekv.vault.azure.net/secrets/ciprodacrappid/e8f47bf7505741ebaf65a4db16ff9fa7" + } + }, + "asSecureValue": "true" + }, + { + "name": "ACR_APP_SECRET", + "reference": { + "provider": "AzureKeyVault", + "parameters": { + "secretId": "https://cibuildandreleasekv.vault.azure.net/secrets/ciprodacrappsecret/8718afcdac114accb8b26f613cef1e1e" + } + }, + "asSecureValue": "true" + }, + { + "name": "ACR_NAME", + "value": "__ACR_NAME__" + }, + { + "name": "CHART_VERSION", + "value": "__CHART_VERSION__" + } + ] + } + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Canary.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Canary.RolloutSpec.json new file mode 100644 index 000000000..cde103633 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Canary.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": 
"ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-Canary", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-Canary", + "actions": [ "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.FF.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.FF.RolloutSpec.json new file mode 100644 index 000000000..1749296c8 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.FF.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-FF", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-FF", + "actions": [ "wait/waitSdpBakeTime", "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.HighLoad.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.HighLoad.RolloutSpec.json new file mode 100644 index 000000000..50729b1ae --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.HighLoad.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": 
"http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-Prod3", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-HighLoad", + "actions": [ "wait/waitSdpBakeTime", "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.LightLoad.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.LightLoad.RolloutSpec.json new file mode 100644 index 000000000..edd61f852 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.LightLoad.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-Prod2", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-LightLoad", + "actions": [ "wait/waitSdpBakeTime", "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MC.RolloutSpec.json 
b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MC.RolloutSpec.json new file mode 100644 index 000000000..014f4b092 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MC.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-MC", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-MC", + "actions": [ "wait/waitSdpBakeTime", "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MediumLoad.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MediumLoad.RolloutSpec.json new file mode 100644 index 000000000..cd1befbc3 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MediumLoad.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-Prod2", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": 
"PushChartToACR-MediumLoad", + "actions": ["wait/waitSdpBakeTime", "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Pilot.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Pilot.RolloutSpec.json new file mode 100644 index 000000000..48c99fce1 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Pilot.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-Pilot", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-Pilot", + "actions": [ "wait/waitSdpBakeTime", "Shell/PushChartToACR"], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json b/deployment/arc-k8s-extension/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json new file mode 100644 index 000000000..516eba3e2 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json @@ -0,0 +1,125 @@ +{ + "$schema": "https://ev2schema.azure.net/schemas/2020-01-01/scopeBindings.json", + "contentVersion": "0.0.0.1", + "scopeBindings": [ + { + "scopeTagName": "Canary", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "Canary" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + 
"scopeTagName": "Pilot", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "Pilot" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + "scopeTagName": "LightLoad", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "MediumLow" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + "scopeTagName": "MediumLoad", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "MediumHigh" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + "scopeTagName": "HighLoad", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "HighLoad" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + "scopeTagName": "FF", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "FF" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + "scopeTagName": "MC", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "MC" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh b/deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh new file mode 100644 index 000000000..520557592 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +export HELM_EXPERIMENTAL_OCI=1 +export MCR_NAME="mcr.microsoft.com" +# for prod-> stable and 
for test -> preview +export REPO_TYPE="stable" + +# repo paths for arc k8s extension roll-out +# canary region +export CANARY_REGION_REPO_PATH="azuremonitor/containerinsights/canary/${REPO_TYPE}/azuremonitor-containers" +# pilot region +export PILOT_REGION_REPO_PATH="azuremonitor/containerinsights/prod1/${REPO_TYPE}/azuremonitor-containers" +# light load regions +export LIGHT_LOAD_REGION_REPO_PATH="azuremonitor/containerinsights/prod2/${REPO_TYPE}/azuremonitor-containers" +# medium load regions +export MEDIUM_LOAD_REGION_REPO_PATH="azuremonitor/containerinsights/prod3/${REPO_TYPE}/azuremonitor-containers" +# high load regions +export HIGH_LOAD_REGION_REPO_PATH="azuremonitor/containerinsights/prod4/${REPO_TYPE}/azuremonitor-containers" +# FairFax regions +export FF_REGION_REPO_PATH="azuremonitor/containerinsights/prod5/${REPO_TYPE}/azuremonitor-containers" +# Mooncake regions +export MC_REGION_REPO_PATH="azuremonitor/containerinsights/prod6/${REPO_TYPE}/azuremonitor-containers" + +# pull chart from previous stage mcr and push chart to next stage acr +pull_chart_from_source_mcr_to_push_to_dest_acr() { + srcMcrFullPath=${1} + destAcrFullPath=${2} + + if [ -z $srcMcrFullPath ]; then + echo "-e error source mcr path must be provided " + exit 1 + fi + + if [ -z $destAcrFullPath ]; then + echo "-e error dest acr path must be provided " + exit 1 + fi + + echo "Pulling chart from MCR:${srcMcrFullPath} ..." + helm chart pull ${srcMcrFullPath} + if [ $? -eq 0 ]; then + echo "Pulling chart from MCR:${srcMcrFullPath} completed successfully." + else + echo "-e error Pulling chart from MCR:${srcMcrFullPath} failed. Please review Ev2 pipeline logs for more details on the error." + exit 1 + fi + + echo "Exporting chart to current directory ..." + helm chart export ${srcMcrFullPath} + if [ $? -eq 0 ]; then + echo "Exporting chart to current directory completed successfully." + else + echo "-e error Exporting chart to current directory failed. 
Please review Ev2 pipeline logs for more details on the error." + exit 1 + fi + + echo "save the chart locally with dest acr full path : ${destAcrFullPath} ..." + helm chart save azuremonitor-containers/ ${destAcrFullPath} + if [ $? -eq 0 ]; then + echo "save the chart locally with dest acr full path : ${destAcrFullPath} completed successfully." + else + echo "-e error save the chart locally with dest acr full path : ${destAcrFullPath} failed. Please review Ev2 pipeline logs for more details on the error." + exit 1 + fi + + echo "pushing the chart to acr path: ${destAcrFullPath} ..." + helm chart push ${destAcrFullPath} + if [ $? -eq 0 ]; then + echo "pushing the chart to acr path: ${destAcrFullPath} completed successfully." + else + echo "-e error pushing the chart to acr path: ${destAcrFullPath} failed. Please review Ev2 pipeline logs for more details on the error." + exit 1 + fi +} + +# push the local release candidate chart to the canary region +push_local_chart_to_canary_region() { + destAcrFullPath=${1} + if [ -z $destAcrFullPath ]; then + echo "-e error dest acr path must be provided " + exit 1 + fi + + echo "save the chart locally with dest acr full path : ${destAcrFullPath} ..." + helm chart save charts/azuremonitor-containers/ $destAcrFullPath + if [ $? -eq 0 ]; then + echo "save the chart locally with dest acr full path : ${destAcrFullPath} completed." + else + echo "-e error save the chart locally with dest acr full path : ${destAcrFullPath} failed. Please review Ev2 pipeline logs for more details on the error." + exit 1 + fi + + echo "pushing the chart to acr path: ${destAcrFullPath} ..." + helm chart push $destAcrFullPath + if [ $? -eq 0 ]; then + echo "pushing the chart to acr path: ${destAcrFullPath} completed successfully." + else + echo "-e error pushing the chart to acr path: ${destAcrFullPath} failed.Please review Ev2 pipeline logs for more details on the error."
+ exit 1 + fi +} + +echo "START - Release stage : ${RELEASE_STAGE}" + +# login to acr +echo "Using acr : ${ACR_NAME}" +echo "Using acr repo type: ${REPO_TYPE}" + +echo "login to acr:${ACR_NAME} using helm ..." +echo $ACR_APP_SECRET | helm registry login $ACR_NAME --username $ACR_APP_ID --password-stdin +if [ $? -eq 0 ]; then + echo "login to acr:${ACR_NAME} using helm completed successfully." +else + echo "-e error login to acr:${ACR_NAME} using helm failed. Please review Ev2 pipeline logs for more details on the error." + exit 1 +fi + +case $RELEASE_STAGE in + + Canary) + echo "START: Release stage - Canary" + destAcrFullPath=${ACR_NAME}/public/${CANARY_REGION_REPO_PATH}:${CHART_VERSION} + push_local_chart_to_canary_region $destAcrFullPath + echo "END: Release stage - Canary" + ;; + + Pilot | Prod1) + echo "START: Release stage - Pilot" + srcMcrFullPath=${MCR_NAME}/${CANARY_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${PILOT_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - Pilot" + ;; + + LightLoad | Pord2) + echo "START: Release stage - Light Load Regions" + srcMcrFullPath=${MCR_NAME}/${PILOT_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${LIGHT_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - Light Load Regions" + ;; + + MediumLoad | Prod3) + echo "START: Release stage - Medium Load Regions" + srcMcrFullPath=${MCR_NAME}/${LIGHT_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${MEDIUM_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - Medium Load Regions" + ;; + + HighLoad | Prod4) + echo "START: Release stage - High Load Regions" + 
srcMcrFullPath=${MCR_NAME}/${MEDIUM_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${HIGH_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - High Load Regions" + ;; + + FF | Prod5) + echo "START: Release stage - FF" + srcMcrFullPath=${MCR_NAME}/${HIGH_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${FF_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - FF" + ;; + + MC | Prod6) + echo "START: Release stage - MC" + srcMcrFullPath=${MCR_NAME}/${FF_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${MC_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - MC" + ;; + + *) + echo -n "unknown release stage" + exit 1 + ;; +esac + +echo "END - Release stage : ${RELEASE_STAGE}" diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json b/deployment/arc-k8s-extension/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json new file mode 100644 index 000000000..71081661a --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json @@ -0,0 +1,159 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/ServiceModel.json", + "ContentVersion": "0.0.0.1", + "ServiceMetadata": { + "ServiceGroup": "ContainerInsightsExtension", + "Environment": "Prod" + }, + "ServiceResourceGroupDefinitions": [ + { + "Name": "ARC-Extension-ServiceResourceGroupDefinition", + "ServiceResourceDefinitions": [ + { + "Name": "ShellExtension", + "ComposedOf": { + "Extension": { + "Shell": [ + { + "type": "ShellExtensionType", + "properties": { + "imageName": "adm-ubuntu-1804-l", + "imageVersion": "v18" + } + } + ] + } + } + } + ] + } + ], + 
"ServiceResourceGroups": [ + { + "AzureResourceGroupName": "ContainerInsightsExtension-Canary-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "Canary" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-Canary", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": "ContainerInsightsExtension-Pilot-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "Pilot" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-Pilot", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": "ContainerInsightsExtension-LightLoad-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "LightLoad" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-LightLoad", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": "ContainerInsightsExtension-MediumLoad-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "MediumLoad" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-MediumLoad", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": 
"ContainerInsightsExtension-HighLoad-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "HighLoad" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-HighLoad", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": "ContainerInsightsExtension-FF-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "FF" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-FF", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": "ContainerInsightsExtension-MC-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "MC" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-MC", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + } + ] + } diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/buildver.txt b/deployment/arc-k8s-extension/ServiceGroupRoot/buildver.txt new file mode 100644 index 000000000..1921233b3 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/buildver.txt @@ -0,0 +1 @@ +1.0.0.0 diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index 58e09f041..543f270c1 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -42,6 +42,7 @@ data: # When the setting is set to false, only the kube events with !normal 
event type will be collected enabled = false # When this is enabled (enabled = true), all kube events including normal events will be collected + prometheus-data-collection-settings: |- # Custom Prometheus metrics data collection settings [prometheus_data_collection_settings.cluster] @@ -75,6 +76,17 @@ data: ## ex: monitor_kubernetes_pods_namespaces = ["default1", "default2", "default3"] # monitor_kubernetes_pods_namespaces = ["default1"] + ## Label selector to target pods which have the specified label + ## This will take effect when monitor_kubernetes_pods is set to true + ## Reference the docs at https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + # kubernetes_label_selector = "env=dev,app=nginx" + + ## Field selector to target pods which have the specified field + ## This will take effect when monitor_kubernetes_pods is set to true + ## Reference the docs at https://kubernetes.io/docs/concepts/overview/working-with-objects/field-selectors/ + ## eg. To scrape pods on a specific node + # kubernetes_field_selector = "spec.nodeName=$HOSTNAME" + [prometheus_data_collection_settings.node] # Node level scrape endpoint(s). These metrics will be scraped from agent's DaemonSet running in every node in the cluster # Any errors related to prometheus scraping can be found in the KubeMonAgentEvents table in the Log Analytics workspace that the cluster is sending data to. 
@@ -90,6 +102,15 @@ data: #fieldpass = ["metric_to_pass1", "metric_to_pass12"] #fielddrop = ["metric_to_drop"] + + metric_collection_settings: |- + # Metrics collection settings for metrics sent to Log Analytics and MDM + [metric_collection_settings.collect_kube_system_pv_metrics] + # In the absence of this configmap, default value for collect_kube_system_pv_metrics is false + # When the setting is set to false, only the persistent volume metrics outside the kube-system namespace will be collected + enabled = false + # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected + alertable-metrics-configuration-settings: |- # Alertable metrics configuration settings for container resource utilization [alertable_metrics_configuration_settings.container_resource_utilization_thresholds] @@ -100,6 +121,16 @@ data: container_memory_rss_threshold_percentage = 95.0 # Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage container_memory_working_set_threshold_percentage = 95.0 + + # Alertable metrics configuration settings for persistent volume utilization + [alertable_metrics_configuration_settings.pv_utilization_thresholds] + # Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or becomes equal to the following percentage + pv_usage_threshold_percentage = 60.0 + + # Alertable metrics configuration settings for completed jobs count + [alertable_metrics_configuration_settings.job_completion_threshold] + # Threshold for completed job count, metric will be sent only for those jobs which were completed earlier than the following threshold + job_completion_threshold_time_minutes = 360 integrations: |- [integrations.azure_network_policy_manager] collect_basic_metrics = false diff --git a/kubernetes/container-azm-ms-osmconfig.yaml
b/kubernetes/container-azm-ms-osmconfig.yaml new file mode 100644 index 000000000..05b7ac3ed --- /dev/null +++ b/kubernetes/container-azm-ms-osmconfig.yaml @@ -0,0 +1,17 @@ +kind: ConfigMap +apiVersion: v1 +data: + schema-version: + #string.used by agent to parse OSM config. supported versions are {v1}. Configs with other schema versions will be rejected by the agent. + v1 + config-version: + #string.used by OSM addon team to keep track of this config file's version in their source control/repository (max allowed 10 chars, other chars will be truncated) + ver1 + osm-metric-collection-configuration: |- + # OSM metric collection settings + [osm_metric_collection_configuration.settings] + # Namespaces to monitor + # monitor_namespaces = ["namespace1", "namespace2"] +metadata: + name: container-azm-ms-osmconfig + namespace: kube-system diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index 3fb9de084..253612556 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod09252020 +ARG IMAGE_TAG=ciprod05202021-hotfix ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi @@ -15,8 +15,9 @@ ENV HOST_VAR /hostfs/var ENV AZMON_COLLECT_ENV False ENV KUBE_CLIENT_BACKOFF_BASE 1 ENV KUBE_CLIENT_BACKOFF_DURATION 0 +ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* -COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs mdsd.xml envmdsd $tmpdir/ +COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs defaultpromenvvariables-sidecar mdsd.xml envmdsd $tmpdir/ 
WORKDIR ${tmpdir} # copy docker provider shell bundle to use the agent image diff --git a/kubernetes/linux/defaultpromenvvariables-rs b/kubernetes/linux/defaultpromenvvariables-rs index 1346e62b9..920f4e90e 100644 --- a/kubernetes/linux/defaultpromenvvariables-rs +++ b/kubernetes/linux/defaultpromenvvariables-rs @@ -1,7 +1,12 @@ -export AZMON_RS_PROM_INTERVAL="1m" -export AZMON_RS_PROM_MONITOR_PODS="monitor_kubernetes_pods = false" -export AZMON_RS_PROM_FIELDPASS="[]" -export AZMON_RS_PROM_FIELDDROP="[]" -export AZMON_RS_PROM_URLS="[]" -export AZMON_RS_PROM_K8S_SERVICES="[]" -export AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER="" +export AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL="1m" +export AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS="monitor_kubernetes_pods = false" +export AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE="pod_scrape_scope = 'cluster'" +export AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_URLS="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_K8S_SERVICES="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER="" +export AZMON_TELEGRAF_OSM_PROM_PLUGINS="" +export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR="kubernetes_label_selector = ''" +export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR="kubernetes_field_selector = ''" + diff --git a/kubernetes/linux/defaultpromenvvariables-sidecar b/kubernetes/linux/defaultpromenvvariables-sidecar new file mode 100644 index 000000000..3301488d8 --- /dev/null +++ b/kubernetes/linux/defaultpromenvvariables-sidecar @@ -0,0 +1,9 @@ +export AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL="1m" +export AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS="monitor_kubernetes_pods = false" +export AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE="pod_scrape_scope = 'node'" +export AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER="" +export 
AZMON_TELEGRAF_OSM_PROM_PLUGINS="" +export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR="kubernetes_label_selector = ''" +export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR="kubernetes_field_selector = ''" diff --git a/kubernetes/linux/dockerbuild/build-and-publish-docker-image.sh b/kubernetes/linux/dockerbuild/build-and-publish-docker-image.sh old mode 100644 new mode 100755 diff --git a/kubernetes/linux/envmdsd b/kubernetes/linux/envmdsd index e4886012e..3f834bfb8 100644 --- a/kubernetes/linux/envmdsd +++ b/kubernetes/linux/envmdsd @@ -12,3 +12,7 @@ export HOSTNAME_OVERRIDE="${NODE_NAME}" export MDSD_TCMALLOC_RELEASE_FREQ_SEC=1 export MDSD_COMPRESSION_ALGORITHM=LZ4 export SSL_CERT_DIR="/etc/ssl/certs" +# increase the size of msgpack items mdsd will accept, otherwise they will be silently dropped. These values were arbitrarily chosen to be 10 or 100 times larger than the defaults. +export MDSD_MSGPACK_ARRAY_SIZE_ITEMS=10000000 +export MDSD_MSGPACK_MAP_SIZE_ITEMS=10000000 +export MDSD_MSGPACK_NESTING_LEVEL=100 diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 311470660..c7d939034 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -1,8 +1,56 @@ #!/bin/bash +waitforlisteneronTCPport() { + local sleepdurationsecs=1 + local totalsleptsecs=0 + local port=$1 + local waittimesecs=$2 + local numeric='^[0-9]+$' + local varlistener="" + + if [ -z "$1" ] || [ -z "$2" ]; then + echo "${FUNCNAME[0]} called with incorrect arguments<$1 , $2>.
Required arguments <#port, #wait-time-in-seconds>" + return -1 + else + + if [[ $port =~ $numeric ]] && [[ $waittimesecs =~ $numeric ]]; then + #local varlistener=$(netstat -lnt | awk '$6 == "LISTEN" && $4 ~ ":25228$"') + while true + do + if [ $totalsleptsecs -gt $waittimesecs ]; then + echo "${FUNCNAME[0]} giving up waiting for listener on port:$port after $totalsleptsecs secs" + return 1 + fi + varlistener=$(netstat -lnt | awk '$6 == "LISTEN" && $4 ~ ":'"$port"'$"') + if [ -z "$varlistener" ]; then + #echo "${FUNCNAME[0]} waiting for $sleepdurationsecs more sec for listener on port:$port ..." + sleep $sleepdurationsecs + totalsleptsecs=$(($totalsleptsecs+1)) + else + echo "${FUNCNAME[0]} found listener on port:$port in $totalsleptsecs secs" + return 0 + fi + done + else + echo "${FUNCNAME[0]} called with non-numeric arguments<$1 , $2>. Required arguments <#port, #wait-time-in-seconds>" + return -1 + fi + fi +} + if [ -e "/etc/config/kube.conf" ]; then cat /etc/config/kube.conf > /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf +elif [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + echo "setting omsagent conf file for prometheus sidecar" + cat /etc/opt/microsoft/docker-cimprov/prometheus-side-car.conf > /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf + # omsadmin.sh replaces %MONITOR_AGENT_PORT% and %SYSLOG_PORT% in the monitor.conf and syslog.conf with default ports 25324 and 25224. + # Since we are running 2 omsagents in the same pod, we need to use a different port for the sidecar, + # else we will see the Address already in use - bind(2) for 0.0.0.0:253(2)24 error. + # Look into omsadmin.sh scripts's configure_monitor_agent()/configure_syslog() and find_available_port() methods for more info. 
+ sed -i -e 's/port %MONITOR_AGENT_PORT%/port 25326/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/monitor.conf + sed -i -e 's/port %SYSLOG_PORT%/port 25226/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/syslog.conf else + echo "setting omsagent conf file for daemonset" sed -i -e 's/bind 127.0.0.1/bind 0.0.0.0/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf fi sed -i -e 's/bind 127.0.0.1/bind 0.0.0.0/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/syslog.conf @@ -28,6 +76,12 @@ sudo setfacl -m user:omsagent:rwx /var/opt/microsoft/docker-cimprov/log #Run inotify as a daemon to track changes to the mounted configmap. inotifywait /etc/config/settings --daemon --recursive --outfile "/opt/inotifyoutput.txt" --event create,delete --format '%e : %T' --timefmt '+%s' +#Run inotify as a daemon to track changes to the mounted configmap for OSM settings. +if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || + ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then + inotifywait /etc/config/osm-settings --daemon --recursive --outfile "/opt/inotifyoutput-osm.txt" --event create,delete --format '%e : %T' --timefmt '+%s' +fi + #resourceid override for loganalytics data. if [ -z $AKS_RESOURCE_ID ]; then echo "not setting customResourceId" @@ -68,6 +122,24 @@ if [ -e "/etc/config/settings/config-version" ] && [ -s "/etc/config/settings/ echo "AZMON_AGENT_CFG_FILE_VERSION:$AZMON_AGENT_CFG_FILE_VERSION" fi +#set OSM config schema version +if [[ ( ( ! 
-e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || + ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then + if [ -e "/etc/config/osm-settings/schema-version" ] && [ -s "/etc/config/osm-settings/schema-version" ]; then + #trim + osm_config_schema_version="$(cat /etc/config/osm-settings/schema-version | xargs)" + #remove all spaces + osm_config_schema_version="${osm_config_schema_version//[[:space:]]/}" + #take first 10 characters + osm_config_schema_version="$(echo $osm_config_schema_version| cut -c1-10)" + + export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version + echo "export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version" >> ~/.bashrc + source ~/.bashrc + echo "AZMON_OSM_CFG_SCHEMA_VERSION:$AZMON_OSM_CFG_SCHEMA_VERSION" + fi +fi + export PROXY_ENDPOINT="" # Check for internet connectivity or workspace deletion @@ -150,69 +222,101 @@ else echo "LA Onboarding:Workspace Id not mounted, skipping the telemetry check" fi -#Parse the configmap to set the right environment variables. -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser.rb +# Set environment variable for if public cloud by checking the workspace domain. 
+if [ -z $domain ]; then + CLOUD_ENVIRONMENT="unknown" +elif [ $domain == "opinsights.azure.com" ]; then + CLOUD_ENVIRONMENT="public" +else + CLOUD_ENVIRONMENT="national" +fi +export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT +echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >> ~/.bashrc + +# Check if the instrumentation key needs to be fetched from a storage account (as in airgapped clouds) +if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSIGHTS_AUTH_URL has length >=1) + for BACKOFF in {1..4}; do + KEY=$(curl -sS $APPLICATIONINSIGHTS_AUTH_URL ) + # there's no easy way to get the HTTP status code from curl, so just check if the result is well formatted + if [[ $KEY =~ ^[A-Za-z0-9=]+$ ]]; then + break + else + sleep $((2**$BACKOFF / 4)) # (exponential backoff) + fi + done -cat config_env_var | while read line; do - #echo $line - echo $line >> ~/.bashrc -done -source config_env_var + # validate that the retrieved data is an instrumentation key + if [[ $KEY =~ ^[A-Za-z0-9=]+$ ]]; then + export APPLICATIONINSIGHTS_AUTH=$(echo $KEY) + echo "export APPLICATIONINSIGHTS_AUTH=$APPLICATIONINSIGHTS_AUTH" >> ~/.bashrc + echo "Using cloud-specific instrumentation key" + else + # no ikey can be retrieved. Disable telemetry and continue + export DISABLE_TELEMETRY=true + echo "export DISABLE_TELEMETRY=true" >> ~/.bashrc + echo "Could not get cloud-specific instrumentation key (network error?). Disabling telemetry" + fi +fi -#Parse the configmap to set the right environment variables for health feature.
-/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-health-config.rb +aikey=$(echo $APPLICATIONINSIGHTS_AUTH | base64 --decode) +export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey +echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc -cat health_config_env_var | while read line; do - #echo $line - echo $line >> ~/.bashrc -done -source health_config_env_var +source ~/.bashrc -#Parse the configmap to set the right environment variables for network policy manager (npm) integration. -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-npm-config.rb +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then + #Parse the configmap to set the right environment variables. + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser.rb -cat integration_npm_config_env_var | while read line; do - #echo $line - echo $line >> ~/.bashrc -done -source integration_npm_config_env_var + cat config_env_var | while read line; do + echo $line >> ~/.bashrc + done + source config_env_var +fi + +#Parse the configmap to set the right environment variables for agent config. +#Note > tomlparser-agent-config.rb has to be parsed first before td-agent-bit-conf-customizer.rb for fbit agent settings +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-agent-config.rb + + cat agent_config_env_var | while read line; do + #echo $line + echo $line >> ~/.bashrc + done + source agent_config_env_var + + #Parse the configmap to set the right environment variables for network policy manager (npm) integration. + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-npm-config.rb + + cat integration_npm_config_env_var | while read line; do + #echo $line + echo $line >> ~/.bashrc + done + source integration_npm_config_env_var +fi #Replace the placeholders in td-agent-bit.conf file for fluentbit with custom/default values in daemonset -if [ ! -e "/etc/config/kube.conf" ]; then +if [ ! 
-e "/etc/config/kube.conf" ] && [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /opt/microsoft/omsagent/ruby/bin/ruby td-agent-bit-conf-customizer.rb fi #Parse the prometheus configmap to create a file with new custom settings. /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-prom-customconfig.rb -#If config parsing was successful, a copy of the conf file with replaced custom settings file is created -if [ ! -e "/etc/config/kube.conf" ]; then - if [ -e "/opt/telegraf-test.conf" ]; then - echo "****************Start Telegraf in Test Mode**************************" - /opt/telegraf --config /opt/telegraf-test.conf -test - if [ $? -eq 0 ]; then - mv "/opt/telegraf-test.conf" "/etc/opt/microsoft/docker-cimprov/telegraf.conf" - fi - echo "****************End Telegraf Run in Test Mode**************************" - fi -else - if [ -e "/opt/telegraf-test-rs.conf" ]; then - echo "****************Start Telegraf in Test Mode**************************" - /opt/telegraf --config /opt/telegraf-test-rs.conf -test - if [ $? -eq 0 ]; then - mv "/opt/telegraf-test-rs.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" - fi - echo "****************End Telegraf Run in Test Mode**************************" - fi -fi - #Setting default environment variables to be used in any case of failure in the above steps if [ ! 
-e "/etc/config/kube.conf" ]; then - cat defaultpromenvvariables | while read line; do - echo $line >> ~/.bashrc - done - source defaultpromenvvariables + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + cat defaultpromenvvariables-sidecar | while read line; do + echo $line >> ~/.bashrc + done + source defaultpromenvvariables-sidecar + else + cat defaultpromenvvariables | while read line; do + echo $line >> ~/.bashrc + done + source defaultpromenvvariables + fi else cat defaultpromenvvariables-rs | while read line; do echo $line >> ~/.bashrc @@ -228,13 +332,37 @@ if [ -e "telemetry_prom_config_env_var" ]; then source telemetry_prom_config_env_var fi + #Parse the configmap to set the right environment variables for MDM metrics configuration for Alerting. -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-mdm-metrics-config.rb +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-mdm-metrics-config.rb -cat config_mdm_metrics_env_var | while read line; do - echo $line >> ~/.bashrc -done -source config_mdm_metrics_env_var + cat config_mdm_metrics_env_var | while read line; do + echo $line >> ~/.bashrc + done + source config_mdm_metrics_env_var + + #Parse the configmap to set the right environment variables for metric collection settings + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-metric-collection-config.rb + + cat config_metric_collection_env_var | while read line; do + echo $line >> ~/.bashrc + done + source config_metric_collection_env_var +fi + +# OSM scraping to be done in replicaset if sidecar car scraping is disabled and always do the scraping from the sidecar (It will always be either one of the two) +if [[ ( ( ! 
-e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || + ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-osm-config.rb + + if [ -e "integration_osm_config_env_var" ]; then + cat integration_osm_config_env_var | while read line; do + echo $line >> ~/.bashrc + done + source integration_osm_config_env_var + fi +fi #Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request echo "Making wget request to cadvisor endpoint with port 10250" @@ -292,11 +420,10 @@ fi echo "configured container runtime on kubelet is : "$CONTAINER_RUNTIME echo "export CONTAINER_RUNTIME="$CONTAINER_RUNTIME >> ~/.bashrc -# enable these metrics in next agent release -# export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="kubelet_runtime_operations_total" -# echo "export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC >> ~/.bashrc -# export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="kubelet_runtime_operations_errors_total" -# echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC >> ~/.bashrc +export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="kubelet_runtime_operations_total" +echo "export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC >> ~/.bashrc +export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="kubelet_runtime_operations_errors_total" +echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC >> ~/.bashrc # default to docker metrics export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_docker_operations" @@ -409,18 +536,147 @@ echo "DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >> ~/.bashrc +#region check to auto-activate oneagent, to route container 
logs, +#Intent is to activate one agent routing for all managed clusters with region in the region list, unless overridden by configmap +# AZMON_CONTAINER_LOGS_ROUTE will have route (if any) specified in the config map +# AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE will have the final route that we compute & set, based on our region list logic +echo "************start oneagent log routing checks************" +# by default, use configmap route for safer side +AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$AZMON_CONTAINER_LOGS_ROUTE + +#trim region list +oneagentregions="$(echo $AZMON_CONTAINERLOGS_ONEAGENT_REGIONS | xargs)" +#lowercase region list +typeset -l oneagentregions=$oneagentregions +echo "oneagent regions: $oneagentregions" +#trim current region +currentregion="$(echo $AKS_REGION | xargs)" +#lowercase current region +typeset -l currentregion=$currentregion +echo "current region: $currentregion" + +#initialize isoneagentregion as false +isoneagentregion=false + +#set isoneagentregion as true if matching region is found +if [ ! -z $oneagentregions ] && [ ! -z $currentregion ]; then + for rgn in $(echo $oneagentregions | sed "s/,/ /g"); do + if [ "$rgn" == "$currentregion" ]; then + isoneagentregion=true + echo "current region is in oneagent regions..." + break + fi + done +else + echo "current region is not in oneagent regions..." +fi + +if [ "$isoneagentregion" = true ]; then + #if configmap has a routing for logs, but current region is in the oneagent region list, take the configmap route + if [ ! -z $AZMON_CONTAINER_LOGS_ROUTE ]; then + AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$AZMON_CONTAINER_LOGS_ROUTE + echo "oneagent region is true for current region:$currentregion and config map logs route is not empty.
so using config map logs route as effective route:$AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE" + else #there is no configmap route, so route thru oneagent + AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE="v2" + echo "oneagent region is true for current region:$currentregion and config map logs route is empty. so using oneagent as effective route:$AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE" + fi +else + echo "oneagent region is false for current region:$currentregion" +fi + + +#start oneagent +if [ ! -e "/etc/config/kube.conf" ] && [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then + if [ ! -z $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE ]; then + echo "container logs configmap route is $AZMON_CONTAINER_LOGS_ROUTE" + echo "container logs effective route is $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE" + #trim + containerlogsroute="$(echo $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE | xargs)" + # convert to lowercase + typeset -l containerlogsroute=$containerlogsroute + + echo "setting AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE as :$containerlogsroute" + export AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$containerlogsroute + echo "export AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$containerlogsroute" >> ~/.bashrc + source ~/.bashrc + + if [ "$containerlogsroute" == "v2" ]; then + echo "activating oneagent..." + echo "configuring mdsd..." + cat /etc/mdsd.d/envmdsd | while read line; do + echo $line >> ~/.bashrc + done + source /etc/mdsd.d/envmdsd + + echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" + export CIWORKSPACE_id=$CIWORKSPACE_id + echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc + export CIWORKSPACE_key=$CIWORKSPACE_key + echo "export CIWORKSPACE_key=$CIWORKSPACE_key" >> ~/.bashrc + + source ~/.bashrc + + dpkg -l | grep mdsd | awk '{print $2 " " $3}' + + echo "starting mdsd ..." 
+ mdsd -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & + + touch /opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2 + fi + fi +fi +echo "************end oneagent log routing checks************" + +#If config parsing was successful, a copy of the conf file with replaced custom settings file is created +if [ ! -e "/etc/config/kube.conf" ]; then + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ] && [ -e "/opt/telegraf-test-prom-side-car.conf" ]; then + echo "****************Start Telegraf in Test Mode**************************" + /opt/telegraf --config /opt/telegraf-test-prom-side-car.conf -test + if [ $? -eq 0 ]; then + mv "/opt/telegraf-test-prom-side-car.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" + fi + echo "****************End Telegraf Run in Test Mode**************************" + else + if [ -e "/opt/telegraf-test.conf" ]; then + echo "****************Start Telegraf in Test Mode**************************" + /opt/telegraf --config /opt/telegraf-test.conf -test + if [ $? -eq 0 ]; then + mv "/opt/telegraf-test.conf" "/etc/opt/microsoft/docker-cimprov/telegraf.conf" + fi + echo "****************End Telegraf Run in Test Mode**************************" + fi + fi +else + if [ -e "/opt/telegraf-test-rs.conf" ]; then + echo "****************Start Telegraf in Test Mode**************************" + /opt/telegraf --config /opt/telegraf-test-rs.conf -test + if [ $? -eq 0 ]; then + mv "/opt/telegraf-test-rs.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" + fi + echo "****************End Telegraf Run in Test Mode**************************" + fi +fi + #telegraf & fluentbit requirements if [ ! 
-e "/etc/config/kube.conf" ]; then - if [ "$CONTAINER_RUNTIME" == "docker" ]; then - /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf -e /opt/td-agent-bit/bin/out_oms.so & - telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf.conf" + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + echo "starting fluent-bit and setting telegraf conf file for prometheus sidecar" + /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit-prom-side-car.conf -e /opt/td-agent-bit/bin/out_oms.so & + telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" else - echo "since container run time is $CONTAINER_RUNTIME update the container log fluentbit Parser to cri from docker" - sed -i 's/Parser.docker*/Parser cri/' /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf - /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf -e /opt/td-agent-bit/bin/out_oms.so & - telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf.conf" + echo "starting fluent-bit and setting telegraf conf file for daemonset" + if [ "$CONTAINER_RUNTIME" == "docker" ]; then + /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf -e /opt/td-agent-bit/bin/out_oms.so & + telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf.conf" + else + echo "since container run time is $CONTAINER_RUNTIME update the container log fluentbit Parser to cri from docker" + sed -i 's/Parser.docker*/Parser cri/' /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf + /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf -e /opt/td-agent-bit/bin/out_oms.so & + telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf.conf" + fi fi else + echo "starting fluent-bit and setting telegraf conf file for replicaset" /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf -e 
/opt/td-agent-bit/bin/out_oms.so & telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" fi @@ -471,11 +727,20 @@ echo "export HOST_ETC=/hostfs/etc" >> ~/.bashrc export HOST_VAR=/hostfs/var echo "export HOST_VAR=/hostfs/var" >> ~/.bashrc -aikey=$(echo $APPLICATIONINSIGHTS_AUTH | base64 --decode) -export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey -echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc - -source ~/.bashrc +if [ ! -e "/etc/config/kube.conf" ]; then + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + echo "checking for listener on tcp #25229 and waiting for 30 secs if not.." + waitforlisteneronTCPport 25229 30 + else + echo "checking for listener on tcp #25226 and waiting for 30 secs if not.." + waitforlisteneronTCPport 25226 30 + echo "checking for listener on tcp #25228 and waiting for 30 secs if not.." + waitforlisteneronTCPport 25228 30 + fi +else + echo "checking for listener on tcp #25226 and waiting for 30 secs if not.." + waitforlisteneronTCPport 25226 30 +fi #start telegraf /opt/telegraf --config $telegrafConfFile & @@ -484,37 +749,17 @@ dpkg -l | grep td-agent-bit | awk '{print $2 " " $3}' #dpkg -l | grep telegraf | awk '{print $2 " " $3}' -#start oneagent -if [ ! -e "/etc/config/kube.conf" ]; then - if [ ! -z $AZMON_CONTAINER_LOGS_ROUTE ]; then - echo "container logs route is defined as $AZMON_CONTAINER_LOGS_ROUTE" - #trim - containerlogsroute="$(echo $AZMON_CONTAINER_LOGS_ROUTE | xargs)" - # convert to lowercase - typeset -l containerlogsroute=$containerlogsroute - if [ "$containerlogsroute" == "v2" ]; then - echo "containerlogsroute $containerlogsroute" - echo "configuring mdsd..." 
- cat /etc/mdsd.d/envmdsd | while read line; do - echo $line >> ~/.bashrc - done - source /etc/mdsd.d/envmdsd - echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" - export CIWORKSPACE_id=$CIWORKSPACE_id - echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc - export CIWORKSPACE_key=$CIWORKSPACE_key - echo "export CIWORKSPACE_key=$CIWORKSPACE_key" >> ~/.bashrc - source ~/.bashrc +# Write messages from the liveness probe to stdout (so telemetry picks it up) +touch /dev/write-to-traces - dpkg -l | grep mdsd | awk '{print $2 " " $3}' - echo "starting mdsd ..." - mdsd -l -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & - fi - fi -fi +echo "stopping rsyslog..." +service rsyslog stop + +echo "getting rsyslog status..." +service rsyslog status shutdown() { /opt/microsoft/omsagent/bin/service_control stop diff --git a/kubernetes/linux/mdsd.xml b/kubernetes/linux/mdsd.xml index 76d2104fc..49d329791 100644 --- a/kubernetes/linux/mdsd.xml +++ b/kubernetes/linux/mdsd.xml @@ -48,20 +48,31 @@ --> - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + - + + @@ -97,15 +108,22 @@ priority events to be delivered sooner than the next five-minute interval. --> - - - - + + + + + + + + + - @@ -118,7 +136,16 @@ - ]]> + ]]> + + + + + + + + + ]]> diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 67a981dfa..f065cc165 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -2,8 +2,8 @@ TMPDIR="/opt" cd $TMPDIR #Download utf-8 encoding capability on the omsagent container. 
- -apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y locales +#upgrade apt to latest version +apt-get update && apt-get install -y apt && DEBIAN_FRONTEND=noninteractive apt-get install -y locales sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ dpkg-reconfigure --frontend=noninteractive locales && \ @@ -31,8 +31,8 @@ mv $TMPDIR/omsbundle* $TMPDIR/omsbundle /usr/bin/dpkg -i $TMPDIR/omsbundle/110/omsagent*.deb #/usr/bin/dpkg -i $TMPDIR/omsbundle/100/omsconfig*.deb -#install oneagent - Latest dev bits (7/17) -wget https://github.com/microsoft/Docker-Provider/releases/download/7172020-oneagent/azure-mdsd_1.5.124-build.develop.1294_x86_64.deb +#install oneagent - Official bits (05/2021) +wget https://github.com/microsoft/Docker-Provider/releases/download/05112021-oneagent/azure-mdsd_1.8.0-build.master.189_x86_64.deb /usr/bin/dpkg -i $TMPDIR/azure-mdsd*.deb cp -f $TMPDIR/mdsd.xml /etc/mdsd.d cp -f $TMPDIR/envmdsd /etc/mdsd.d @@ -60,7 +60,13 @@ sudo apt-get install libcap2-bin -y #service telegraf stop -wget https://github.com/microsoft/Docker-Provider/releases/download/5.0.0.0/telegraf +#wget https://github.com/microsoft/Docker-Provider/releases/download/5.0.0.0/telegraf + +#1.18 pre-release +wget https://dl.influxdata.com/telegraf/releases/telegraf-1.18.0_linux_amd64.tar.gz +tar -zxvf telegraf-1.18.0_linux_amd64.tar.gz + +mv /opt/telegraf-1.18.0/usr/bin/telegraf /opt/telegraf chmod 777 /opt/telegraf @@ -71,7 +77,7 @@ chmod 777 /opt/telegraf wget -qO - https://packages.fluentbit.io/fluentbit.key | sudo apt-key add - sudo echo "deb https://packages.fluentbit.io/ubuntu/xenial xenial main" >> /etc/apt/sources.list sudo apt-get update -sudo apt-get install td-agent-bit=1.4.2 -y +sudo apt-get install td-agent-bit=1.6.8 -y rm -rf $TMPDIR/omsbundle rm -f $TMPDIR/omsagent*.sh @@ -79,3 +85,7 @@ rm -f $TMPDIR/docker-cimprov*.sh rm -f $TMPDIR/azure-mdsd*.deb rm -f $TMPDIR/mdsd.xml rm -f $TMPDIR/envmdsd + +# Remove settings for cron.daily 
that conflict with the node's cron.daily. Since both are trying to rotate the same files +# in /var/log at the same time, the rotation doesn't happen correctly and then the *.1 file is forever logged to. +rm /etc/logrotate.d/alternatives /etc/logrotate.d/apt /etc/logrotate.d/azure-mdsd /etc/logrotate.d/rsyslog diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index eec4c6216..59119fd70 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -21,6 +21,7 @@ rules: "nodes/proxy", "namespaces", "services", + "persistentvolumes" ] verbs: ["list", "get", "watch"] - apiGroups: ["apps", "extensions", "autoscaling"] @@ -64,7 +65,14 @@ data: tag oms.containerinsights.KubePodInventory run_interval 60 log_level debug - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast + + + #Kubernetes Persistent Volume inventory + + type kubepvinventory + tag oms.containerinsights.KubePVInventory + run_interval 60 + log_level debug #Kubernetes events @@ -117,15 +125,13 @@ data: type filter_inventory2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast log_level info #custom_metrics_mdm filter plugin for perf data from windows nodes type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes log_level info @@ 
-137,7 +143,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer @@ -149,10 +155,25 @@ data: max_retry_wait 5m + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/state/out_oms_kubepv*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + + type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer @@ -182,7 +203,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer @@ -211,7 +232,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer @@ -337,17 +358,21 @@ spec: tier: node annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "10.0.0-6" + dockerProviderVersion: "15.2.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent + dnsConfig: + options: + - name: ndots + value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod09252020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021-hotfix" imagePullPolicy: IfNotPresent resources: limits: - cpu: 150m + cpu: 500m memory: 600Mi requests: cpu: 75m @@ -358,6 +383,9 @@ spec: value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION value: "VALUE_AKS_RESOURCE_REGION_VALUE" + # this used for e2e test and setting this just emits some additional log statements which used for the e2e tests + - name: ISTEST + value: "true" #Uncomment below two lines for ACS clusters and set the cluster names manually. 
Also comment out the above two lines for ACS clusters #- name: ACS_RESOURCE_NAME # value: "my_acs_cluster_name" @@ -370,6 +398,8 @@ spec: # Update this with the user assigned msi client id for omsagent - name: USER_ASSIGNED_IDENTITY_CLIENT_ID value: "" + - name: AZMON_CONTAINERLOGS_ONEAGENT_REGIONS + value: "koreacentral,norwayeast,eastus2" securityContext: privileged: true ports: @@ -413,13 +443,69 @@ spec: - /opt/livenessprobe.sh initialDelaySeconds: 60 periodSeconds: 60 + timeoutSeconds: 15 +#Only in sidecar scraping mode + - name: omsagent-prometheus + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021-hotfix" + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 500m + memory: 1Gi + requests: + cpu: 75m + memory: 225Mi + env: + # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these + - name: AKS_RESOURCE_ID + value: "VALUE_AKS_RESOURCE_ID_VALUE" + - name: AKS_REGION + value: "VALUE_AKS_RESOURCE_REGION_VALUE" + #Uncomment below two lines for ACS clusters and set the cluster names manually. 
Also comment out the above two lines for ACS clusters + #- name: ACS_RESOURCE_NAME + # value: "my_acs_cluster_name" + - name: CONTAINER_TYPE + value: "PrometheusSidecar" + - name: CONTROLLER_TYPE + value: "DaemonSet" + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + # Update this with the user assigned msi client id for omsagent + - name: USER_ASSIGNED_IDENTITY_CLIENT_ID + value: "" + securityContext: + privileged: true + volumeMounts: + - mountPath: /etc/kubernetes/host + name: azure-json-path + - mountPath: /etc/omsagent-secret + name: omsagent-secret + readOnly: true + - mountPath: /etc/config/settings + name: settings-vol-config + readOnly: true + - mountPath: /etc/config/osm-settings + name: osm-settings-vol-config + readOnly: true + livenessProbe: + exec: + command: + - /bin/bash + - -c + - /opt/livenessprobe.sh + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 15 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - labelSelector: matchExpressions: - - key: beta.kubernetes.io/os + # kubernetes.io/os label doesnt exist in k8s versions < 1.14 so make sure to choose label based on k8s version in aks yaml + - key: kubernetes.io/os operator: In values: - linux @@ -471,6 +557,10 @@ spec: secret: secretName: omsagent-adx-secret optional: true + - name: osm-settings-vol-config + configMap: + name: container-azm-ms-osmconfig + optional: true --- apiVersion: apps/v1 kind: Deployment @@ -493,27 +583,29 @@ spec: rsName: "omsagent-rs" annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "10.0.0-6" + dockerProviderVersion: "15.2.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod09252020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021-hotfix" imagePullPolicy: IfNotPresent resources: limits: cpu: 1 - memory: 750Mi + memory: 1Gi requests: cpu: 
150m memory: 250Mi env: - # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION value: "VALUE_AKS_RESOURCE_REGION_VALUE" + # this used for e2e test and setting this just emits some additional log statements which used for the e2e tests + - name: ISTEST + value: "true" # Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters #- name: ACS_RESOURCE_NAME # value: "my_acs_cluster_name" @@ -526,6 +618,9 @@ spec: # Update this with the user assigned msi client id for omsagent - name: USER_ASSIGNED_IDENTITY_CLIENT_ID value: "" + # Add the below environment variable to true only in sidecar enabled regions, else set it to false + - name: SIDECAR_SCRAPING_ENABLED + value: "true" securityContext: privileged: true ports: @@ -553,6 +648,8 @@ spec: readOnly: true - mountPath: /etc/config/settings/adx name: omsagent-adx-secret + - mountPath: /etc/config/osm-settings + name: osm-settings-vol-config readOnly: true livenessProbe: exec: @@ -562,8 +659,18 @@ spec: - /opt/livenessprobe.sh initialDelaySeconds: 60 periodSeconds: 60 + timeoutSeconds: 15 affinity: nodeAffinity: + # affinity to schedule on to ephemeral os node if its available + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: storageprofile + operator: NotIn + values: + - managed requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - labelSelector: @@ -616,6 +723,10 @@ spec: secret: secretName: omsagent-adx-secret optional: true + - name: osm-settings-vol-config + configMap: + name: container-azm-ms-osmconfig + optional: true --- apiVersion: apps/v1 kind: DaemonSet @@ -639,21 +750,22 @@ spec: tier: node-win annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "10.0.0-6" + dockerProviderVersion: "15.0.0-0" schema-versions: "v1" spec: serviceAccountName: 
omsagent + dnsConfig: + options: + - name: ndots + value: "3" containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod09252020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod04222021" imagePullPolicy: IfNotPresent resources: limits: - cpu: 150m + cpu: 200m memory: 600Mi - requests: - cpu: 75m - memory: 225Mi env: # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these - name: AKS_RESOURCE_ID @@ -668,10 +780,16 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName + - name: PODNAME + valueFrom: + fieldRef: + fieldPath: metadata.name - name: NODE_IP valueFrom: fieldRef: fieldPath: status.hostIP + - name: SIDECAR_SCRAPING_ENABLED + value: "true" volumeMounts: - mountPath: C:\ProgramData\docker\containers name: docker-windows-containers @@ -695,6 +813,7 @@ spec: - C:\opt\omsagentwindows\scripts\cmd\livenessProbe.cmd periodSeconds: 60 initialDelaySeconds: 180 + timeoutSeconds: 15 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -720,6 +839,7 @@ spec: - name: docker-windows-containers hostPath: path: C:\ProgramData\docker\containers + type: DirectoryOrCreate - name: settings-vol-config configMap: name: container-azm-ms-agentconfig @@ -745,14 +865,24 @@ spec: port: 25227 targetPort: in-rs-tcp --- -apiVersion: apiextensions.k8s.io/v1beta1 +# this is for versions >=1.19, for versions <1.19 we continue to use v1beta1 +apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: name: healthstates.azmon.container.insights namespace: kube-system spec: group: azmon.container.insights - version: v1 + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + properties: + state: + type: string scope: Namespaced names: plural: healthstates diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index de5e6fcde..fefd089a8 100644 --- 
a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,14 +3,14 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod09252020 +ARG IMAGE_TAG=win-ciprod04222021 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement RUN powershell -Command "Set-ExecutionPolicy Bypass -Scope Process -Force; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))" # Fluentd depends on cool.io whose fat gem is only available for Ruby < 2.5, so need to specify --platform ruby when install Ruby > 2.5 and install msys2 to get dev tools RUN choco install -y ruby --version 2.6.5.1 --params "'/InstallDir:C:\ruby26'" \ -&& choco install -y msys2 --version 20190524.0.0.20191030 --params "'/NoPath /NoUpdate /InstallDir:C:\ruby26\msys64'" \ +&& choco install -y msys2 --version 20200903.0.0 --params "'/NoPath /NoUpdate /InstallDir:C:\ruby26\msys64'" \ && choco install -y vim # gangams - optional MSYS2 update via ridk failing in merged docker file so skipping that since we dont need optional update @@ -47,6 +47,7 @@ RUN ./setup.ps1 COPY main.ps1 /opt/omsagentwindows/scripts/powershell COPY ./omsagentwindows/installer/scripts/filesystemwatcher.ps1 /opt/omsagentwindows/scripts/powershell COPY ./omsagentwindows/installer/scripts/livenessprobe.cmd /opt/omsagentwindows/scripts/cmd/ +COPY setdefaulttelegrafenvvariables.ps1 /opt/omsagentwindows/scripts/powershell # copy ruby scripts to /opt folder COPY ./omsagentwindows/installer/scripts/*.rb /opt/omsagentwindows/scripts/ruby/ @@ -62,6 +63,9 @@ COPY ./omsagentwindows/installer/conf/fluent-docker-parser.conf /etc/fluent/ COPY ./omsagentwindows/installer/conf/fluent-bit.conf /etc/fluent-bit COPY ./omsagentwindows/installer/conf/out_oms.conf /etc/omsagentwindows +# copy telegraf conf file +COPY ./omsagentwindows/installer/conf/telegraf.conf /etc/telegraf/ + # copy keepcert 
alive ruby scripts COPY ./omsagentwindows/installer/scripts/rubyKeepCertificateAlive/*.rb /etc/fluent/plugin/ diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index de82722ad..95cba2579 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -43,34 +43,32 @@ function Start-FileSystemWatcher { function Set-EnvironmentVariables { $domain = "opinsights.azure.com" + $cloud_environment = "public" if (Test-Path /etc/omsagent-secret/DOMAIN) { # TODO: Change to omsagent-secret before merging $domain = Get-Content /etc/omsagent-secret/DOMAIN + $cloud_environment = "national" } # Set DOMAIN [System.Environment]::SetEnvironmentVariable("DOMAIN", $domain, "Process") [System.Environment]::SetEnvironmentVariable("DOMAIN", $domain, "Machine") + # Set CLOUD_ENVIRONMENT + [System.Environment]::SetEnvironmentVariable("CLOUD_ENVIRONMENT", $cloud_environment, "Process") + [System.Environment]::SetEnvironmentVariable("CLOUD_ENVIRONMENT", $cloud_environment, "Machine") + $wsID = "" if (Test-Path /etc/omsagent-secret/WSID) { # TODO: Change to omsagent-secret before merging $wsID = Get-Content /etc/omsagent-secret/WSID } - # Set DOMAIN + # Set WSID [System.Environment]::SetEnvironmentVariable("WSID", $wsID, "Process") [System.Environment]::SetEnvironmentVariable("WSID", $wsID, "Machine") - $wsKey = "" - if (Test-Path /etc/omsagent-secret/KEY) { - # TODO: Change to omsagent-secret before merging - $wsKey = Get-Content /etc/omsagent-secret/KEY - } - - # Set KEY - [System.Environment]::SetEnvironmentVariable("WSKEY", $wsKey, "Process") - [System.Environment]::SetEnvironmentVariable("WSKEY", $wsKey, "Machine") + # Don't store WSKEY as environment variable $proxy = "" if (Test-Path /etc/omsagent-secret/PROXY) { @@ -121,10 +119,48 @@ function Set-EnvironmentVariables { $env:AZMON_AGENT_CFG_SCHEMA_VERSION } - # Set environment variable for TELEMETRY_APPLICATIONINSIGHTS_KEY - $aiKey = 
[System.Text.Encoding]::UTF8.GetString([System.Convert]::FromBase64String($env:APPLICATIONINSIGHTS_AUTH)) - [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKey, "Process") - [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKey, "Machine") + # Check if the instrumentation key needs to be fetched from a storage account (as in airgapped clouds) + $aiKeyURl = [System.Environment]::GetEnvironmentVariable('APPLICATIONINSIGHTS_AUTH_URL') + if ($aiKeyURl) { + $aiKeyFetched = "" + # retry up to 5 times + for( $i = 1; $i -le 4; $i++) { + try { + $response = Invoke-WebRequest -uri $aiKeyURl -UseBasicParsing -TimeoutSec 5 -ErrorAction:Stop + + if ($response.StatusCode -ne 200) { + Write-Host "Expecting reponse code 200, was: $($response.StatusCode), retrying" + Start-Sleep -Seconds ([MATH]::Pow(2, $i) / 4) + } + else { + $aiKeyFetched = $response.Content + break + } + } + catch { + Write-Host "Exception encountered fetching instrumentation key:" + Write-Host $_.Exception + } + } + + # Check if the fetched IKey was properly encoded. if not then turn off telemetry + if ($aiKeyFetched -match '^[A-Za-z0-9=]+$') { + Write-Host "Using cloud-specific instrumentation key" + [System.Environment]::SetEnvironmentVariable("APPLICATIONINSIGHTS_AUTH", $aiKeyFetched, "Process") + [System.Environment]::SetEnvironmentVariable("APPLICATIONINSIGHTS_AUTH", $aiKeyFetched, "Machine") + } + else { + # Couldn't fetch the Ikey, turn telemetry off + Write-Host "Could not get cloud-specific instrumentation key (network error?). 
Disabling telemetry" + [System.Environment]::SetEnvironmentVariable("DISABLE_TELEMETRY", "True", "Process") + [System.Environment]::SetEnvironmentVariable("DISABLE_TELEMETRY", "True", "Machine") + } + } + + $aiKeyDecoded = [System.Text.Encoding]::UTF8.GetString([System.Convert]::FromBase64String($env:APPLICATIONINSIGHTS_AUTH)) + [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKeyDecoded, "Process") + [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKeyDecoded, "Machine") + # run config parser ruby /opt/omsagentwindows/scripts/ruby/tomlparser.rb @@ -237,9 +273,9 @@ function Get-ContainerRuntime { return $containerRuntime } -function Start-Fluent { +function Start-Fluent-Telegraf { - # Run fluent-bit service first so that we do not miss any logs being forwarded by the fluentd service. + # Run fluent-bit service first so that we do not miss any logs being forwarded by the fluentd service and telegraf service. # Run fluent-bit as a background job. Switch this to a windows service once fluent-bit supports natively running as a windows service Start-Job -ScriptBlock { Start-Process -NoNewWindow -FilePath "C:\opt\fluent-bit\bin\fluent-bit.exe" -ArgumentList @("-c", "C:\etc\fluent-bit\fluent-bit.conf", "-e", "C:\opt\omsagentwindows\out_oms.so") } @@ -253,11 +289,96 @@ function Start-Fluent { (Get-Content -Path C:/etc/fluent/fluent.conf -Raw) -replace 'fluent-docker-parser.conf','fluent-cri-parser.conf' | Set-Content C:/etc/fluent/fluent.conf } + # Start telegraf only in sidecar scraping mode + $sidecarScrapingEnabled = [System.Environment]::GetEnvironmentVariable('SIDECAR_SCRAPING_ENABLED') + if (![string]::IsNullOrEmpty($sidecarScrapingEnabled) -and $sidecarScrapingEnabled.ToLower() -eq 'true') + { + Write-Host "Starting telegraf..." 
+ Start-Telegraf + } + fluentd --reg-winsvc i --reg-winsvc-auto-start --winsvc-name fluentdwinaks --reg-winsvc-fluentdopt '-c C:/etc/fluent/fluent.conf -o C:/etc/fluent/fluent.log' Notepad.exe | Out-Null } +function Start-Telegraf { + # Set default telegraf environment variables for prometheus scraping + Write-Host "**********Setting default environment variables for telegraf prometheus plugin..." + .\setdefaulttelegrafenvvariables.ps1 + + # run prometheus custom config parser + Write-Host "**********Running config parser for custom prometheus scraping**********" + ruby /opt/omsagentwindows/scripts/ruby/tomlparser-prom-customconfig.rb + Write-Host "**********End running config parser for custom prometheus scraping**********" + + + # Set required environment variable for telegraf prometheus plugin to run properly + Write-Host "Setting required environment variables for telegraf prometheus input plugin to run properly..." + $kubernetesServiceHost = [System.Environment]::GetEnvironmentVariable("KUBERNETES_SERVICE_HOST", "process") + if (![string]::IsNullOrEmpty($kubernetesServiceHost)) { + [System.Environment]::SetEnvironmentVariable("KUBERNETES_SERVICE_HOST", $kubernetesServiceHost, "machine") + Write-Host "Successfully set environment variable KUBERNETES_SERVICE_HOST - $($kubernetesServiceHost) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable KUBERNETES_SERVICE_HOST for target 'machine' since it is either null or empty" + } + + $kubernetesServicePort = [System.Environment]::GetEnvironmentVariable("KUBERNETES_SERVICE_PORT", "process") + if (![string]::IsNullOrEmpty($kubernetesServicePort)) { + [System.Environment]::SetEnvironmentVariable("KUBERNETES_SERVICE_PORT", $kubernetesServicePort, "machine") + Write-Host "Successfully set environment variable KUBERNETES_SERVICE_PORT - $($kubernetesServicePort) for target 'machine'..." 
+ } + else { + Write-Host "Failed to set environment variable KUBERNETES_SERVICE_PORT for target 'machine' since it is either null or empty" + } + + $nodeIp = [System.Environment]::GetEnvironmentVariable("NODE_IP", "process") + if (![string]::IsNullOrEmpty($nodeIp)) { + [System.Environment]::SetEnvironmentVariable("NODE_IP", $nodeIp, "machine") + Write-Host "Successfully set environment variable NODE_IP - $($nodeIp) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable NODE_IP for target 'machine' since it is either null or empty" + } + + Write-Host "Installing telegraf service" + C:\opt\telegraf\telegraf.exe --service install --config "C:\etc\telegraf\telegraf.conf" + + # Setting delay auto start for telegraf since there have been known issues with windows server and telegraf - + # https://github.com/influxdata/telegraf/issues/4081 + # https://github.com/influxdata/telegraf/issues/3601 + try { + $serverName = [System.Environment]::GetEnvironmentVariable("PODNAME", "process") + if (![string]::IsNullOrEmpty($serverName)) { + sc.exe \\$serverName config telegraf start= delayed-auto + Write-Host "Successfully set delayed start for telegraf" + + } else { + Write-Host "Failed to get environment variable PODNAME to set delayed telegraf start" + } + } + catch { + $e = $_.Exception + Write-Host $e + Write-Host "exception occured in delayed telegraf start.. continuing without exiting" + } + Write-Host "Running telegraf service in test mode" + C:\opt\telegraf\telegraf.exe --config "C:\etc\telegraf\telegraf.conf" --test + Write-Host "Starting telegraf service" + C:\opt\telegraf\telegraf.exe --service start + + # Trying to start telegraf again if it did not start due to fluent bit not being ready at startup + Get-Service telegraf | findstr Running + if ($? -eq $false) + { + Write-Host "trying to start telegraf in again in 30 seconds, since fluentbit might not have been ready..." 
+ Start-Sleep -s 30 + C:\opt\telegraf\telegraf.exe --service start + Get-Service telegraf + } +} + function Generate-Certificates { Write-Host "Generating Certificates" C:\\opt\\omsagentwindows\\certgenerator\\certificategenerator.exe @@ -288,16 +409,13 @@ Start-Transcript -Path main.txt Remove-WindowsServiceIfItExists "fluentdwinaks" Set-EnvironmentVariables Start-FileSystemWatcher + Generate-Certificates Test-CertificatePath -Start-Fluent +Start-Fluent-Telegraf # List all powershell processes running. This should have main.ps1 and filesystemwatcher.ps1 Get-WmiObject Win32_process | Where-Object { $_.Name -match 'powershell' } | Format-Table -Property Name, CommandLine, ProcessId #check if fluentd service is running Get-Service fluentdwinaks - - - - diff --git a/kubernetes/windows/setdefaulttelegrafenvvariables.ps1 b/kubernetes/windows/setdefaulttelegrafenvvariables.ps1 new file mode 100644 index 000000000..269894139 --- /dev/null +++ b/kubernetes/windows/setdefaulttelegrafenvvariables.ps1 @@ -0,0 +1,17 @@ +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL", "1m", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL", "1m", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS", "monitor_kubernetes_pods = false", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS", "monitor_kubernetes_pods = false", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE", "pod_scrape_scope = 'node'", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE", "pod_scrape_scope = 'node'", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS", "[]", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS", "[]", "machine") 
+[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP", "[]", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP", "[]", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER", " ", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER", " ", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR", "kubernetes_label_selector = ''", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR", "kubernetes_label_selector = ''", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR", "kubernetes_field_selector = ''", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR", "kubernetes_field_selector = ''", "machine") + diff --git a/kubernetes/windows/setup.ps1 b/kubernetes/windows/setup.ps1 index dd6d52a11..25aad5e16 100644 --- a/kubernetes/windows/setup.ps1 +++ b/kubernetes/windows/setup.ps1 @@ -8,10 +8,12 @@ Write-Host ('Creating folder structure') New-Item -Type Directory -Path /opt/fluent-bit New-Item -Type Directory -Path /opt/scripts/ruby + New-Item -Type Directory -Path /opt/telegraf New-Item -Type Directory -Path /etc/fluent-bit New-Item -Type Directory -Path /etc/fluent New-Item -Type Directory -Path /etc/omsagentwindows + New-Item -Type Directory -Path /etc/telegraf New-Item -Type Directory -Path /etc/config/settings/ New-Item -Type Directory -Path /etc/config/adx/ @@ -32,6 +34,20 @@ Write-Host ('Installing Fluent Bit'); } Write-Host ('Finished Installing Fluentbit') +Write-Host ('Installing Telegraf'); +try { + $telegrafUri='https://dl.influxdata.com/telegraf/releases/telegraf-1.18.0_windows_amd64.zip' + Invoke-WebRequest -Uri $telegrafUri -OutFile 
/installation/telegraf.zip + Expand-Archive -Path /installation/telegraf.zip -Destination /installation/telegraf + Move-Item -Path /installation/telegraf/*/* -Destination /opt/telegraf/ -ErrorAction SilentlyContinue +} +catch { + $ex = $_.Exception + Write-Host "exception while downloading telegraf for windows" + Write-Host $ex + exit 1 +} +Write-Host ('Finished downloading Telegraf') Write-Host ('Installing Visual C++ Redistributable Package') $vcRedistLocation = 'https://aka.ms/vs/16/release/vc_redist.x64.exe' diff --git a/scripts/build/windows/install-build-pre-requisites.ps1 b/scripts/build/windows/install-build-pre-requisites.ps1 index b5e6e2d18..3bb56ac2a 100755 --- a/scripts/build/windows/install-build-pre-requisites.ps1 +++ b/scripts/build/windows/install-build-pre-requisites.ps1 @@ -21,7 +21,7 @@ function Install-Go { # install go lang Write-Host("installing go ...") - Start-Process msiexec.exe -Wait -ArgumentList '/I ' + $output + '/quiet' + Start-Process msiexec.exe -Wait -ArgumentList '/I ', $output, '/quiet' Write-Host("installing go completed") Write-Host "updating PATH variable" @@ -102,7 +102,7 @@ function Install-DotNetCoreSDK() { # install dotNet core sdk Write-Host("installing .net core sdk 3.1 ...") - Start-Process msiexec.exe -Wait -ArgumentList '/I ' + $output + '/quiet' + Start-Process msiexec.exe -Wait -ArgumentList '/I ', $output, '/quiet' Write-Host("installing .net core sdk 3.1 completed") } @@ -129,7 +129,7 @@ function Install-Docker() { # install docker Write-Host("installing docker for desktop ...") - Start-Process msiexec.exe -Wait -ArgumentList '/I ' + $output + '/quiet' + Start-Process msiexec.exe -Wait -ArgumentList '/I ', $output, '/quiet' Write-Host("installing docker for desktop completed") } diff --git a/scripts/onboarding/aks/onboarding-using-azure-policy/azure-policy.json b/scripts/onboarding/aks/onboarding-using-azure-policy/azure-policy.json new file mode 100644 index 000000000..c68bfed17 --- /dev/null +++ 
b/scripts/onboarding/aks/onboarding-using-azure-policy/azure-policy.json @@ -0,0 +1,113 @@ +{ + "mode": "Indexed", + "policyRule": { + "if": { + "field": "type", + "equals": "Microsoft.ContainerService/managedClusters" + }, + "then": { + "effect": "deployIfNotExists", + "details": { + "type": "Microsoft.ContainerService/managedClusters", + "name": "[field('name')]", + "roleDefinitionIds": [ + "/providers/Microsoft.Authorization/roleDefinitions/ed7f3fbd-7b88-4dd4-9017-9adb7ce333f8", + "/providers/Microsoft.Authorization/roleDefinitions/92aaf0da-9dab-42b6-94a3-d43ce8d16293" + ], + "existenceCondition": { + "field": "Microsoft.ContainerService/managedClusters/addonProfiles.omsagent.enabled", + "equals": "true" + }, + "deployment": { + "properties": { + "mode": "incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "clusterName": { + "type": "string" + }, + "clusterResourceGroupName": { + "type": "string" + }, + "clusterLocation": { + "type": "string" + }, + "clusterTags": { + "type": "object" + }, + "workspaceResourceId": { + "type": "string" + } + }, + "resources": [ + { + "type": "Microsoft.Resources/deployments", + "name": "[Concat('aks-monitoring-policy', '-', uniqueString(parameters('clusterName')))]", + "apiVersion": "2019-05-01", + "properties": { + "mode": "Incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": {}, + "variables": {}, + "resources": [ + { + "name": "[parameters('clusterName')]", + "type": "Microsoft.ContainerService/managedClusters", + "location": "[parameters('clusterLocation')]", + "tags": "[parameters('clusterTags')]", + "apiVersion": "2018-03-31", + "properties": { + "mode": "Incremental", + "id": "[resourceId(parameters('clusterResourceGroupName'), 'Microsoft.ContainerService/managedClusters', 
parameters('clusterName'))]", + "addonProfiles": { + "omsagent": { + "enabled": true, + "config": { + "logAnalyticsWorkspaceResourceID": "[parameters('workspaceResourceId')]" + } + } + } + } + } + ] + } + } + } + ] + }, + "parameters": { + "clusterName": { + "value": "[field('name')]" + }, + "clusterResourceGroupName": { + "value": "[resourceGroup().name]" + }, + "clusterLocation": { + "value": "[field('location')]" + }, + "clusterTags": { + "value": "[field('tags')]" + }, + "workspaceResourceId": { + "value": "[parameters('workspaceResourceId')]" + } + } + } + } + } + } + }, + "parameters": { + "workspaceResourceId": { + "type": "String", + "metadata": { + "displayName": "Resource Id of the existing Azure Log Analytics Workspace", + "description": "Azure Monitor Log Analytics Resource ID" + } + } + } +} diff --git a/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.parameters.json b/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.parameters.json new file mode 100644 index 000000000..6281cdade --- /dev/null +++ b/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.parameters.json @@ -0,0 +1,9 @@ +{ + "workspaceResourceId": { + "type": "string", + "metadata": { + "displayName": "Resource Id of the existing Azure Log Analytics Workspace", + "description": "Azure Monitor Log Analytics Resource ID" + } + } +} diff --git a/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.rules.json b/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.rules.json new file mode 100644 index 000000000..a113441ce --- /dev/null +++ b/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.rules.json @@ -0,0 +1,101 @@ +{ + "if": { + "field": "type", + "equals": "Microsoft.ContainerService/managedClusters" + }, + "then": { + "effect": "deployIfNotExists", + "details": { + "type": "Microsoft.ContainerService/managedClusters", + "name": "[field('name')]", + "roleDefinitionIds": [ + 
"/providers/Microsoft.Authorization/roleDefinitions/ed7f3fbd-7b88-4dd4-9017-9adb7ce333f8", + "/providers/Microsoft.Authorization/roleDefinitions/92aaf0da-9dab-42b6-94a3-d43ce8d16293" + ], + "existenceCondition": { + "field": "Microsoft.ContainerService/managedClusters/addonProfiles.omsagent.enabled", + "equals": "true" + }, + "deployment": { + "properties": { + "mode": "incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "clusterName": { + "type": "string" + }, + "clusterResourceGroupName": { + "type": "string" + }, + "clusterLocation": { + "type": "string" + }, + "clusterTags": { + "type": "object" + }, + "workspaceResourceId": { + "type": "string" + } + }, + "resources": [ + { + "type": "Microsoft.Resources/deployments", + "name": "[Concat('aks-monitoring-policy', '-', uniqueString(parameters('clusterName')))]", + "apiVersion": "2019-05-01", + "properties": { + "mode": "Incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": {}, + "variables": {}, + "resources": [ + { + "name": "[parameters('clusterName')]", + "type": "Microsoft.ContainerService/managedClusters", + "location": "[parameters('clusterLocation')]", + "tags": "[parameters('clusterTags')]", + "apiVersion": "2018-03-31", + "properties": { + "mode": "Incremental", + "id": "[resourceId(parameters('clusterResourceGroupName'), 'Microsoft.ContainerService/managedClusters', parameters('clusterName'))]", + "addonProfiles": { + "omsagent": { + "enabled": true, + "config": { + "logAnalyticsWorkspaceResourceID": "[parameters('workspaceResourceId')]" + } + } + } + } + } + ] + } + } + } + ] + }, + "parameters": { + "clusterName": { + "value": "[field('name')]" + }, + "clusterResourceGroupName": { + "value": "[resourceGroup().name]" + }, + "clusterLocation": { + "value": 
"[field('location')]" + }, + "clusterTags": { + "value": "[field('tags')]" + }, + "workspaceResourceId": { + "value": "[parameters('workspaceResourceId')]" + } + } + } + } + } + } +} diff --git a/scripts/onboarding/clusteruser/cluster-user-role-binding.yaml b/scripts/onboarding/clusteruser/cluster-user-role-binding.yaml new file mode 100644 index 000000000..fce2fc582 --- /dev/null +++ b/scripts/onboarding/clusteruser/cluster-user-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: containerHealth-read-logs-global +roleRef: + kind: ClusterRole + name: containerHealth-log-reader + apiGroup: rbac.authorization.k8s.io +subjects: + - kind: User + name: clusterUser + apiGroup: rbac.authorization.k8s.io diff --git a/scripts/onboarding/clusteruser/cluster-user-role.yaml b/scripts/onboarding/clusteruser/cluster-user-role.yaml new file mode 100644 index 000000000..b3519fdd3 --- /dev/null +++ b/scripts/onboarding/clusteruser/cluster-user-role.yaml @@ -0,0 +1,14 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: containerHealth-log-reader +rules: + - apiGroups: ["", "metrics.k8s.io", "extensions", "apps"] + resources: + - "pods/log" + - "events" + - "nodes" + - "pods" + - "deployments" + - "replicasets" + verbs: ["get", "list"] diff --git a/scripts/onboarding/enable-monitoring-using-policy.md b/scripts/onboarding/enable-monitoring-using-policy.md new file mode 100644 index 000000000..e1e395ecc --- /dev/null +++ b/scripts/onboarding/enable-monitoring-using-policy.md @@ -0,0 +1,64 @@ +# How to enable AKS Monitoring Addon via Azure Policy +This doc describes how to enable AKS Monitoring Addon using Azure Custom Policy.Monitoring Addon Custom Policy can be assigned +either at subscription or resource group scope. 
If Azure Log Analytics workspace and AKS cluster are in different subscriptions then Managed Identity used by Policy assignment has to have required role permissions on both the subscriptions or at least on the resource of the Azure Log Analytics workspace. Similarly, if the policy is scoped to Resource Group, then Managed Identity should have required role permissions on the Log Analytics workspace if the workspace is not in the selected Resource Group scope. + +Monitoring Addon requires the following roles on the Managed Identity used by Azure Policy + - [azure-kubernetes-service-contributor-role](https://docs.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#azure-kubernetes-service-contributor-role) + - [log-analytics-contributor](https://docs.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#log-analytics-contributor) + +## Create and Assign Policy definition using Azure Portal + +### Create Policy Definition + +1. Download the Azure Custom Policy definition to enable AKS Monitoring Addon +``` sh + curl -o azurepolicy.json -L https://aka.ms/aks-enable-monitoring-custom-policy +``` +2. Navigate to https://portal.azure.com/#blade/Microsoft_Azure_Policy/PolicyMenuBlade/Definitions and create policy definition with the following details in the Policy definition create dialogue box + + - Pick any Azure Subscription where you want to store Policy Definition + - Name - '(Preview)AKS-Monitoring-Addon' + - Description - 'Azure Custom Policy to enable Monitoring Addon onto Azure Kubernetes Cluster(s) in specified scope' + - Category - Choose "use existing" and pick 'Kubernetes' from drop down + - Remove the existing sample rules and copy the contents of azurepolicy.json downloaded in step #1 above + +### Assign Policy Definition to Specified Scope + +> Note: Managed Identity will be created automatically and assigned specified roles in the Policy definition. + +3.
Navigate to https://portal.azure.com/#blade/Microsoft_Azure_Policy/PolicyMenuBlade/Definitions and select the Policy Definition '(Preview)AKS-Monitoring-Addon' +4. Click an Assignment and select Scope, Exclusions (if any) +5. Provide the Resource Id of the Azure Log Analytics Workspace. The Resource Id should be in this format `/subscriptions/<subscriptionId>/resourceGroups/<resourceGroupName>/providers/Microsoft.OperationalInsights/workspaces/<workspaceName>` +6. Create a Remediation task if you want to apply the policy to existing AKS clusters in the selected scope +7. Click the Review & Create option to create Policy Assignment + +## Create and Assign Policy definition using Azure CLI + +### Create Policy Definition + +1. Download the Azure Custom Policy definition rules and parameters files + ``` sh + curl -o azurepolicy.rules.json -L https://aka.ms/aks-enable-monitoring-custom-policy-rules + curl -o azurepolicy.parameters.json -L https://aka.ms/aks-enable-monitoring-custom-policy-parameters + ``` +2. Create policy definition using below command + + ``` sh + az cloud set -n # set the Azure cloud + az login # login to cloud environment + az account set -s + az policy definition create --name "(Preview)AKS-Monitoring-Addon" --display-name "(Preview)AKS-Monitoring-Addon" --mode Indexed --metadata version=1.0.0 category=Kubernetes --rules azurepolicy.rules.json --params azurepolicy.parameters.json + ``` +### Assign Policy Definition to Specified Scope + +3.
Create policy assignment + +``` sh +az policy assignment create --name aks-monitoring-addon --policy "(Preview)AKS-Monitoring-Addon" --assign-identity --identity-scope /subscriptions/ --role Contributor --scope /subscriptions/ --location --role Contributor --scope /subscriptions/ -p "{ \"workspaceResourceId\": { \"value\": \"/subscriptions//resourcegroups//providers/microsoft.operationalinsights/workspaces/\" } }" +``` + +## References +- https://docs.microsoft.com/en-us/azure/governance/policy/ +- https://docs.microsoft.com/en-us/azure/governance/policy/how-to/remediate-resources#how-remediation-security-works +- https://docs.microsoft.com/en-us/cli/azure/install-azure-cli +- https://docs.microsoft.com/en-us/azure/azure-monitor/insights/container-insights-overview \ No newline at end of file diff --git a/scripts/onboarding/managed/disable-monitoring.ps1 b/scripts/onboarding/managed/disable-monitoring.ps1 index 8945f90b6..bcd135dba 100644 --- a/scripts/onboarding/managed/disable-monitoring.ps1 +++ b/scripts/onboarding/managed/disable-monitoring.ps1 @@ -1,12 +1,12 @@ <# .DESCRIPTION - Disables Azure Monitor for containers to monitoring enabled Azure Managed K8s cluster such as Azure Arc K8s, ARO v4 and AKS etc. + Disables Azure Monitor for containers to monitoring enabled Azure Managed K8s cluster such as Azure Arc enabled Kubernetes, ARO v4 and AKS etc. 1. Deletes the existing Azure Monitor for containers helm release 2. Deletes logAnalyticsWorkspaceResourceId tag on the provided Managed cluster .PARAMETER clusterResourceId - Id of the Azure Managed Cluster such as Azure ARC K8s, ARO v4 etc. + Id of the Azure Managed Cluster such as Azure Arc enabled Kubernetes, ARO v4 etc. 
.PARAMETER servicePrincipalClientId client Id of the service principal which will be used for the azure login .PARAMETER servicePrincipalClientSecret @@ -15,10 +15,12 @@ tenantId of the service principal which will be used for the azure login .PARAMETER kubeContext (optional) kube-context of the k8 cluster to install Azure Monitor for containers HELM chart + .PARAMETER azureCloudName (optional) + Name of the Azure cloud name. Supported Azure cloud Name is AzureCloud or AzureUSGovernment Pre-requisites: - Azure Managed cluster Resource Id - - Contributor role permission on the Subscription of the Azure Arc Cluster + - Contributor role permission on the Subscription of the Azure Arc enabled Kubernetes Cluster - Helm v3.0.0 or higher https://github.com/helm/helm/releases - kube-context of the K8s cluster Note: 1. Please make sure you have all the pre-requisistes before running this script. @@ -34,7 +36,9 @@ param( [Parameter(mandatory = $false)] [string]$tenantId, [Parameter(mandatory = $false)] - [string]$kubeContext + [string]$kubeContext, + [Parameter(mandatory = $false)] + [string]$azureCloudName ) $helmChartReleaseName = "azmon-containers-release-1" @@ -46,6 +50,21 @@ $isAksCluster = $false $isAroV4Cluster = $false $isUsingServicePrincipal = $false +if ([string]::IsNullOrEmpty($azureCloudName) -eq $true) { + Write-Host("Azure cloud name parameter not passed in so using default cloud as AzureCloud") + $azureCloudName = "AzureCloud" +} else { + if(($azureCloudName.ToLower() -eq "azurecloud" ) -eq $true) { + Write-Host("Specified Azure Cloud name is : $azureCloudName") + } elseif (($azureCloudName.ToLower() -eq "azureusgovernment" ) -eq $true) { + Write-Host("Specified Azure Cloud name is : $azureCloudName") + } else { + Write-Host("Specified Azure Cloud name is : $azureCloudName") + Write-Host("Only supported Azure clouds are : AzureCloud and AzureUSGovernment") + exit + } +} + # checks the required Powershell modules exist and if not exists, request the user 
permission to install $azAccountModule = Get-Module -ListAvailable -Name Az.Accounts $azResourcesModule = Get-Module -ListAvailable -Name Az.Resources @@ -226,14 +245,19 @@ Write-Host("Cluster SubscriptionId : '" + $clusterSubscriptionId + "' ") -Foregr if ($isUsingServicePrincipal) { $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId,$spSecret - Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId + Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId -Environment $azureCloudName } try { Write-Host("") Write-Host("Trying to get the current Az login context...") $account = Get-AzContext -ErrorAction Stop - Write-Host("Successfully fetched current AzContext context...") -ForegroundColor Green + $ctxCloud = $account.Environment.Name + if(($azureCloudName.ToLower() -eq $ctxCloud.ToLower() ) -eq $false) { + Write-Host("Specified azure cloud name is not same as current context cloud hence setting account to null to retrigger the login" ) -ForegroundColor Green + $account = $null + } + Write-Host("Successfully fetched current AzContext context and azure cloud name: $azureCloudName" ) -ForegroundColor Green Write-Host("") } catch { @@ -249,10 +273,10 @@ if ($null -eq $account.Account) { if ($isUsingServicePrincipal) { $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId,$spSecret - Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId + Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId -Environment $azureCloudName } else { 
Write-Host("Please login...") - Connect-AzAccount -subscriptionid $clusterSubscriptionId + Connect-AzAccount -subscriptionid $clusterSubscriptionId -Environment $azureCloudName } } catch { @@ -298,7 +322,7 @@ if ($isArcK8sCluster -eq $true) { # validate identity $clusterIdentity = $clusterResource.identity.type.ToString().ToLower() if ($clusterIdentity.Contains("systemassigned") -eq $false) { - Write-Host("Identity of Azure Arc K8s cluster should be systemassigned but it has identity: $clusterIdentity") -ForegroundColor Red + Write-Host("Identity of Azure Arc enabled Kubernetes cluster should be systemassigned but it has identity: $clusterIdentity") -ForegroundColor Red exit } } @@ -354,7 +378,3 @@ catch { } Write-Host("Successfully disabled Azure Monitor for containers for cluster: $clusteResourceId") -ForegroundColor Green - - - - diff --git a/scripts/onboarding/managed/disable-monitoring.sh b/scripts/onboarding/managed/disable-monitoring.sh index f20bd7d33..29b755331 100644 --- a/scripts/onboarding/managed/disable-monitoring.sh +++ b/scripts/onboarding/managed/disable-monitoring.sh @@ -26,10 +26,10 @@ set -o pipefail # default release name used during onboarding releaseName="azmon-containers-release-1" -# resource type for azure arc clusters +# resource type for Azure Arc enabled Kubernetes clusters resourceProvider="Microsoft.Kubernetes/connectedClusters" -# resource provider for azure arc connected cluster +# resource provider for Azure Arc enabled Kubernetes cluster arcK8sResourceProvider="Microsoft.Kubernetes/connectedClusters" # resource provider for azure redhat openshift v4 cluster aroV4ResourceProvider="Microsoft.RedHatOpenShift/OpenShiftClusters" @@ -125,13 +125,13 @@ remove_monitoring_tags() echo "set the cluster subscription id: ${clusterSubscriptionId}" az account set -s ${clusterSubscriptionId} - # validate cluster identity for ARC k8s cluster + # validate cluster identity for Azure Arc enabled Kubernetes cluster if [ "$isArcK8sCluster" = true ] ; 
then - identitytype=$(az resource show -g ${clusterResourceGroup} -n ${clusterName} --resource-type $resourceProvider --query identity.type) + identitytype=$(az resource show -g ${clusterResourceGroup} -n ${clusterName} --resource-type $resourceProvider --query identity.type -o json) identitytype=$(echo $identitytype | tr "[:upper:]" "[:lower:]" | tr -d '"') echo "cluster identity type:" $identitytype if [[ "$identitytype" != "systemassigned" ]]; then - echo "-e only supported cluster identity is systemassigned for Azure ARC K8s cluster type" + echo "-e only supported cluster identity is systemassigned for Azure Arc enabled Kubernetes cluster type" exit 1 fi fi @@ -257,7 +257,7 @@ done # detect the resource provider from the provider name in the cluster resource id if [ $providerName = "microsoft.kubernetes/connectedclusters" ]; then - echo "provider cluster resource is of Azure ARC K8s cluster type" + echo "provider cluster resource is of Azure Arc enabled Kubernetes cluster type" isArcK8sCluster=true resourceProvider=$arcK8sResourceProvider elif [ $providerName = "microsoft.redhatopenshift/openshiftclusters" ]; then @@ -280,10 +280,27 @@ done } +validate_and_configure_supported_cloud() { + echo "get active azure cloud name configured to azure cli" + azureCloudName=$(az cloud show --query name -o tsv | tr "[:upper:]" "[:lower:]") + echo "active azure cloud name configured to azure cli: ${azureCloudName}" + if [ "$isArcK8sCluster" = true ]; then + if [ "$azureCloudName" != "azurecloud" -a "$azureCloudName" != "azureusgovernment" ]; then + echo "-e only supported clouds are AzureCloud and AzureUSGovernment for Azure Arc enabled Kubernetes cluster type" + exit 1 + fi + else + # For ARO v4, only supported cloud is public so just configure to public to keep the existing behavior + configure_to_public_cloud + fi +} # parse args parse_args $@ +# validate and configure azure cloud +validate_and_configure_supported_cloud + # parse cluster resource id 
clusterSubscriptionId="$(echo $clusterResourceId | cut -d'/' -f3 | tr "[:upper:]" "[:lower:]")" clusterResourceGroup="$(echo $clusterResourceId | cut -d'/' -f5)" diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1 index 338de6cbc..828d061ac 100644 --- a/scripts/onboarding/managed/enable-monitoring.ps1 +++ b/scripts/onboarding/managed/enable-monitoring.ps1 @@ -1,14 +1,14 @@ <# .DESCRIPTION - Onboards Azure Monitor for containers to Azure Managed Kuberenetes such as Azure Arc K8s, ARO v4 and AKS etc. + Onboards Azure Monitor for containers to Azure Managed Kuberenetes such as Azure Arc enabled Kubernetes, ARO v4 and AKS etc. 1. Creates the Default Azure log analytics workspace if doesn't exist one in specified subscription 2. Adds the ContainerInsights solution to the Azure log analytics workspace 3. Adds the workspaceResourceId tag or enable addon (if the cluster is AKS) on the provided Managed cluster resource id 4. Installs Azure Monitor for containers HELM chart to the K8s cluster in provided via --kube-context .PARAMETER clusterResourceId - Id of the Azure Managed Cluster such as Azure ARC K8s, ARO v4 etc. + Id of the Azure Managed Cluster such as Azure Arc enabled Kubernetes, ARO v4 etc. .PARAMETER servicePrincipalClientId Client Id of the service principal which will be used for the azure login .PARAMETER servicePrincipalClientSecret @@ -22,10 +22,8 @@ .PARAMETER proxyEndpoint (optional) Provide Proxy endpoint if you have K8s cluster behind the proxy and would like to route Azure Monitor for containers outbound traffic via proxy. Format of the proxy endpoint should be http(s://:@: - .PARAMETER helmRepoName (optional) - helm repo name. should be used only for the private preview features - .PARAMETER helmRepoUrl (optional) - helm repo url. should be used only for the private preview features + .PARAMETER azureCloudName (optional) + Name of the Azure cloud name. 
Supported Azure cloud Name is AzureCloud or AzureUSGovernment Pre-requisites: - Azure Managed cluster Resource Id @@ -52,27 +50,40 @@ param( [Parameter(mandatory = $false)] [string]$proxyEndpoint, [Parameter(mandatory = $false)] - [string]$helmRepoName, - [Parameter(mandatory = $false)] - [string]$helmRepoUrl + [string]$azureCloudName ) -$solutionTemplateUri= "https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_dev/scripts/onboarding/templates/azuremonitor-containerSolution.json" +$solutionTemplateUri = "https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_dev/scripts/onboarding/templates/azuremonitor-containerSolution.json" $helmChartReleaseName = "azmon-containers-release-1" $helmChartName = "azuremonitor-containers" -$helmChartRepoName = "incubator" -$helmChartRepoUrl = "https://kubernetes-charts-incubator.storage.googleapis.com/" + # flags to indicate the cluster types $isArcK8sCluster = $false -$isAksCluster = $false +$isAksCluster = $false $isUsingServicePrincipal = $false -if([string]::IsNullOrEmpty($helmRepoName) -eq $false){ - $helmChartRepoName = $helmRepoName -} +# released chart version in mcr +$mcr = "mcr.microsoft.com" +$mcrChartVersion = "2.8.3" +$mcrChartRepoPath = "azuremonitor/containerinsights/preview/azuremonitor-containers" +$helmLocalRepoName = "." 
+$omsAgentDomainName="opinsights.azure.com" -if([string]::IsNullOrEmpty($helmRepoUrl) -eq $false){ - $helmChartRepoUrl = $helmRepoUrl +if ([string]::IsNullOrEmpty($azureCloudName) -eq $true) { + Write-Host("Azure cloud name parameter not passed in so using default cloud as AzureCloud") + $azureCloudName = "AzureCloud" +} else { + if(($azureCloudName.ToLower() -eq "azurecloud" ) -eq $true) { + Write-Host("Specified Azure Cloud name is : $azureCloudName") + $omsAgentDomainName="opinsights.azure.com" + } elseif (($azureCloudName.ToLower() -eq "azureusgovernment" ) -eq $true) { + Write-Host("Specified Azure Cloud name is : $azureCloudName") + $omsAgentDomainName="opinsights.azure.us" + } else { + Write-Host("Specified Azure Cloud name is : $azureCloudName") + Write-Host("Only supported azure clouds are : AzureCloud and AzureUSGovernment") + exit + } } # checks the required Powershell modules exist and if not exists, request the user permission to install @@ -200,7 +211,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - } if ([string]::IsNullOrEmpty($clusterResourceId)) { - Write-Host("Specified Azure Arc ClusterResourceId should not be NULL or empty") -ForegroundColor Red + Write-Host("Specified Azure Arc enabled Kubernetes ClusterResourceId should not be NULL or empty") -ForegroundColor Red exit } @@ -220,30 +231,31 @@ if ($clusterResourceId.StartsWith("/") -eq $false) { $clusterResourceId = "/" + $clusterResourceId } -if ($clusterResourceId.Split("/").Length -ne 9){ - Write-Host("Provided Cluster Resource Id is not in expected format") -ForegroundColor Red +if ($clusterResourceId.Split("/").Length -ne 9) { + Write-Host("Provided Cluster Resource Id is not in expected format") -ForegroundColor Red exit } if (($clusterResourceId.ToLower().Contains("microsoft.kubernetes/connectedclusters") -ne $true) -and ($clusterResourceId.ToLower().Contains("microsoft.redhatopenshift/openshiftclusters") -ne $true) -and 
($clusterResourceId.ToLower().Contains("microsoft.containerservice/managedclusters") -ne $true) - ) { +) { Write-Host("Provided cluster ResourceId is not supported cluster type: $clusterResourceId") -ForegroundColor Red exit } -if(([string]::IsNullOrEmpty($servicePrincipalClientId) -eq $false) -and - ([string]::IsNullOrEmpty($servicePrincipalClientSecret) -eq $false) -and - ([string]::IsNullOrEmpty($tenantId) -eq $false)) { - Write-Host("Using service principal creds for the azure login since these provided.") - $isUsingServicePrincipal = $true +if (([string]::IsNullOrEmpty($servicePrincipalClientId) -eq $false) -and + ([string]::IsNullOrEmpty($servicePrincipalClientSecret) -eq $false) -and + ([string]::IsNullOrEmpty($tenantId) -eq $false)) { + Write-Host("Using service principal creds for the azure login since these provided.") + $isUsingServicePrincipal = $true } if ($clusterResourceId.ToLower().Contains("microsoft.kubernetes/connectedclusters") -eq $true) { - $isArcK8sCluster = $true -} elseif ($clusterResourceId.ToLower().Contains("microsoft.containerservice/managedclusters") -eq $true) { - $isAksCluster = $true + $isArcK8sCluster = $true +} +elseif ($clusterResourceId.ToLower().Contains("microsoft.containerservice/managedclusters") -eq $true) { + $isAksCluster = $true } $resourceParts = $clusterResourceId.Split("/") @@ -253,15 +265,20 @@ Write-Host("Cluster SubscriptionId : '" + $clusterSubscriptionId + "' ") -Foregr if ($isUsingServicePrincipal) { $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force - $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId,$spSecret - Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId + $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId, $spSecret + Connect-AzAccount -ServicePrincipal -Credential 
$spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId -Environment $azureCloudName } try { Write-Host("") Write-Host("Trying to get the current Az login context...") $account = Get-AzContext -ErrorAction Stop - Write-Host("Successfully fetched current AzContext context...") -ForegroundColor Green + $ctxCloud = $account.Environment.Name + if(($azureCloudName.ToLower() -eq $ctxCloud.ToLower() ) -eq $false) { + Write-Host("Specified azure cloud name is not same as current context cloud hence setting account to null to retrigger the login" ) -ForegroundColor Green + $account = $null + } + Write-Host("Successfully fetched current AzContext context and azure cloud name: $azureCloudName" ) -ForegroundColor Green Write-Host("") } catch { @@ -275,12 +292,14 @@ if ($null -eq $account.Account) { try { if ($isUsingServicePrincipal) { $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force - $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId,$spSecret - Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId - } else { - Write-Host("Please login...") - Connect-AzAccount -subscriptionid $clusterSubscriptionId - } + $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId, $spSecret + + Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId -Environment $azureCloudName + } + else { + Write-Host("Please login...") + Connect-AzAccount -subscriptionid $clusterSubscriptionId -Environment $azureCloudName + } } catch { Write-Host("") @@ -322,12 +341,12 @@ if ($null -eq $clusterResource) { $clusterRegion = $clusterResource.Location.ToLower() if ($isArcK8sCluster -eq $true) { - # validate identity - $clusterIdentity = $clusterResource.identity.type.ToString().ToLower() - if 
($clusterIdentity.contains("systemassigned") -eq $false) { - Write-Host("Identity of Azure Arc K8s cluster should be systemassigned but it has identity: $clusterIdentity") -ForegroundColor Red - exit - } + # validate identity + $clusterIdentity = $clusterResource.identity.type.ToString().ToLower() + if ($clusterIdentity.contains("systemassigned") -eq $false) { + Write-Host("Identity of Azure Arc enabled Kubernetes cluster should be systemassigned but it has identity: $clusterIdentity") -ForegroundColor Red + exit + } } if ([string]::IsNullOrEmpty($workspaceResourceId)) { @@ -389,7 +408,8 @@ if ([string]::IsNullOrEmpty($workspaceResourceId)) { "westeurope" = "westeurope" ; "westindia" = "centralindia" ; "westus" = "westus" ; - "westus2" = "westus2" + "westus2" = "westus2"; + "usgovvirginia" = "usgovvirginia" } $workspaceRegionCode = "EUS" @@ -514,7 +534,8 @@ if ($account.Subscription.Id -eq $clusterSubscriptionId) { if ($isAksCluster -eq $true) { Write-Host ("Enabling AKS Monitoring Addon ..") # TBD -} else { +} +else { Write-Host("Attaching workspaceResourceId tag on the cluster ResourceId") $clusterResource.Tags["logAnalyticsWorkspaceResourceId"] = $WorkspaceInformation.ResourceId Set-AzResource -Tag $clusterResource.Tags -ResourceId $clusterResource.ResourceId -Force @@ -526,20 +547,30 @@ Write-Host "Helm version" : $helmVersion Write-Host("Installing or upgrading if exists, Azure Monitor for containers HELM chart ...") try { - Write-Host("Adding $helmChartRepoName repo to helm: $helmChartRepoUrl") - helm repo add $helmChartRepoName $helmChartRepoUrl - Write-Host("updating helm repo to get latest version of charts") - helm repo update - $helmParameters = "omsagent.secret.wsid=$workspaceGUID,omsagent.secret.key=$workspacePrimarySharedKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion" - if([string]::IsNullOrEmpty($proxyEndpoint) -eq $false) { + Write-Host("pull the chart from mcr.microsoft.com") + 
[System.Environment]::SetEnvironmentVariable("HELM_EXPERIMENTAL_OCI", 1, "Process") + + Write-Host("pull the chart from mcr.microsoft.com") + helm chart pull ${mcr}/${mcrChartRepoPath}:${mcrChartVersion} + + Write-Host("export the chart from local cache to current directory") + helm chart export ${mcr}/${mcrChartRepoPath}:${mcrChartVersion} --destination . + + $helmChartRepoPath = "${helmLocalRepoName}" + "/" + "${helmChartName}" + + Write-Host("helmChartRepoPath is : ${helmChartRepoPath}") + + $helmParameters = "omsagent.domain=$omsAgentDomainName,omsagent.secret.wsid=$workspaceGUID,omsagent.secret.key=$workspacePrimarySharedKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion" + if ([string]::IsNullOrEmpty($proxyEndpoint) -eq $false) { Write-Host("using proxy endpoint since its provided") $helmParameters = $helmParameters + ",omsagent.proxy=$proxyEndpoint" } if ([string]::IsNullOrEmpty($kubeContext)) { - helm upgrade --install $helmChartReleaseName --set $helmParameters $helmChartRepoName/$helmChartName - } else { - Write-Host("using provided kube-context: $kubeContext") - helm upgrade --install $helmChartReleaseName --set $helmParameters $helmChartRepoName/$helmChartName --kube-context $kubeContext + helm upgrade --install $helmChartReleaseName --set $helmParameters $helmChartRepoPath + } + else { + Write-Host("using provided kube-context: $kubeContext") + helm upgrade --install $helmChartReleaseName --set $helmParameters $helmChartRepoPath --kube-context $kubeContext } } catch { @@ -548,7 +579,3 @@ catch { Write-Host("Successfully enabled Azure Monitor for containers for cluster: $clusterResourceId") -ForegroundColor Green Write-Host("Proceed to https://aka.ms/azmon-containers to view your newly onboarded Azure Managed cluster") -ForegroundColor Green - - - - diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index 4142dbf6c..f27f944fd 100644 --- 
a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -38,12 +38,16 @@ set -e set -o pipefail -# default to public cloud since only supported cloud is azure public clod +# default to public cloud since only supported cloud is azure public cloud defaultAzureCloud="AzureCloud" - -# helm repo details -helmRepoName="incubator" -helmRepoUrl="https://kubernetes-charts-incubator.storage.googleapis.com/" +# default domain will be for public cloud +omsAgentDomainName="opinsights.azure.com" + +# released chart version in mcr +mcrChartVersion="2.8.3" +mcr="mcr.microsoft.com" +mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" +helmLocalRepoName="." helmChartName="azuremonitor-containers" # default release name used during onboarding @@ -58,19 +62,18 @@ aroV4ResourceProvider="Microsoft.RedHatOpenShift/OpenShiftClusters" # resource provider for aks cluster aksResourceProvider="Microsoft.ContainerService/managedClusters" -# default of resourceProvider is arc k8s and this will get updated based on the provider cluster resource +# default of resourceProvider is Azure Arc enabled Kubernetes and this will get updated based on the provider cluster resource resourceProvider="Microsoft.Kubernetes/connectedClusters" - # resource type for azure log analytics workspace workspaceResourceProvider="Microsoft.OperationalInsights/workspaces" # openshift project name for aro v4 cluster openshiftProjectName="azure-monitor-for-containers" -# arc k8s cluster resource +# AROv4 cluster resource isAroV4Cluster=false -# arc k8s cluster resource +# Azure Arc enabled Kubernetes cluster resource isArcK8sCluster=false # aks cluster resource @@ -103,28 +106,25 @@ servicePrincipalClientSecret="" servicePrincipalTenantId="" isUsingServicePrincipal=false -usage() -{ - local basename=`basename $0` - echo - echo "Enable Azure Monitor for containers:" - echo "$basename --resource-id [--client-id ] [--client-secret ] [--tenant-id 
] [--kube-context ] [--workspace-id ] [--proxy ]" +usage() { + local basename=$(basename $0) + echo + echo "Enable Azure Monitor for containers:" + echo "$basename --resource-id [--client-id ] [--client-secret ] [--tenant-id ] [--kube-context ] [--workspace-id ] [--proxy ]" } -parse_args() -{ +parse_args() { - if [ $# -le 1 ] - then + if [ $# -le 1 ]; then usage exit 1 - fi + fi -# Transform long options to short ones -for arg in "$@"; do - shift - case "$arg" in - "--resource-id") set -- "$@" "-r" ;; + # Transform long options to short ones + for arg in "$@"; do + shift + case "$arg" in + "--resource-id") set -- "$@" "-r" ;; "--kube-context") set -- "$@" "-k" ;; "--workspace-id") set -- "$@" "-w" ;; "--proxy") set -- "$@" "-p" ;; @@ -134,130 +134,128 @@ for arg in "$@"; do "--helm-repo-name") set -- "$@" "-n" ;; "--helm-repo-url") set -- "$@" "-u" ;; "--container-log-volume") set -- "$@" "-v" ;; - "--"*) usage ;; - *) set -- "$@" "$arg" - esac -done + "--"*) usage ;; + *) set -- "$@" "$arg" ;; + esac + done -local OPTIND opt + local OPTIND opt -while getopts 'hk:r:w:p:c:s:t:n:u:v:' opt; do + while getopts 'hk:r:w:p:c:s:t:n:u:v:' opt; do case "$opt" in - h) + h) + usage + ;; + + k) + kubeconfigContext="$OPTARG" + echo "name of kube-context is $OPTARG" + ;; + + r) + clusterResourceId="$OPTARG" + echo "clusterResourceId is $OPTARG" + ;; + + w) + workspaceResourceId="$OPTARG" + echo "workspaceResourceId is $OPTARG" + ;; + + p) + proxyEndpoint="$OPTARG" + echo "proxyEndpoint is $OPTARG" + ;; + + c) + servicePrincipalClientId="$OPTARG" + echo "servicePrincipalClientId is $OPTARG" + ;; + + s) + servicePrincipalClientSecret="$OPTARG" + echo "clientSecret is *****" + ;; + + t) + servicePrincipalTenantId="$OPTARG" + echo "service principal tenantId is $OPTARG" + ;; + + n) + helmRepoName="$OPTARG" + echo "helm repo name is $OPTARG" + ;; + + u) + helmRepoUrl="$OPTARG" + echo "helm repo url is $OPTARG" + ;; + + v) + containerLogVolume="$OPTARG" + echo "container log volume is 
$OPTARG" + ;; + + ?) usage - ;; - - k) - kubeconfigContext="$OPTARG" - echo "name of kube-context is $OPTARG" - ;; - - r) - clusterResourceId="$OPTARG" - echo "clusterResourceId is $OPTARG" - ;; - - w) - workspaceResourceId="$OPTARG" - echo "workspaceResourceId is $OPTARG" - ;; - - p) - proxyEndpoint="$OPTARG" - echo "proxyEndpoint is $OPTARG" - ;; - - c) - servicePrincipalClientId="$OPTARG" - echo "servicePrincipalClientId is $OPTARG" - ;; - - s) - servicePrincipalClientSecret="$OPTARG" - echo "clientSecret is *****" - ;; - - t) - servicePrincipalTenantId="$OPTARG" - echo "service principal tenantId is $OPTARG" - ;; - - n) - helmRepoName="$OPTARG" - echo "helm repo name is $OPTARG" - ;; - - u) - helmRepoUrl="$OPTARG" - echo "helm repo url is $OPTARG" - ;; - - v) - containerLogVolume="$OPTARG" - echo "container log volume is $OPTARG" - ;; - - ?) - usage - exit 1 - ;; + exit 1 + ;; esac done - shift "$(($OPTIND -1))" + shift "$(($OPTIND - 1))" + local subscriptionId="$(echo ${clusterResourceId} | cut -d'/' -f3)" + local resourceGroup="$(echo ${clusterResourceId} | cut -d'/' -f5)" - local subscriptionId="$(echo ${clusterResourceId} | cut -d'/' -f3)" - local resourceGroup="$(echo ${clusterResourceId} | cut -d'/' -f5)" + # get resource parts and join back to get the provider name + local providerNameResourcePart1="$(echo ${clusterResourceId} | cut -d'/' -f7)" + local providerNameResourcePart2="$(echo ${clusterResourceId} | cut -d'/' -f8)" + local providerName="$(echo ${providerNameResourcePart1}/${providerNameResourcePart2})" - # get resource parts and join back to get the provider name - local providerNameResourcePart1="$(echo ${clusterResourceId} | cut -d'/' -f7)" - local providerNameResourcePart2="$(echo ${clusterResourceId} | cut -d'/' -f8)" - local providerName="$(echo ${providerNameResourcePart1}/${providerNameResourcePart2} )" + local clusterName="$(echo ${clusterResourceId} | cut -d'/' -f9)" - local clusterName="$(echo ${clusterResourceId} | cut -d'/' -f9)" + # 
convert to lowercase for validation + providerName=$(echo $providerName | tr "[:upper:]" "[:lower:]") - # convert to lowercase for validation - providerName=$(echo $providerName | tr "[:upper:]" "[:lower:]") + echo "cluster SubscriptionId:" $subscriptionId + echo "cluster ResourceGroup:" $resourceGroup + echo "cluster ProviderName:" $providerName + echo "cluster Name:" $clusterName - echo "cluster SubscriptionId:" $subscriptionId - echo "cluster ResourceGroup:" $resourceGroup - echo "cluster ProviderName:" $providerName - echo "cluster Name:" $clusterName - - if [ -z "$subscriptionId" -o -z "$resourceGroup" -o -z "$providerName" -o -z "$clusterName" ]; then + if [ -z "$subscriptionId" -o -z "$resourceGroup" -o -z "$providerName" -o -z "$clusterName" ]; then echo "-e invalid cluster resource id. Please try with valid fully qualified resource id of the cluster" exit 1 - fi + fi - if [[ $providerName != microsoft.* ]]; then - echo "-e invalid azure cluster resource id format." - exit 1 - fi + if [[ $providerName != microsoft.* ]]; then + echo "-e invalid azure cluster resource id format." 
+ exit 1 + fi - # detect the resource provider from the provider name in the cluster resource id - # detect the resource provider from the provider name in the cluster resource id - if [ $providerName = "microsoft.kubernetes/connectedclusters" ]; then - echo "provider cluster resource is of Azure ARC K8s cluster type" + # detect the resource provider from the provider name in the cluster resource id + if [ $providerName = "microsoft.kubernetes/connectedclusters" ]; then + echo "provider cluster resource is of Azure Arc enabled Kubernetes cluster type" isArcK8sCluster=true resourceProvider=$arcK8sResourceProvider - elif [ $providerName = "microsoft.redhatopenshift/openshiftclusters" ]; then + elif [ $providerName = "microsoft.redhatopenshift/openshiftclusters" ]; then echo "provider cluster resource is of AROv4 cluster type" resourceProvider=$aroV4ResourceProvider isAroV4Cluster=true - elif [ $providerName = "microsoft.containerservice/managedclusters" ]; then + elif [ $providerName = "microsoft.containerservice/managedclusters" ]; then echo "provider cluster resource is of AKS cluster type" isAksCluster=true resourceProvider=$aksResourceProvider - else - echo "-e unsupported azure managed cluster type" - exit 1 - fi + else + echo "-e unsupported azure managed cluster type" + exit 1 + fi - if [ -z "$kubeconfigContext" ]; then + if [ -z "$kubeconfigContext" ]; then echo "using or getting current kube config context since --kube-context parameter not set " - fi + fi -if [ ! -z "$workspaceResourceId" ]; then + if [ ! -z "$workspaceResourceId" ]; then local workspaceSubscriptionId="$(echo $workspaceResourceId | cut -d'/' -f3)" local workspaceResourceGroup="$(echo $workspaceResourceId | cut -d'/' -f5)" local workspaceProviderName="$(echo $workspaceResourceId | cut -d'/' -f7)" @@ -269,13 +267,13 @@ if [ ! 
-z "$workspaceResourceId" ]; then echo "workspace ProviderName:" $workspaceName echo "workspace Name:" $workspaceName - if [[ $workspaceProviderName != microsoft.operationalinsights* ]]; then - echo "-e invalid azure log analytics resource id format." - exit 1 - fi -fi + if [[ $workspaceProviderName != microsoft.operationalinsights* ]]; then + echo "-e invalid azure log analytics resource id format." + exit 1 + fi + fi -if [ ! -z "$proxyEndpoint" ]; then + if [ ! -z "$proxyEndpoint" ]; then # Validate Proxy Endpoint URL # extract the protocol:// proto="$(echo $proxyEndpoint | grep :// | sed -e's,^\(.*://\).*,\1,g')" @@ -302,42 +300,58 @@ if [ ! -z "$proxyEndpoint" ]; then else echo "successfully validated provided proxy endpoint is valid and in expected format" fi -fi + fi -if [ ! -z "$servicePrincipalClientId" -a ! -z "$servicePrincipalClientSecret" -a ! -z "$servicePrincipalTenantId" ]; then - echo "using service principal creds (clientId, secret and tenantId) for azure login since provided" - isUsingServicePrincipal=true -fi + if [ ! -z "$servicePrincipalClientId" -a ! -z "$servicePrincipalClientSecret" -a ! 
-z "$servicePrincipalTenantId" ]; then + echo "using service principal creds (clientId, secret and tenantId) for azure login since provided" + isUsingServicePrincipal=true + fi } -configure_to_public_cloud() -{ +validate_and_configure_supported_cloud() { + echo "get active azure cloud name configured to azure cli" + azureCloudName=$(az cloud show --query name -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") + echo "active azure cloud name configured to azure cli: ${azureCloudName}" + if [ "$isArcK8sCluster" = true ]; then + if [ "$azureCloudName" != "azurecloud" -a "$azureCloudName" != "azureusgovernment" ]; then + echo "-e only supported clouds are AzureCloud and AzureUSGovernment for Azure Arc enabled Kubernetes cluster type" + exit 1 + fi + if [ "$azureCloudName" = "azureusgovernment" ]; then + echo "setting omsagent domain as opinsights.azure.us since the azure cloud is azureusgovernment " + omsAgentDomainName="opinsights.azure.us" + fi + else + # For ARO v4, only supported cloud is public so just configure to public to keep the existing behavior + configure_to_public_cloud + fi +} + +configure_to_public_cloud() { echo "Set AzureCloud as active cloud for az cli" az cloud set -n $defaultAzureCloud } -validate_cluster_identity() -{ +validate_cluster_identity() { echo "validating cluster identity" local rgName="$(echo ${1})" local clusterName="$(echo ${2})" - local identitytype=$(az resource show -g ${rgName} -n ${clusterName} --resource-type $resourceProvider --query identity.type) - identitytype=$(echo $identitytype | tr "[:upper:]" "[:lower:]" | tr -d '"') + local identitytype=$(az resource show -g ${rgName} -n ${clusterName} --resource-type $resourceProvider --query identity.type -o json) + identitytype=$(echo $identitytype | tr "[:upper:]" "[:lower:]" | tr -d '"' | tr -d "[:space:]") echo "cluster identity type:" $identitytype if [[ "$identitytype" != "systemassigned" ]]; then - echo "-e only supported cluster identity is systemassigned for Azure ARC 
K8s cluster type" - exit 1 + echo "-e only supported cluster identity is systemassigned for Azure Arc enabled Kubernetes cluster type" + exit 1 fi echo "successfully validated the identity of the cluster" } -create_default_log_analytics_workspace() -{ +create_default_log_analytics_workspace() { # extract subscription from cluster resource id local subscriptionId="$(echo $clusterResourceId | cut -d'/' -f3)" @@ -348,73 +362,73 @@ create_default_log_analytics_workspace() # mapping fors for default Azure Log Analytics workspace declare -A AzureCloudLocationToOmsRegionCodeMap=( - [australiasoutheast]=ASE - [australiaeast]=EAU - [australiacentral]=CAU - [canadacentral]=CCA - [centralindia]=CIN - [centralus]=CUS - [eastasia]=EA - [eastus]=EUS - [eastus2]=EUS2 - [eastus2euap]=EAP - [francecentral]=PAR - [japaneast]=EJP - [koreacentral]=SE - [northeurope]=NEU - [southcentralus]=SCUS - [southeastasia]=SEA - [uksouth]=SUK - [usgovvirginia]=USGV - [westcentralus]=EUS - [westeurope]=WEU - [westus]=WUS - [westus2]=WUS2 + [australiasoutheast]=ASE + [australiaeast]=EAU + [australiacentral]=CAU + [canadacentral]=CCA + [centralindia]=CIN + [centralus]=CUS + [eastasia]=EA + [eastus]=EUS + [eastus2]=EUS2 + [eastus2euap]=EAP + [francecentral]=PAR + [japaneast]=EJP + [koreacentral]=SE + [northeurope]=NEU + [southcentralus]=SCUS + [southeastasia]=SEA + [uksouth]=SUK + [usgovvirginia]=USGV + [westcentralus]=EUS + [westeurope]=WEU + [westus]=WUS + [westus2]=WUS2 ) declare -A AzureCloudRegionToOmsRegionMap=( - [australiacentral]=australiacentral - [australiacentral2]=australiacentral - [australiaeast]=australiaeast - [australiasoutheast]=australiasoutheast - [brazilsouth]=southcentralus - [canadacentral]=canadacentral - [canadaeast]=canadacentral - [centralus]=centralus - [centralindia]=centralindia - [eastasia]=eastasia - [eastus]=eastus - [eastus2]=eastus2 - [francecentral]=francecentral - [francesouth]=francecentral - [japaneast]=japaneast - [japanwest]=japaneast - 
[koreacentral]=koreacentral - [koreasouth]=koreacentral - [northcentralus]=eastus - [northeurope]=northeurope - [southafricanorth]=westeurope - [southafricawest]=westeurope - [southcentralus]=southcentralus - [southeastasia]=southeastasia - [southindia]=centralindia - [uksouth]=uksouth - [ukwest]=uksouth - [westcentralus]=eastus - [westeurope]=westeurope - [westindia]=centralindia - [westus]=westus - [westus2]=westus2 + [australiacentral]=australiacentral + [australiacentral2]=australiacentral + [australiaeast]=australiaeast + [australiasoutheast]=australiasoutheast + [brazilsouth]=southcentralus + [canadacentral]=canadacentral + [canadaeast]=canadacentral + [centralus]=centralus + [centralindia]=centralindia + [eastasia]=eastasia + [eastus]=eastus + [eastus2]=eastus2 + [francecentral]=francecentral + [francesouth]=francecentral + [japaneast]=japaneast + [japanwest]=japaneast + [koreacentral]=koreacentral + [koreasouth]=koreacentral + [northcentralus]=eastus + [northeurope]=northeurope + [southafricanorth]=westeurope + [southafricawest]=westeurope + [southcentralus]=southcentralus + [southeastasia]=southeastasia + [southindia]=centralindia + [uksouth]=uksouth + [ukwest]=uksouth + [westcentralus]=eastus + [westeurope]=westeurope + [westindia]=centralindia + [westus]=westus + [westus2]=westus2 + [usgovvirginia]=usgovvirginia ) - if [ -n "${AzureCloudRegionToOmsRegionMap[$clusterRegion]}" ]; - then + echo "cluster Region:"$clusterRegion + if [ -n "${AzureCloudRegionToOmsRegionMap[$clusterRegion]}" ]; then workspaceRegion=${AzureCloudRegionToOmsRegionMap[$clusterRegion]} fi echo "Workspace Region:"$workspaceRegion - if [ -n "${AzureCloudLocationToOmsRegionCodeMap[$workspaceRegion]}" ]; - then + if [ -n "${AzureCloudLocationToOmsRegionCodeMap[$workspaceRegion]}" ]; then workspaceRegionCode=${AzureCloudLocationToOmsRegionCodeMap[$workspaceRegion]} fi echo "Workspace Region Code:"$workspaceRegionCode @@ -423,30 +437,29 @@ create_default_log_analytics_workspace() 
isRGExists=$(az group exists -g $workspaceResourceGroup) workspaceName="DefaultWorkspace-"$subscriptionId"-"$workspaceRegionCode - if $isRGExists - then echo "using existing default resource group:"$workspaceResourceGroup + if $isRGExists; then + echo "using existing default resource group:"$workspaceResourceGroup else echo "creating resource group: $workspaceResourceGroup in region: $workspaceRegion" az group create -g $workspaceResourceGroup -l $workspaceRegion fi - workspaceList=$(az resource list -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider) - if [ "$workspaceList" = "[]" ]; - then - # create new default workspace since no mapped existing default workspace - echo '{"location":"'"$workspaceRegion"'", "properties":{"sku":{"name": "standalone"}}}' > WorkspaceProps.json - cat WorkspaceProps.json - workspace=$(az resource create -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider --is-full-object -p @WorkspaceProps.json) + workspaceList=$(az resource list -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider) + if [ "$workspaceList" = "[]" ]; then + # create new default workspace since no mapped existing default workspace + echo '{"location":"'"$workspaceRegion"'", "properties":{"sku":{"name": "standalone"}}}' >WorkspaceProps.json + cat WorkspaceProps.json + workspace=$(az resource create -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider --is-full-object -p @WorkspaceProps.json) else echo "using existing default workspace:"$workspaceName fi - workspaceResourceId=$(az resource show -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider --query id) + workspaceResourceId=$(az resource show -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider --query id -o json) workspaceResourceId=$(echo $workspaceResourceId | tr -d '"') + echo "workspace resource Id: 
${workspaceResourceId}" } -add_container_insights_solution() -{ +add_container_insights_solution() { local resourceId="$(echo ${1})" # extract resource group from workspace resource id @@ -456,29 +469,27 @@ add_container_insights_solution() solution=$(az deployment group create -g $resourceGroup --template-uri $solutionTemplateUri --parameters workspaceResourceId=$resourceId --parameters workspaceRegion=$workspaceRegion) } -get_workspace_guid_and_key() -{ +get_workspace_guid_and_key() { # extract resource parts from workspace resource id - local resourceId="$(echo ${1} | tr -d '"' )" + local resourceId="$(echo ${1} | tr -d '"')" local subId="$(echo ${resourceId} | cut -d'/' -f3)" local rgName="$(echo ${resourceId} | cut -d'/' -f5)" local wsName="$(echo ${resourceId} | cut -d'/' -f9)" # get the workspace guid - workspaceGuid=$(az resource show -g $rgName -n $wsName --resource-type $workspaceResourceProvider --query properties.customerId) + workspaceGuid=$(az resource show -g $rgName -n $wsName --resource-type $workspaceResourceProvider --query properties.customerId -o json) workspaceGuid=$(echo $workspaceGuid | tr -d '"') echo "workspaceGuid:"$workspaceGuid echo "getting workspace primaryshared key" - workspaceKey=$(az rest --method post --uri $workspaceResourceId/sharedKeys?api-version=2015-11-01-preview --query primarySharedKey) + workspaceKey=$(az rest --method post --uri $workspaceResourceId/sharedKeys?api-version=2015-11-01-preview --query primarySharedKey -o json) workspaceKey=$(echo $workspaceKey | tr -d '"') } -install_helm_chart() -{ +install_helm_chart() { - # get the config-context for ARO v4 cluster - if [ "$isAroV4Cluster" = true ] ; then + # get the config-context for ARO v4 cluster + if [ "$isAroV4Cluster" = true ]; then echo "getting config-context of ARO v4 cluster " echo "getting admin user creds for aro v4 cluster" adminUserName=$(az aro list-credentials -g $clusterResourceGroup -n $clusterName --query 'kubeadminUsername' -o tsv) @@ -490,90 
+501,91 @@ install_helm_chart() oc new-project $openshiftProjectName echo "getting config-context of aro v4 cluster" kubeconfigContext=$(oc config current-context) - fi - - if [ -z "$kubeconfigContext" ]; then - echo "installing Azure Monitor for containers HELM chart on to the cluster and using current kube context ..." - else - echo "installing Azure Monitor for containers HELM chart on to the cluster with kubecontext:${kubeconfigContext} ..." - fi - - echo "getting the region of the cluster" - clusterRegion=$(az resource show --ids ${clusterResourceId} --query location) - echo "cluster region is : ${clusterRegion}" - - echo "adding helm repo:" $helmRepoName - helm repo add $helmRepoName $helmRepoUrl - - echo "updating helm repo to get latest charts" - helm repo update - - if [ ! -z "$proxyEndpoint" ]; then - echo "using proxy endpoint since proxy configuration passed in" - if [ -z "$kubeconfigContext" ]; then - echo "using current kube-context since --kube-context/-k parameter not passed in" - helm upgrade --install azmon-containers-release-1 --set omsagent.proxy=$proxyEndpoint,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmRepoName/$helmChartName - else - echo "using --kube-context:${kubeconfigContext} since passed in" - helm upgrade --install azmon-containers-release-1 --set omsagent.proxy=$proxyEndpoint,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmRepoName/$helmChartName --kube-context ${kubeconfigContext} - fi - else - if [ -z "$kubeconfigContext" ]; then - echo "using current kube-context since --kube-context/-k parameter not passed in" - helm upgrade --install azmon-containers-release-1 --set 
omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmRepoName/$helmChartName - else - echo "using --kube-context:${kubeconfigContext} since passed in" - helm upgrade --install azmon-containers-release-1 --set omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmRepoName/$helmChartName --kube-context ${kubeconfigContext} - fi - fi - - echo "chart installation completed." + fi + + if [ -z "$kubeconfigContext" ]; then + echo "installing Azure Monitor for containers HELM chart on to the cluster and using current kube context ..." + else + echo "installing Azure Monitor for containers HELM chart on to the cluster with kubecontext:${kubeconfigContext} ..." + fi + + echo "getting the region of the cluster" + clusterRegion=$(az resource show --ids ${clusterResourceId} --query location -o tsv) + echo "cluster region is : ${clusterRegion}" + + echo "pull the chart version ${mcrChartVersion} from ${mcr}/${mcrChartRepoPath}" + export HELM_EXPERIMENTAL_OCI=1 + helm chart pull $mcr/$mcrChartRepoPath:$mcrChartVersion + + echo "export the chart from local cache to current directory" + helm chart export $mcr/$mcrChartRepoPath:$mcrChartVersion --destination . + + helmChartRepoPath=$helmLocalRepoName/$helmChartName + + echo "helm chart repo path: ${helmChartRepoPath}" + + if [ ! 
-z "$proxyEndpoint" ]; then + echo "using proxy endpoint since proxy configuration passed in" + if [ -z "$kubeconfigContext" ]; then + echo "using current kube-context since --kube-context/-k parameter not passed in" + helm upgrade --install $releaseName --set omsagent.domain=$omsAgentDomainName,omsagent.proxy=$proxyEndpoint,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmChartRepoPath + else + echo "using --kube-context:${kubeconfigContext} since passed in" + helm upgrade --install $releaseName --set omsagent.domain=$omsAgentDomainName,omsagent.proxy=$proxyEndpoint,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmChartRepoPath --kube-context ${kubeconfigContext} + fi + else + if [ -z "$kubeconfigContext" ]; then + echo "using current kube-context since --kube-context/-k parameter not passed in" + helm upgrade --install $releaseName --set omsagent.domain=$omsAgentDomainName,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmChartRepoPath + else + echo "using --kube-context:${kubeconfigContext} since passed in" + helm upgrade --install $releaseName --set omsagent.domain=$omsAgentDomainName,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmChartRepoPath --kube-context ${kubeconfigContext} + fi + fi + + echo "chart installation completed." 
} -login_to_azure() -{ - if [ "$isUsingServicePrincipal" = true ] ; then - echo "login to the azure using provided service principal creds" - az login --service-principal --username $servicePrincipalClientId --password $servicePrincipalClientSecret --tenant $servicePrincipalTenantId +login_to_azure() { + if [ "$isUsingServicePrincipal" = true ]; then + echo "login to the azure using provided service principal creds" + az login --service-principal --username $servicePrincipalClientId --password $servicePrincipalClientSecret --tenant $servicePrincipalTenantId else echo "login to the azure interactively" az login --use-device-code fi } -set_azure_subscription() -{ - local subscriptionId="$(echo ${1})" - echo "setting the subscription id: ${subscriptionId} as current subscription for the azure cli" - az account set -s ${subscriptionId} - echo "successfully configured subscription id: ${subscriptionId} as current subscription for the azure cli" +set_azure_subscription() { + local subscriptionId="$(echo ${1})" + echo "setting the subscription id: ${subscriptionId} as current subscription for the azure cli" + az account set -s ${subscriptionId} + echo "successfully configured subscription id: ${subscriptionId} as current subscription for the azure cli" } -attach_monitoring_tags() -{ +attach_monitoring_tags() { echo "attach loganalyticsworkspaceResourceId tag on to cluster resource" - status=$(az resource update --set tags.logAnalyticsWorkspaceResourceId=$workspaceResourceId -g $clusterResourceGroup -n $clusterName --resource-type $resourceProvider) + status=$(az resource update --set tags.logAnalyticsWorkspaceResourceId=$workspaceResourceId -g $clusterResourceGroup -n $clusterName --resource-type $resourceProvider) echo "$status" echo "successfully attached logAnalyticsWorkspaceResourceId tag on the cluster resource" } # enables aks monitoring addon for private preview and dont use this for aks prod -enable_aks_monitoring_addon() -{ - echo "getting cluster object" - 
clusterGetResponse=$(az rest --method get --uri $clusterResourceId?api-version=2020-03-01) - export jqquery=".properties.addonProfiles.omsagent.config.logAnalyticsWorkspaceResourceID=\"$workspaceResourceId\"" - echo $clusterGetResponse | jq $jqquery > putrequestbody.json - status=$(az rest --method put --uri $clusterResourceId?api-version=2020-03-01 --body @putrequestbody.json --headers Content-Type=application/json) - echo "status after enabling of aks monitoringa addon:$status" +enable_aks_monitoring_addon() { + echo "getting cluster object" + clusterGetResponse=$(az rest --method get --uri $clusterResourceId?api-version=2020-03-01) + export jqquery=".properties.addonProfiles.omsagent.config.logAnalyticsWorkspaceResourceID=\"$workspaceResourceId\"" + echo $clusterGetResponse | jq $jqquery >putrequestbody.json + status=$(az rest --method put --uri $clusterResourceId?api-version=2020-03-01 --body @putrequestbody.json --headers Content-Type=application/json) + echo "status after enabling of aks monitoringa addon:$status" } # parse and validate args parse_args $@ -# configure azure cli for public cloud -configure_to_public_cloud +# validate and configure azure cli for cloud +validate_and_configure_supported_cloud # parse cluster resource id clusterSubscriptionId="$(echo $clusterResourceId | cut -d'/' -f3 | tr "[:upper:]" "[:lower:]")" @@ -587,9 +599,9 @@ login_to_azure # set the cluster subscription id as active sub for azure cli set_azure_subscription $clusterSubscriptionId -# validate cluster identity if its ARC k8s cluster -if [ "$isArcK8sCluster" = true ] ; then - validate_cluster_identity $clusterResourceGroup $clusterName +# validate cluster identity if its Azure Arc enabled Kubernetes cluster +if [ "$isArcK8sCluster" = true ]; then + validate_cluster_identity $clusterResourceGroup $clusterName fi if [ -z $workspaceResourceId ]; then @@ -598,7 +610,7 @@ if [ -z $workspaceResourceId ]; then else echo "using provided azure log analytics 
workspace:${workspaceResourceId}" workspaceResourceId=$(echo $workspaceResourceId | tr -d '"') - workspaceSubscriptionId="$(echo ${workspaceResourceId} | cut -d'/' -f3 | tr "[:upper:]" "[:lower:]" )" + workspaceSubscriptionId="$(echo ${workspaceResourceId} | cut -d'/' -f3 | tr "[:upper:]" "[:lower:]")" workspaceResourceGroup="$(echo ${workspaceResourceId} | cut -d'/' -f5)" workspaceName="$(echo ${workspaceResourceId} | cut -d'/' -f9)" @@ -609,7 +621,7 @@ else set_azure_subscription $workspaceSubscriptionId fi - workspaceRegion=$(az resource show --ids ${workspaceResourceId} --query location) + workspaceRegion=$(az resource show --ids ${workspaceResourceId} --query location -o json) workspaceRegion=$(echo $workspaceRegion | tr -d '"') echo "Workspace Region:"$workspaceRegion fi @@ -620,13 +632,13 @@ add_container_insights_solution $workspaceResourceId # get workspace guid and key get_workspace_guid_and_key $workspaceResourceId -if [ "$isClusterAndWorkspaceInSameSubscription" = false ] ; then +if [ "$isClusterAndWorkspaceInSameSubscription" = false ]; then echo "switch to cluster subscription id as active subscription for cli: ${clusterSubscriptionId}" set_azure_subscription $clusterSubscriptionId fi # attach monitoring tags on to cluster resource -if [ "$isAksCluster" = true ] ; then +if [ "$isAksCluster" = true ]; then enable_aks_monitoring_addon else attach_monitoring_tags diff --git a/scripts/onboarding/managed/upgrade-monitoring.sh b/scripts/onboarding/managed/upgrade-monitoring.sh new file mode 100644 index 000000000..5456a7072 --- /dev/null +++ b/scripts/onboarding/managed/upgrade-monitoring.sh @@ -0,0 +1,329 @@ +#!/bin/bash +# +# Execute this directly in Azure Cloud Shell (https://shell.azure.com) by pasting (SHIFT+INS on Windows, CTRL+V on Mac or Linux) +# the following line (beginning with curl...) 
at the command prompt and then replacing the args: +# This scripts upgrades the existing Azure Monitor for containers release on Azure Arc enabled Kubernetes cluster +# +# 1. Upgrades existing Azure Monitor for containers release to the K8s cluster in provided via --kube-context +# Prerequisites : +# Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest +# Helm3 : https://helm.sh/docs/intro/install/ + +# download script +# curl -o enable-monitoring.sh -L https://aka.ms/upgrade-monitoring-bash-script +# 1. Using Service Principal for Azure Login +## bash upgrade-monitoring.sh --client-id --client-secret --tenant-id +# 2. Using Interactive device login +# bash upgrade-monitoring.sh --resource-id + +set -e +set -o pipefail + +# released chart version for Azure Arc enabled Kubernetes public preview +mcrChartVersion="2.8.3" +mcr="mcr.microsoft.com" +mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" + +# default to public cloud since only supported cloud is azure public clod +defaultAzureCloud="AzureCloud" +helmLocalRepoName="." 
+helmChartName="azuremonitor-containers" + +# default release name used during onboarding +releaseName="azmon-containers-release-1" + +# resource provider for azure arc connected cluster +arcK8sResourceProvider="Microsoft.Kubernetes/connectedClusters" + +# default of resourceProvider is Azure Arc enabled Kubernetes and this will get updated based on the provider cluster resource +resourceProvider="Microsoft.Kubernetes/connectedClusters" + +# Azure Arc enabled Kubernetes cluster resource +isArcK8sCluster=false + +# openshift project name for aro v4 cluster +openshiftProjectName="azure-monitor-for-containers" + +# Azure Arc enabled Kubernetes cluster resource +isAroV4Cluster=false + +# default global params +clusterResourceId="" +kubeconfigContext="" + +# default workspace region and code +workspaceRegion="eastus" +workspaceRegionCode="EUS" +workspaceResourceGroup="DefaultResourceGroup-"$workspaceRegionCode + +# default workspace guid and key +workspaceGuid="" +workspaceKey="" + +# sp details for the login if provided +servicePrincipalClientId="" +servicePrincipalClientSecret="" +servicePrincipalTenantId="" +isUsingServicePrincipal=false + +usage() { + local basename=$(basename $0) + echo + echo "Upgrade Azure Monitor for containers:" + echo "$basename --resource-id [--client-id ] [--client-secret ] [--tenant-id ] [--kube-context ]" +} + +parse_args() { + + if [ $# -le 1 ]; then + usage + exit 1 + fi + + # Transform long options to short ones + for arg in "$@"; do + shift + case "$arg" in + "--resource-id") set -- "$@" "-r" ;; + "--kube-context") set -- "$@" "-k" ;; + "--client-id") set -- "$@" "-c" ;; + "--client-secret") set -- "$@" "-s" ;; + "--tenant-id") set -- "$@" "-t" ;; + "--"*) usage ;; + *) set -- "$@" "$arg" ;; + esac + done + + local OPTIND opt + + while getopts 'hk:r:c:s:t:' opt; do + case "$opt" in + h) + usage + ;; + + k) + kubeconfigContext="$OPTARG" + echo "name of kube-context is $OPTARG" + ;; + + r) + clusterResourceId="$OPTARG" + echo 
"clusterResourceId is $OPTARG" + ;; + + c) + servicePrincipalClientId="$OPTARG" + echo "servicePrincipalClientId is $OPTARG" + ;; + + s) + servicePrincipalClientSecret="$OPTARG" + echo "clientSecret is *****" + ;; + + t) + servicePrincipalTenantId="$OPTARG" + echo "service principal tenantId is $OPTARG" + ;; + + ?) + usage + exit 1 + ;; + esac + done + shift "$(($OPTIND - 1))" + + local subscriptionId="$(echo ${clusterResourceId} | cut -d'/' -f3)" + local resourceGroup="$(echo ${clusterResourceId} | cut -d'/' -f5)" + + # get resource parts and join back to get the provider name + local providerNameResourcePart1="$(echo ${clusterResourceId} | cut -d'/' -f7)" + local providerNameResourcePart2="$(echo ${clusterResourceId} | cut -d'/' -f8)" + local providerName="$(echo ${providerNameResourcePart1}/${providerNameResourcePart2})" + + local clusterName="$(echo ${clusterResourceId} | cut -d'/' -f9)" + + # convert to lowercase for validation + providerName=$(echo $providerName | tr "[:upper:]" "[:lower:]") + + echo "cluster SubscriptionId:" $subscriptionId + echo "cluster ResourceGroup:" $resourceGroup + echo "cluster ProviderName:" $providerName + echo "cluster Name:" $clusterName + + if [ -z "$subscriptionId" -o -z "$resourceGroup" -o -z "$providerName" -o -z "$clusterName" ]; then + echo "-e invalid cluster resource id. Please try with valid fully qualified resource id of the cluster" + exit 1 + fi + + if [[ $providerName != microsoft.* ]]; then + echo "-e invalid azure cluster resource id format." 
+ exit 1 + fi + + # detect the resource provider from the provider name in the cluster resource id + if [ $providerName = "microsoft.kubernetes/connectedclusters" ]; then + echo "provider cluster resource is of Azure Arc enabled Kubernetes cluster type" + isArcK8sCluster=true + resourceProvider=$arcK8sResourceProvider + elif [ $providerName = "microsoft.redhatopenshift/openshiftclusters" ]; then + echo "provider cluster resource is of AROv4 cluster type" + resourceProvider=$aroV4ResourceProvider + isAroV4Cluster=true + elif [ $providerName = "microsoft.containerservice/managedclusters" ]; then + echo "provider cluster resource is of AKS cluster type" + isAksCluster=true + resourceProvider=$aksResourceProvider + else + echo "-e unsupported azure managed cluster type" + exit 1 + fi + + if [ -z "$kubeconfigContext" ]; then + echo "using or getting current kube config context since --kube-context parameter not set " + fi + + if [ ! -z "$servicePrincipalClientId" -a ! -z "$servicePrincipalClientSecret" -a ! 
-z "$servicePrincipalTenantId" ]; then + echo "using service principal creds (clientId, secret and tenantId) for azure login since provided" + isUsingServicePrincipal=true + fi +} + +configure_to_public_cloud() { + echo "Set AzureCloud as active cloud for az cli" + az cloud set -n $defaultAzureCloud +} + +validate_cluster_identity() { + echo "validating cluster identity" + + local rgName="$(echo ${1})" + local clusterName="$(echo ${2})" + + local identitytype=$(az resource show -g ${rgName} -n ${clusterName} --resource-type $resourceProvider --query identity.type -o json) + identitytype=$(echo $identitytype | tr "[:upper:]" "[:lower:]" | tr -d '"') + echo "cluster identity type:" $identitytype + + if [[ "$identitytype" != "systemassigned" ]]; then + echo "-e only supported cluster identity is systemassigned for Azure Arc enabled Kubernetes cluster type" + exit 1 + fi + + echo "successfully validated the identity of the cluster" +} + +validate_monitoring_tags() { + echo "get loganalyticsworkspaceResourceId tag on to cluster resource" + logAnalyticsWorkspaceResourceIdTag=$(az resource show --query tags.logAnalyticsWorkspaceResourceId -g $clusterResourceGroup -n $clusterName --resource-type $resourceProvider -o json) + echo "configured log analytics workspace: ${logAnalyticsWorkspaceResourceIdTag}" + echo "successfully got logAnalyticsWorkspaceResourceId tag on the cluster resource" + if [ -z "$logAnalyticsWorkspaceResourceIdTag" ]; then + echo "-e logAnalyticsWorkspaceResourceId doesnt exist on this cluster which indicates cluster not enabled for monitoring" + exit 1 + fi +} + + +upgrade_helm_chart_release() { + + # get the config-context for ARO v4 cluster + if [ "$isAroV4Cluster" = true ]; then + echo "getting config-context of ARO v4 cluster " + echo "getting admin user creds for aro v4 cluster" + adminUserName=$(az aro list-credentials -g $clusterResourceGroup -n $clusterName --query 'kubeadminUsername' -o tsv) + adminPassword=$(az aro list-credentials -g 
$clusterResourceGroup -n $clusterName --query 'kubeadminPassword' -o tsv) + apiServer=$(az aro show -g $clusterResourceGroup -n $clusterName --query apiserverProfile.url -o tsv) + echo "login to the cluster via oc login" + oc login $apiServer -u $adminUserName -p $adminPassword + echo "creating project azure-monitor-for-containers" + oc new-project $openshiftProjectName + echo "getting config-context of aro v4 cluster" + kubeconfigContext=$(oc config current-context) + fi + + if [ -z "$kubeconfigContext" ]; then + echo "installing Azure Monitor for containers HELM chart on to the cluster and using current kube context ..." + else + echo "installing Azure Monitor for containers HELM chart on to the cluster with kubecontext:${kubeconfigContext} ..." + fi + + export HELM_EXPERIMENTAL_OCI=1 + + echo "pull the chart from ${mcr}/${mcrChartRepoPath}:${mcrChartVersion}" + helm chart pull ${mcr}/${mcrChartRepoPath}:${mcrChartVersion} + + echo "export the chart from local cache to current directory" + helm chart export ${mcr}/${mcrChartRepoPath}:${mcrChartVersion} --destination . + + helmChartRepoPath=$helmLocalRepoName/$helmChartName + + echo "upgrading the release: $releaseName to chart version : ${mcrChartVersion}" + helm get values $releaseName -o yaml | helm upgrade --install $releaseName $helmChartRepoPath -f - + echo "$releaseName got upgraded successfully." 
+} + +login_to_azure() { + if [ "$isUsingServicePrincipal" = true ]; then + echo "login to the azure using provided service principal creds" + az login --service-principal --username $servicePrincipalClientId --password $servicePrincipalClientSecret --tenant $servicePrincipalTenantId + else + echo "login to the azure interactively" + az login --use-device-code + fi +} + +set_azure_subscription() { + local subscriptionId="$(echo ${1})" + echo "setting the subscription id: ${subscriptionId} as current subscription for the azure cli" + az account set -s ${subscriptionId} + echo "successfully configured subscription id: ${subscriptionId} as current subscription for the azure cli" +} + +validate_and_configure_supported_cloud() { + echo "get active azure cloud name configured to azure cli" + azureCloudName=$(az cloud show --query name -o tsv | tr "[:upper:]" "[:lower:]") + echo "active azure cloud name configured to azure cli: ${azureCloudName}" + if [ "$isArcK8sCluster" = true ]; then + if [ "$azureCloudName" != "azurecloud" -a "$azureCloudName" != "azureusgovernment" ]; then + echo "-e only supported clouds are AzureCloud and AzureUSGovernment for Azure Arc enabled Kubernetes cluster type" + exit 1 + fi + else + # For ARO v4, only supported cloud is public so just configure to public to keep the existing behavior + configure_to_public_cloud + fi +} + +# parse and validate args +parse_args $@ + +# configure azure cli for cloud +validate_and_configure_supported_cloud + +# parse cluster resource id +clusterSubscriptionId="$(echo $clusterResourceId | cut -d'/' -f3 | tr "[:upper:]" "[:lower:]")" +clusterResourceGroup="$(echo $clusterResourceId | cut -d'/' -f5)" +providerName="$(echo $clusterResourceId | cut -d'/' -f7)" +clusterName="$(echo $clusterResourceId | cut -d'/' -f9)" + +# login to azure +login_to_azure + +# set the cluster subscription id as active sub for azure cli +set_azure_subscription $clusterSubscriptionId + +# validate cluster identity if its Azure Arc 
enabled Kubernetes cluster +if [ "$isArcK8sCluster" = true ]; then + validate_cluster_identity $clusterResourceGroup $clusterName +fi + +# validate the cluster has monitoring tags +validate_monitoring_tags + +# upgrade helm chart release +upgrade_helm_chart_release + +# portal link +echo "Proceed to https://aka.ms/azmon-containers to view health of your newly onboarded cluster" diff --git a/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json b/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json new file mode 100644 index 000000000..95e7ba5d0 --- /dev/null +++ b/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json @@ -0,0 +1,127 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "clusterResourceId": { + "type": "string", + "metadata": { + "description": "Resource Id of the Azure Arc Connected Cluster" + } + }, + "clusterRegion": { + "type": "string", + "metadata": { + "description": "Location of the Azure Arc Connected Cluster Resource e.g. \"eastus\"" + } + }, + "workspaceResourceId": { + "type": "string", + "metadata": { + "description": "Azure Monitor Log Analytics Resource ID" + } + }, + "workspaceRegion": { + "type": "string", + "metadata": { + "description": "Azure Monitor Log Analytics Workspace region e.g. \"eastus\"" + } + }, + "workspaceDomain": { + "type": "string", + "allowedValues": [ + "opinsights.azure.com", + "opinsights.azure.cn", + "opinsights.azure.us", + "opinsights.azure.eaglex.ic.gov", + "opinsights.azure.microsoft.scloud" + ], + "defaultValue": "opinsights.azure.com", + "metadata": { + "description": "Azure Monitor Log Analytics Workspace Domain e.g. 
opinsights.azure.com" + } + } + }, + "resources": [ + { + "type": "Microsoft.Resources/deployments", + "name": "[Concat('ContainerInsights', '-', uniqueString(parameters('workspaceResourceId')))]", + "apiVersion": "2017-05-10", + "subscriptionId": "[split(parameters('workspaceResourceId'),'/')[2]]", + "resourceGroup": "[split(parameters('workspaceResourceId'),'/')[4]]", + "properties": { + "mode": "Incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": {}, + "variables": {}, + "resources": [ + { + "apiVersion": "2015-11-01-preview", + "type": "Microsoft.OperationsManagement/solutions", + "location": "[parameters('workspaceRegion')]", + "name": "[Concat('ContainerInsights', '(', split(parameters('workspaceResourceId'),'/')[8], ')')]", + "properties": { + "workspaceResourceId": "[parameters('workspaceResourceId')]" + }, + "plan": { + "name": "[Concat('ContainerInsights', '(', split(parameters('workspaceResourceId'),'/')[8], ')')]", + "product": "[Concat('OMSGallery/', 'ContainerInsights')]", + "promotionCode": "", + "publisher": "Microsoft" + } + } + ] + }, + "parameters": {} + } + }, + { + "type": "Microsoft.Resources/deployments", + "name": "[Concat('arc-k8s-ci-extension', '-', uniqueString(parameters('clusterResourceId')))]", + "apiVersion": "2019-05-01", + "subscriptionId": "[split(parameters('clusterResourceId'),'/')[2]]", + "resourceGroup": "[split(parameters('clusterResourceId'),'/')[4]]", + "dependsOn": [ + "[Concat('ContainerInsights', '-', uniqueString(parameters('workspaceResourceId')))]" + ], + "properties": { + "mode": "Incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": {}, + "variables": {}, + "resources": [ + { + "type": "Microsoft.KubernetesConfiguration/extensions", + "apiVersion": "2020-07-01-preview", + "name": 
"azuremonitor-containers", + "location": "[parameters('clusterRegion')]", + "identity": {"type": "systemassigned"}, + "properties": { + "extensionType": "Microsoft.AzureMonitor.Containers", + "configurationSettings": { + "logAnalyticsWorkspaceResourceID": "[parameters('workspaceResourceId')]", + "omsagent.domain": "[parameters('workspaceDomain')]" + }, + "configurationProtectedSettings": { + "omsagent.secret.wsid": "[reference(parameters('workspaceResourceId'), '2015-03-20').customerId]", + "omsagent.secret.key": "[listKeys(parameters('workspaceResourceId'), '2015-03-20').primarySharedKey]" + }, + "autoUpgradeMinorVersion": true, + "releaseTrain": "Stable", + "scope": { + "Cluster": { + "releaseNamespace": "azuremonitor-containers" + } + } + }, + "scope": "[concat('Microsoft.Kubernetes/connectedClusters/', split(parameters('clusterResourceId'),'/')[8])]" + } + ] + } + } + } + ] +} diff --git a/scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json b/scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json new file mode 100644 index 000000000..6829d3d05 --- /dev/null +++ b/scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json @@ -0,0 +1,21 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "clusterResourceId": { + "value": "/subscriptions//resourceGroups//providers/Microsoft.Kubernetes/connectedClusters/" + }, + "clusterRegion": { + "value": "" + }, + "workspaceResourceId": { + "value": "/subscriptions//resourcegroups//providers/microsoft.operationalinsights/workspaces/" + }, + "workspaceRegion": { + "value": "" + }, + "workspaceDomain": { + "value": "" + } + } +} diff --git a/scripts/preview/health/omsagent-template-aks-engine.yaml b/scripts/preview/health/omsagent-template-aks-engine.yaml index 5526602c0..5e063fd54 100644 --- a/scripts/preview/health/omsagent-template-aks-engine.yaml +++ 
b/scripts/preview/health/omsagent-template-aks-engine.yaml @@ -108,14 +108,12 @@ data: type filter_inventory2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope log_level info # custom_metrics_mdm filter plugin for perf data from windows nodes type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes log_level info diff --git a/scripts/preview/health/omsagent-template.yaml b/scripts/preview/health/omsagent-template.yaml index 6e3a52020..e58e9c33f 100644 --- a/scripts/preview/health/omsagent-template.yaml +++ b/scripts/preview/health/omsagent-template.yaml @@ -108,14 +108,12 @@ data: type filter_inventory2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope log_level info # custom_metrics_mdm filter plugin for perf data from windows nodes type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes log_level info diff --git a/scripts/troubleshoot/TroubleshootError.ps1 b/scripts/troubleshoot/TroubleshootError.ps1 index 754a43e74..4c2d95ac6 100644 --- a/scripts/troubleshoot/TroubleshootError.ps1 +++ b/scripts/troubleshoot/TroubleshootError.ps1 @@ -234,7 +234,17 @@ $MdmCustomMetricAvailabilityLocations = ( 'eastasia', 'centralindia', 'uksouth', - 'canadacentral' + 'canadacentral', + 'francecentral', + 'japaneast', + 'australiaeast', + 'eastus2', + 'westus', + 'australiasoutheast', + 'brazilsouth', + 'germanywestcentral', + 'northcentralus', + 'switzerlandnorth' ); try { diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 63ca6de10..d35acad3d 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -32,13 +32,16 @@ import ( // 
DataType for Container Log const ContainerLogDataType = "CONTAINER_LOG_BLOB" +//DataType for Container Log v2 +const ContainerLogV2DataType = "CONTAINERINSIGHTS_CONTAINERLOGV2" + // DataType for Insights metric const InsightsMetricsDataType = "INSIGHTS_METRICS_BLOB" // DataType for KubeMonAgentEvent const KubeMonAgentEventDataType = "KUBE_MON_AGENT_EVENTS_BLOB" -//env varibale which has ResourceId for LA +//env variable which has ResourceId for LA const ResourceIdEnv = "AKS_RESOURCE_ID" //env variable which has ResourceName for NON-AKS @@ -78,20 +81,26 @@ const DaemonSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimpr const ReplicaSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf" const WindowsContainerLogPluginConfFilePath = "/etc/omsagentwindows/out_oms.conf" -// IPName for Container Log -const IPName = "Containers" +// IPName +const IPName = "ContainerInsights" + + const defaultContainerInventoryRefreshInterval = 60 const kubeMonAgentConfigEventFlushInterval = 60 //Eventsource name in mdsd -const MdsdSourceName = "ContainerLogSource" +const MdsdContainerLogSourceName = "ContainerLogSource" +const MdsdContainerLogV2SourceName = "ContainerLogV2Source" -//container logs route - v2 (v2=flush to oneagent, adx= flush to adx ingestion, anything else flush to ODS[default]) +//container logs route (v2=flush to oneagent, adx= flush to adx ingestion, anything else flush to ODS[default]) const ContainerLogsV2Route = "v2" const ContainerLogsADXRoute = "adx" +//container logs schema (v2=ContainerLogsV2 table in LA, anything else ContainerLogs table in LA. 
This is applicable only if Container logs route is NOT ADX) +const ContainerLogV2SchemaVersion = "v2" + var ( // PluginConfiguration the plugins configuration PluginConfiguration map[string]string @@ -125,6 +134,8 @@ var ( ContainerLogsRouteV2 bool // container log route for routing thru ADX ContainerLogsRouteADX bool + // container log schema (applicable only for non-ADX route) + ContainerLogSchemaV2 bool //ADX Cluster URI AdxClusterUri string // ADX clientID @@ -180,8 +191,8 @@ var ( userAgent = "" ) -// DataItem represents the object corresponding to the json that is sent by fluentbit tail plugin -type DataItem struct { +// DataItemLAv1 == ContainerLog table in LA +type DataItemLAv1 struct { LogEntry string `json:"LogEntry"` LogEntrySource string `json:"LogEntrySource"` LogEntryTimeStamp string `json:"LogEntryTimeStamp"` @@ -193,16 +204,31 @@ type DataItem struct { Computer string `json:"Computer"` } +// DataItemLAv2 == ContainerLogV2 table in LA +// Please keep the names same as destination column names, to avoid transforming one to another in the pipeline +type DataItemLAv2 struct { + TimeGenerated string `json:"TimeGenerated"` + Computer string `json:"Computer"` + ContainerId string `json:"ContainerId"` + ContainerName string `json:"ContainerName"` + PodName string `json:"PodName"` + PodNamespace string `json:"PodNamespace"` + LogMessage string `json:"LogMessage"` + LogSource string `json:"LogSource"` + //PodLabels string `json:"PodLabels"` +} + +// DataItemADX == ContainerLogV2 table in ADX type DataItemADX struct { - LogEntry string `json:"LogEntry"` - LogEntrySource string `json:"LogEntrySource"` - LogEntryTimeStamp string `json:"LogEntryTimeStamp"` - LogEntryTimeOfCommand string `json:"TimeOfCommand"` - ID string `json:"Id"` - Image string `json:"Image"` - Name string `json:"Name"` - SourceSystem string `json:"SourceSystem"` + TimeGenerated string `json:"TimeGenerated"` Computer string `json:"Computer"` + ContainerId string `json:"ContainerId"` + 
ContainerName string `json:"ContainerName"` + PodName string `json:"PodName"` + PodNamespace string `json:"PodNamespace"` + LogMessage string `json:"LogMessage"` + LogSource string `json:"LogSource"` + //PodLabels string `json:"PodLabels"` AzureResourceId string `json:"AzureResourceId"` } @@ -227,10 +253,17 @@ type InsightsMetricsBlob struct { } // ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point -type ContainerLogBlob struct { +type ContainerLogBlobLAv1 struct { + DataType string `json:"DataType"` + IPName string `json:"IPName"` + DataItems []DataItemLAv1 `json:"DataItems"` +} + +// ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point +type ContainerLogBlobLAv2 struct { DataType string `json:"DataType"` IPName string `json:"IPName"` - DataItems []DataItem `json:"DataItems"` + DataItems []DataItemLAv2 `json:"DataItems"` } // MsgPackEntry represents the object corresponding to a single messagepack event in the messagepack stream @@ -422,7 +455,7 @@ func convert(in interface{}) (float64, bool) { func populateKubeMonAgentEventHash(record map[interface{}]interface{}, errType KubeMonAgentEventType) { var logRecordString = ToString(record["log"]) var eventTimeStamp = ToString(record["time"]) - containerID, _, podName := GetContainerIDK8sNamespacePodNameFromFileName(ToString(record["filepath"])) + containerID, _, podName, _ := GetContainerIDK8sNamespacePodNameFromFileName(ToString(record["filepath"])) Log("Locked EventHashUpdateMutex for updating hash \n ") EventHashUpdateMutex.Lock() @@ -792,7 +825,8 @@ func UpdateNumTelegrafMetricsSentTelemetry(numMetricsSent int, numSendErrors int // PostDataHelper sends data to the ODS endpoint or oneagent or ADX func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { start := time.Now() - var dataItems []DataItem + var dataItemsLAv1 []DataItemLAv1 + var dataItemsLAv2 []DataItemLAv2 var dataItemsADX []DataItemADX 
var msgPackEntries []MsgPackEntry @@ -816,7 +850,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { DataUpdateMutex.Unlock() for _, record := range tailPluginRecords { - containerID, k8sNamespace, _ := GetContainerIDK8sNamespacePodNameFromFileName(ToString(record["filepath"])) + containerID, k8sNamespace, k8sPodName, containerName := GetContainerIDK8sNamespacePodNameFromFileName(ToString(record["filepath"])) logEntrySource := ToString(record["stream"]) if strings.EqualFold(logEntrySource, "stdout") { @@ -830,26 +864,42 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { } stringMap = make(map[string]string) + //below id & name are used by latency telemetry in both v1 & v2 LA schemas + id := "" + name := "" logEntry := ToString(record["log"]) logEntryTimeStamp := ToString(record["time"]) - stringMap["LogEntry"] = logEntry - stringMap["LogEntrySource"] = logEntrySource - stringMap["LogEntryTimeStamp"] = logEntryTimeStamp - stringMap["SourceSystem"] = "Containers" - stringMap["Id"] = containerID - - if val, ok := imageIDMap[containerID]; ok { - stringMap["Image"] = val - } + //ADX Schema & LAv2 schema are almost the same (except resourceId) + if (ContainerLogSchemaV2 == true || ContainerLogsRouteADX == true) { + stringMap["Computer"] = Computer + stringMap["ContainerId"] = containerID + stringMap["ContainerName"] = containerName + stringMap["PodName"] = k8sPodName + stringMap["PodNamespace"] = k8sNamespace + stringMap["LogMessage"] = logEntry + stringMap["LogSource"] = logEntrySource + stringMap["TimeGenerated"] = logEntryTimeStamp + } else { + stringMap["LogEntry"] = logEntry + stringMap["LogEntrySource"] = logEntrySource + stringMap["LogEntryTimeStamp"] = logEntryTimeStamp + stringMap["SourceSystem"] = "Containers" + stringMap["Id"] = containerID + + if val, ok := imageIDMap[containerID]; ok { + stringMap["Image"] = val + } - if val, ok := nameIDMap[containerID]; ok { - stringMap["Name"] = val - } + if val, ok 
:= nameIDMap[containerID]; ok { + stringMap["Name"] = val + } - stringMap["TimeOfCommand"] = start.Format(time.RFC3339) - stringMap["Computer"] = Computer - var dataItem DataItem + stringMap["TimeOfCommand"] = start.Format(time.RFC3339) + stringMap["Computer"] = Computer + } + var dataItemLAv1 DataItemLAv1 + var dataItemLAv2 DataItemLAv2 var dataItemADX DataItemADX var msgPackEntry MsgPackEntry @@ -866,48 +916,68 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { } else if ContainerLogsRouteADX == true { if ResourceCentric == true { stringMap["AzureResourceId"] = ResourceID + } else { + stringMap["AzureResourceId"] = "" } dataItemADX = DataItemADX{ - ID: stringMap["Id"], - LogEntry: stringMap["LogEntry"], - LogEntrySource: stringMap["LogEntrySource"], - LogEntryTimeStamp: stringMap["LogEntryTimeStamp"], - LogEntryTimeOfCommand: stringMap["TimeOfCommand"], - SourceSystem: stringMap["SourceSystem"], + TimeGenerated: stringMap["TimeGenerated"], Computer: stringMap["Computer"], - Image: stringMap["Image"], - Name: stringMap["Name"], + ContainerId: stringMap["ContainerId"], + ContainerName: stringMap["ContainerName"], + PodName: stringMap["PodName"], + PodNamespace: stringMap["PodNamespace"], + LogMessage: stringMap["LogMessage"], + LogSource: stringMap["LogSource"], AzureResourceId: stringMap["AzureResourceId"], } //ADX dataItemsADX = append(dataItemsADX, dataItemADX) } else { - dataItem = DataItem{ - ID: stringMap["Id"], - LogEntry: stringMap["LogEntry"], - LogEntrySource: stringMap["LogEntrySource"], - LogEntryTimeStamp: stringMap["LogEntryTimeStamp"], - LogEntryTimeOfCommand: stringMap["TimeOfCommand"], - SourceSystem: stringMap["SourceSystem"], - Computer: stringMap["Computer"], - Image: stringMap["Image"], - Name: stringMap["Name"], + if (ContainerLogSchemaV2 == true) { + dataItemLAv2 = DataItemLAv2{ + TimeGenerated: stringMap["TimeGenerated"], + Computer: stringMap["Computer"], + ContainerId: stringMap["ContainerId"], + ContainerName: 
stringMap["ContainerName"], + PodName: stringMap["PodName"], + PodNamespace: stringMap["PodNamespace"], + LogMessage: stringMap["LogMessage"], + LogSource: stringMap["LogSource"], + } + //ODS-v2 schema + dataItemsLAv2 = append(dataItemsLAv2, dataItemLAv2) + name = stringMap["ContainerName"] + id = stringMap["ContainerId"] + } else { + dataItemLAv1 = DataItemLAv1{ + ID: stringMap["Id"], + LogEntry: stringMap["LogEntry"], + LogEntrySource: stringMap["LogEntrySource"], + LogEntryTimeStamp: stringMap["LogEntryTimeStamp"], + LogEntryTimeOfCommand: stringMap["TimeOfCommand"], + SourceSystem: stringMap["SourceSystem"], + Computer: stringMap["Computer"], + Image: stringMap["Image"], + Name: stringMap["Name"], + } + //ODS-v1 schema + dataItemsLAv1 = append(dataItemsLAv1, dataItemLAv1) + name = stringMap["Name"] + id = stringMap["Id"] } - //ODS - dataItems = append(dataItems, dataItem) } - if stringMap["LogEntryTimeStamp"] != "" { - loggedTime, e := time.Parse(time.RFC3339, stringMap["LogEntryTimeStamp"]) + if logEntryTimeStamp != "" { + loggedTime, e := time.Parse(time.RFC3339, logEntryTimeStamp) if e != nil { - message := fmt.Sprintf("Error while converting LogEntryTimeStamp for telemetry purposes: %s", e.Error()) + message := fmt.Sprintf("Error while converting logEntryTimeStamp for telemetry purposes: %s", e.Error()) Log(message) SendException(message) } else { ltncy := float64(start.Sub(loggedTime) / time.Millisecond) if ltncy >= maxLatency { maxLatency = ltncy - maxLatencyContainer = dataItem.Name + "=" + dataItem.ID + maxLatencyContainer = name + "=" + id } } } @@ -917,8 +987,12 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { if len(msgPackEntries) > 0 && ContainerLogsRouteV2 == true { //flush to mdsd + mdsdSourceName := MdsdContainerLogSourceName + if (ContainerLogSchemaV2 == true) { + mdsdSourceName = MdsdContainerLogV2SourceName + } fluentForward := MsgPackForward{ - Tag: MdsdSourceName, + Tag: mdsdSourceName, Entries: msgPackEntries, 
} @@ -965,7 +1039,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { elapsed = time.Since(start) if er != nil { - Log("Error::mdsd::Failed to write to mdsd %d records after %s. Will retry ... error : %s", len(dataItems), elapsed, er.Error()) + Log("Error::mdsd::Failed to write to mdsd %d records after %s. Will retry ... error : %s", len(msgPackEntries), elapsed, er.Error()) if MdsdMsgpUnixSocketClient != nil { MdsdMsgpUnixSocketClient.Close() MdsdMsgpUnixSocketClient = nil @@ -1011,14 +1085,14 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { } } - // Setup a maximum time for completion to be 15 Seconds. + // Setup a maximum time for completion to be 30 Seconds. ctx, cancel := context.WithTimeout(ParentContext, 30*time.Second) defer cancel() //ADXFlushMutex.Lock() //defer ADXFlushMutex.Unlock() //MultiJSON support is not there yet - if ingestionErr := ADXIngestor.FromReader(ctx, r, ingest.IngestionMappingRef("ContainerLogMapping", ingest.JSON), ingest.FileFormat(ingest.JSON), ingest.FlushImmediately()); ingestionErr != nil { + if ingestionErr := ADXIngestor.FromReader(ctx, r, ingest.IngestionMappingRef("ContainerLogV2Mapping", ingest.JSON), ingest.FileFormat(ingest.JSON)); ingestionErr != nil { Log("Error when streaming to ADX Ingestion: %s", ingestionErr.Error()) //ADXIngestor = nil //not required as per ADX team. 
Will keep it to indicate that we tried this approach @@ -1033,58 +1107,75 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { numContainerLogRecords = len(dataItemsADX) Log("Success::ADX::Successfully wrote %d container log records to ADX in %s", numContainerLogRecords, elapsed) - } else { - //flush to ODS - if len(dataItems) > 0 { - logEntry := ContainerLogBlob{ - DataType: ContainerLogDataType, + } else { //ODS + var logEntry interface{} + recordType := "" + loglinesCount := 0 + //schema v2 + if (len(dataItemsLAv2) > 0 && ContainerLogSchemaV2 == true) { + logEntry = ContainerLogBlobLAv2{ + DataType: ContainerLogV2DataType, IPName: IPName, - DataItems: dataItems} - - marshalled, err := json.Marshal(logEntry) - if err != nil { - message := fmt.Sprintf("Error while Marshalling log Entry: %s", err.Error()) - Log(message) - SendException(message) - return output.FLB_OK + DataItems: dataItemsLAv2} + loglinesCount = len(dataItemsLAv2) + recordType = "ContainerLogV2" + } else { + //schema v1 + if len(dataItemsLAv1) > 0 { + logEntry = ContainerLogBlobLAv1{ + DataType: ContainerLogDataType, + IPName: IPName, + DataItems: dataItemsLAv1} + loglinesCount = len(dataItemsLAv1) + recordType = "ContainerLog" } + } - req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(marshalled)) - req.Header.Set("Content-Type", "application/json") - req.Header.Set("User-Agent", userAgent) - reqId := uuid.New().String() - req.Header.Set("X-Request-ID", reqId) - //expensive to do string len for every request, so use a flag - if ResourceCentric == true { - req.Header.Set("x-ms-AzureResourceId", ResourceID) - } + marshalled, err := json.Marshal(logEntry) + //Log("LogEntry::e %s", marshalled) + if err != nil { + message := fmt.Sprintf("Error while Marshalling log Entry: %s", err.Error()) + Log(message) + SendException(message) + return output.FLB_OK + } - resp, err := HTTPClient.Do(req) - elapsed = time.Since(start) + req, _ := http.NewRequest("POST", OMSEndpoint, 
bytes.NewBuffer(marshalled)) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("User-Agent", userAgent) + reqId := uuid.New().String() + req.Header.Set("X-Request-ID", reqId) + //expensive to do string len for every request, so use a flag + if ResourceCentric == true { + req.Header.Set("x-ms-AzureResourceId", ResourceID) + } + + resp, err := HTTPClient.Do(req) + elapsed = time.Since(start) - if err != nil { - message := fmt.Sprintf("Error when sending request %s \n", err.Error()) - Log(message) - // Commenting this out for now. TODO - Add better telemetry for ods errors using aggregation - //SendException(message) - Log("Failed to flush %d records after %s", len(dataItems), elapsed) + if err != nil { + message := fmt.Sprintf("Error when sending request %s \n", err.Error()) + Log(message) + // Commenting this out for now. TODO - Add better telemetry for ods errors using aggregation + //SendException(message) + + Log("Failed to flush %d records after %s", loglinesCount, elapsed) - return output.FLB_RETRY - } + return output.FLB_RETRY + } - if resp == nil || resp.StatusCode != 200 { - if resp != nil { - Log("RequestId %s Status %s Status Code %d", reqId, resp.Status, resp.StatusCode) - } - return output.FLB_RETRY + if resp == nil || resp.StatusCode != 200 { + if resp != nil { + Log("RequestId %s Status %s Status Code %d", reqId, resp.Status, resp.StatusCode) } + return output.FLB_RETRY + } - defer resp.Body.Close() - numContainerLogRecords = len(dataItems) - Log("PostDataHelper::Info::Successfully flushed %d container log records to ODS in %s", numContainerLogRecords, elapsed) + defer resp.Body.Close() + numContainerLogRecords = loglinesCount + Log("PostDataHelper::Info::Successfully flushed %d %s records to ODS in %s", numContainerLogRecords, recordType, elapsed) } - } ContainerLogTelemetryMutex.Lock() defer ContainerLogTelemetryMutex.Unlock() @@ -1107,12 +1198,13 @@ func containsKey(currentMap map[string]bool, key string) bool { return c } -// 
GetContainerIDK8sNamespacePodNameFromFileName Gets the container ID, k8s namespace and pod name From the file Name +// GetContainerIDK8sNamespacePodNameFromFileName Gets the container ID, k8s namespace, pod name and containername From the file Name // sample filename kube-proxy-dgcx7_kube-system_kube-proxy-8df7e49e9028b60b5b0d0547f409c455a9567946cf763267b7e6fa053ab8c182.log -func GetContainerIDK8sNamespacePodNameFromFileName(filename string) (string, string, string) { +func GetContainerIDK8sNamespacePodNameFromFileName(filename string) (string, string, string, string) { id := "" ns := "" podName := "" + containerName := "" start := strings.LastIndex(filename, "-") end := strings.LastIndex(filename, ".") @@ -1132,6 +1224,15 @@ func GetContainerIDK8sNamespacePodNameFromFileName(filename string) (string, str ns = filename[start+1 : end] } + start = strings.LastIndex(filename, "_") + end = strings.LastIndex(filename, "-") + + if start >= end || start == -1 || end == -1 { + containerName = "" + } else { + containerName = filename[start+1 : end] + } + start = strings.Index(filename, "/containers/") end = strings.Index(filename, "_") @@ -1141,7 +1242,7 @@ func GetContainerIDK8sNamespacePodNameFromFileName(filename string) (string, str podName = filename[(start + len("/containers/")):end] } - return id, ns, podName + return id, ns, podName, containerName } // InitializePlugin reads and populates plugin configuration @@ -1313,8 +1414,8 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { CreateHTTPClient() - ContainerLogsRoute := strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOGS_ROUTE"))) - Log("AZMON_CONTAINER_LOGS_ROUTE:%s", ContainerLogsRoute) + ContainerLogsRoute := strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE"))) + Log("AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE:%s", ContainerLogsRoute) ContainerLogsRouteV2 = false //default is ODS ContainerLogsRouteADX = false //default is LA @@ -1362,10 +1463,22 @@ func 
InitializePlugin(pluginConfPath string, agentVersion string) { CreateADXClient() } + ContainerLogSchemaVersion := strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOG_SCHEMA_VERSION"))) + Log("AZMON_CONTAINER_LOG_SCHEMA_VERSION:%s", ContainerLogSchemaVersion) + + ContainerLogSchemaV2 = false //default is v1 schema + + if strings.Compare(ContainerLogSchemaVersion, ContainerLogV2SchemaVersion) == 0 && ContainerLogsRouteADX != true { + ContainerLogSchemaV2 = true + Log("Container logs schema=%s", ContainerLogV2SchemaVersion) + fmt.Fprintf(os.Stdout, "Container logs schema=%s... \n", ContainerLogV2SchemaVersion) + } + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { populateExcludedStdoutNamespaces() populateExcludedStderrNamespaces() - if enrichContainerLogs == true { + //enrichment not applicable for ADX and v2 schema + if enrichContainerLogs == true && ContainerLogsRouteADX != true && ContainerLogSchemaV2 != true { Log("ContainerLogEnrichment=true; starting goroutine to update containerimagenamemaps \n") go updateContainerImageNameMaps() } else { @@ -1378,4 +1491,4 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("Running in replicaset. 
Disabling container enrichment caching & updates \n") } -} +} \ No newline at end of file diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go index 3d30ac5aa..461fdea96 100644 --- a/source/plugins/go/src/telemetry.go +++ b/source/plugins/go/src/telemetry.go @@ -10,9 +10,9 @@ import ( "strings" "time" + "github.com/fluent/fluent-bit-go/output" "github.com/microsoft/ApplicationInsights-Go/appinsights" "github.com/microsoft/ApplicationInsights-Go/appinsights/contracts" - "github.com/fluent/fluent-bit-go/output" ) var ( @@ -44,33 +44,45 @@ var ( ContainerLogsMDSDClientCreateErrors float64 //Tracks the number of write/send errors to ADX for containerlogs (uses ContainerLogTelemetryTicker) ContainerLogsSendErrorsToADXFromFluent float64 - //Tracks the number of ADX client create errors for containerlogs (uses ContainerLogTelemetryTicker) + //Tracks the number of ADX client create errors for containerlogs (uses ContainerLogTelemetryTicker) ContainerLogsADXClientCreateErrors float64 + //Tracks the number of OSM namespaces and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + OSMNamespaceCount int + //Tracks whether monitor kubernetes pods is set to true and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + PromMonitorPods string + //Tracks the number of monitor kubernetes pods namespaces and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + PromMonitorPodsNamespaceLength int + //Tracks the number of monitor kubernetes pods label selectors and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + PromMonitorPodsLabelSelectorLength int + //Tracks the number of monitor kubernetes pods field selectors and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + PromMonitorPodsFieldSelectorLength int ) const ( - clusterTypeACS = "ACS" - clusterTypeAKS = "AKS" - envAKSResourceID = "AKS_RESOURCE_ID" - envACSResourceName = "ACS_RESOURCE_NAME" - 
envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" - envAppInsightsEndpoint = "APPLICATIONINSIGHTS_ENDPOINT" - metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" - metricNameAvgLogGenerationRate = "ContainerLogsGeneratedPerSec" - metricNameLogSize = "ContainerLogsSize" - metricNameAgentLogProcessingMaxLatencyMs = "ContainerLogsAgentSideLatencyMs" - metricNameNumberofTelegrafMetricsSentSuccessfully = "TelegrafMetricsSentCount" - metricNameNumberofSendErrorsTelegrafMetrics = "TelegrafMetricsSendErrorCount" - metricNameNumberofSend429ErrorsTelegrafMetrics = "TelegrafMetricsSend429ErrorCount" - metricNameErrorCountContainerLogsSendErrorsToMDSDFromFluent = "ContainerLogs2MdsdSendErrorCount" - metricNameErrorCountContainerLogsMDSDClientCreateError = "ContainerLogsMdsdClientCreateErrorCount" - metricNameErrorCountContainerLogsSendErrorsToADXFromFluent = "ContainerLogs2ADXSendErrorCount" - metricNameErrorCountContainerLogsADXClientCreateError = "ContainerLogsADXClientCreateErrorCount" + clusterTypeACS = "ACS" + clusterTypeAKS = "AKS" + envAKSResourceID = "AKS_RESOURCE_ID" + envACSResourceName = "ACS_RESOURCE_NAME" + envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" + envAppInsightsEndpoint = "APPLICATIONINSIGHTS_ENDPOINT" + metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" + metricNameAvgLogGenerationRate = "ContainerLogsGeneratedPerSec" + metricNameLogSize = "ContainerLogsSize" + metricNameAgentLogProcessingMaxLatencyMs = "ContainerLogsAgentSideLatencyMs" + metricNameNumberofTelegrafMetricsSentSuccessfully = "TelegrafMetricsSentCount" + metricNameNumberofSendErrorsTelegrafMetrics = "TelegrafMetricsSendErrorCount" + metricNameNumberofSend429ErrorsTelegrafMetrics = "TelegrafMetricsSend429ErrorCount" + metricNameErrorCountContainerLogsSendErrorsToMDSDFromFluent = "ContainerLogs2MdsdSendErrorCount" + metricNameErrorCountContainerLogsMDSDClientCreateError = "ContainerLogsMdsdClientCreateErrorCount" + 
metricNameErrorCountContainerLogsSendErrorsToADXFromFluent = "ContainerLogs2ADXSendErrorCount" + metricNameErrorCountContainerLogsADXClientCreateError = "ContainerLogsADXClientCreateErrorCount" defaultTelemetryPushIntervalSeconds = 300 - eventNameContainerLogInit = "ContainerLogPluginInitialized" - eventNameDaemonSetHeartbeat = "ContainerLogDaemonSetHeartbeatEvent" + eventNameContainerLogInit = "ContainerLogPluginInitialized" + eventNameDaemonSetHeartbeat = "ContainerLogDaemonSetHeartbeatEvent" + eventNameCustomPrometheusSidecarHeartbeat = "CustomPrometheusSidecarHeartbeatEvent" + eventNameWindowsFluentBitHeartbeat = "WindowsFluentBitHeartbeatEvent" ) // SendContainerLogPluginMetrics is a go-routine that flushes the data periodically (every 5 mins to App Insights) @@ -100,6 +112,11 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { containerLogsMDSDClientCreateErrors := ContainerLogsMDSDClientCreateErrors containerLogsSendErrorsToADXFromFluent := ContainerLogsSendErrorsToADXFromFluent containerLogsADXClientCreateErrors := ContainerLogsADXClientCreateErrors + osmNamespaceCount := OSMNamespaceCount + promMonitorPods := PromMonitorPods + promMonitorPodsNamespaceLength := PromMonitorPodsNamespaceLength + promMonitorPodsLabelSelectorLength := PromMonitorPodsLabelSelectorLength + promMonitorPodsFieldSelectorLength := PromMonitorPodsFieldSelectorLength TelegrafMetricsSentCount = 0.0 TelegrafMetricsSendErrorCount = 0.0 @@ -118,17 +135,37 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { ContainerLogTelemetryMutex.Unlock() if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { - SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) - flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) - TelemetryClient.Track(flushRateMetric) - logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) - logSizeMetric := 
appinsights.NewMetricTelemetry(metricNameLogSize, logSizeRate) - TelemetryClient.Track(logRateMetric) - Log("Log Size Rate: %f\n", logSizeRate) - TelemetryClient.Track(logSizeMetric) - logLatencyMetric := appinsights.NewMetricTelemetry(metricNameAgentLogProcessingMaxLatencyMs, logLatencyMs) - logLatencyMetric.Properties["Container"] = logLatencyMsContainer - TelemetryClient.Track(logLatencyMetric) + if strings.Compare(strings.ToLower(os.Getenv("CONTAINER_TYPE")), "prometheussidecar") == 0 { + telemetryDimensions := make(map[string]string) + telemetryDimensions["CustomPromMonitorPods"] = promMonitorPods + if promMonitorPodsNamespaceLength > 0 { + telemetryDimensions["CustomPromMonitorPodsNamespaceLength"] = strconv.Itoa(promMonitorPodsNamespaceLength) + } + if promMonitorPodsLabelSelectorLength > 0 { + telemetryDimensions["CustomPromMonitorPodsLabelSelectorLength"] = strconv.Itoa(promMonitorPodsLabelSelectorLength) + } + if promMonitorPodsFieldSelectorLength > 0 { + telemetryDimensions["CustomPromMonitorPodsFieldSelectorLength"] = strconv.Itoa(promMonitorPodsFieldSelectorLength) + } + if osmNamespaceCount > 0 { + telemetryDimensions["OsmNamespaceCount"] = strconv.Itoa(osmNamespaceCount) + } + + SendEvent(eventNameCustomPrometheusSidecarHeartbeat, telemetryDimensions) + + } else { + SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) + flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) + TelemetryClient.Track(flushRateMetric) + logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) + logSizeMetric := appinsights.NewMetricTelemetry(metricNameLogSize, logSizeRate) + TelemetryClient.Track(logRateMetric) + Log("Log Size Rate: %f\n", logSizeRate) + TelemetryClient.Track(logSizeMetric) + logLatencyMetric := appinsights.NewMetricTelemetry(metricNameAgentLogProcessingMaxLatencyMs, logLatencyMs) + logLatencyMetric.Properties["Container"] = logLatencyMsContainer + 
TelemetryClient.Track(logLatencyMetric) + } } TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameNumberofTelegrafMetricsSentSuccessfully, telegrafMetricsSentCount)) if telegrafMetricsSendErrorCount > 0.0 { @@ -255,12 +292,60 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { } if isProxyConfigured == true { - CommonProperties["IsProxyConfigured"] = "true" + CommonProperties["IsProxyConfigured"] = "true" } else { - CommonProperties["IsProxyConfigured"] = "false" - } + CommonProperties["IsProxyConfigured"] = "false" + } + + // Adding container type to telemetry + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { + if strings.Compare(strings.ToLower(os.Getenv("CONTAINER_TYPE")), "prometheussidecar") == 0 { + CommonProperties["ContainerType"] = "prometheussidecar" + } + } TelemetryClient.Context().CommonProperties = CommonProperties + + // Getting the namespace count, monitor kubernetes pods values and namespace count once at start because it wont change unless the configmap is applied and the container is restarted + + OSMNamespaceCount = 0 + osmNsCount := os.Getenv("TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT") + if osmNsCount != "" { + OSMNamespaceCount, err = strconv.Atoi(osmNsCount) + if err != nil { + Log("OSM namespace count string to int conversion error %s", err.Error()) + } + } + + PromMonitorPods = os.Getenv("TELEMETRY_CUSTOM_PROM_MONITOR_PODS") + + PromMonitorPodsNamespaceLength = 0 + promMonPodsNamespaceLength := os.Getenv("TELEMETRY_CUSTOM_PROM_MONITOR_PODS_NS_LENGTH") + if promMonPodsNamespaceLength != "" { + PromMonitorPodsNamespaceLength, err = strconv.Atoi(promMonPodsNamespaceLength) + if err != nil { + Log("Custom prometheus monitor kubernetes pods namespace count string to int conversion error %s", err.Error()) + } + } + + PromMonitorPodsLabelSelectorLength = 0 + promLabelSelectorLength := os.Getenv("TELEMETRY_CUSTOM_PROM_LABEL_SELECTOR_LENGTH") + if promLabelSelectorLength != "" { 
+ PromMonitorPodsLabelSelectorLength, err = strconv.Atoi(promLabelSelectorLength) + if err != nil { + Log("Custom prometheus label selector count string to int conversion error %s", err.Error()) + } + } + + PromMonitorPodsFieldSelectorLength = 0 + promFieldSelectorLength := os.Getenv("TELEMETRY_CUSTOM_PROM_FIELD_SELECTOR_LENGTH") + if promFieldSelectorLength != "" { + PromMonitorPodsFieldSelectorLength, err = strconv.Atoi(promFieldSelectorLength) + if err != nil { + Log("Custom prometheus field selector count string to int conversion error %s", err.Error()) + } + } + return 0, nil } diff --git a/source/plugins/go/src/utils.go b/source/plugins/go/src/utils.go index 8b1a3df65..61d047e52 100644 --- a/source/plugins/go/src/utils.go +++ b/source/plugins/go/src/utils.go @@ -145,7 +145,7 @@ func CreateADXClient() { //log.Fatalf("Unable to create ADX connection %s", err.Error()) } else { Log("Successfully created ADX Client. Creating Ingestor...") - ingestor, ingestorErr := ingest.New(client, "containerinsights", "ContainerLog") + ingestor, ingestorErr := ingest.New(client, "containerinsights", "ContainerLogV2") if ingestorErr != nil { Log("Error::mdsd::Unable to create ADX ingestor %s", ingestorErr.Error()) } else { diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 13796cd1e..8cb6f603e 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -20,10 +20,12 @@ class CAdvisorMetricsAPIClient @clusterEnvVarCollectionEnabled = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] @clusterStdErrLogCollectionEnabled = ENV["AZMON_COLLECT_STDERR_LOGS"] @clusterStdOutLogCollectionEnabled = ENV["AZMON_COLLECT_STDOUT_LOGS"] + @pvKubeSystemCollectionMetricsEnabled = ENV["AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS"] @clusterLogTailExcludPath = ENV["AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH"] @clusterLogTailPath = ENV["AZMON_LOG_TAIL_PATH"] @clusterAgentSchemaVersion = 
ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] @clusterContainerLogEnrich = ENV["AZMON_CLUSTER_CONTAINER_LOG_ENRICH"] + @clusterContainerLogSchemaVersion = ENV["AZMON_CONTAINER_LOG_SCHEMA_VERSION"] @dsPromInterval = ENV["TELEMETRY_DS_PROM_INTERVAL"] @dsPromFieldPassCount = ENV["TELEMETRY_DS_PROM_FIELDPASS_LENGTH"] @@ -53,6 +55,7 @@ class CAdvisorMetricsAPIClient @@winNodePrevMetricRate = {} @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i + @@telemetryPVKubeSystemMetricsTimeTracker = DateTime.now.to_time.to_i #Containers a hash of node name and the last time telemetry was sent for this node @@nodeTelemetryTimeTracker = {} @@ -63,6 +66,7 @@ class CAdvisorMetricsAPIClient #cadvisor ports @@CADVISOR_SECURE_PORT = "10250" @@CADVISOR_NON_SECURE_PORT = "10255" + def initialize end @@ -83,40 +87,40 @@ def getPodsFromCAdvisor(winNode: nil) end def getBaseCAdvisorUri(winNode) - cAdvisorSecurePort = isCAdvisorOnSecurePort() + cAdvisorSecurePort = isCAdvisorOnSecurePort() + + if !!cAdvisorSecurePort == true + defaultHost = "https://localhost:#{@@CADVISOR_SECURE_PORT}" + else + defaultHost = "http://localhost:#{@@CADVISOR_NON_SECURE_PORT}" + end + if !winNode.nil? + nodeIP = winNode["InternalIP"] + else + nodeIP = ENV["NODE_IP"] + end + + if !nodeIP.nil? + @Log.info("Using #{nodeIP} for CAdvisor Host") if !!cAdvisorSecurePort == true - defaultHost = "https://localhost:#{@@CADVISOR_SECURE_PORT}" + return "https://#{nodeIP}:#{@@CADVISOR_SECURE_PORT}" else - defaultHost = "http://localhost:#{@@CADVISOR_NON_SECURE_PORT}" + return "http://#{nodeIP}:#{@@CADVISOR_NON_SECURE_PORT}" end - + else + @Log.warn ("NODE_IP environment variable not set. Using default as : #{defaultHost}") if !winNode.nil? - nodeIP = winNode["InternalIP"] - else - nodeIP = ENV["NODE_IP"] - end - - if !nodeIP.nil? 
- @Log.info("Using #{nodeIP} for CAdvisor Host") - if !!cAdvisorSecurePort == true - return "https://#{nodeIP}:#{@@CADVISOR_SECURE_PORT}" - else - return "http://#{nodeIP}:#{@@CADVISOR_NON_SECURE_PORT}" - end + return nil else - @Log.warn ("NODE_IP environment variable not set. Using default as : #{defaultHost}") - if !winNode.nil? - return nil - else - return defaultHost - end + return defaultHost end + end end def getCAdvisorUri(winNode, relativeUri) - baseUri = getBaseCAdvisorUri(winNode) - return baseUri + relativeUri + baseUri = getBaseCAdvisorUri(winNode) + return baseUri + relativeUri end def getMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) @@ -245,20 +249,26 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met telemetryProps["dsPromFDC"] = @dsPromFieldDropCount telemetryProps["dsPromUrl"] = @dsPromUrlCount end - #telemetry about containerlogs Routing for daemonset - if (!@containerLogsRoute.nil? && !@containerLogsRoute.empty?) + #telemetry about containerlog Routing for daemonset + if File.exist?(Constants::AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2_FILENAME) + telemetryProps["containerLogsRoute"] = "v2" + elsif (!@containerLogsRoute.nil? && !@containerLogsRoute.empty?) telemetryProps["containerLogsRoute"] = @containerLogsRoute end - #telemetry about health model - if (!@hmEnabled.nil? && !@hmEnabled.empty?) + #telemetry about health model + if (!@hmEnabled.nil? && !@hmEnabled.empty?) telemetryProps["hmEnabled"] = @hmEnabled - end - #telemetry for npm integration - if (!@npmIntegrationAdvanced.nil? && !@npmIntegrationAdvanced.empty?) - telemetryProps["int-npm-a"] = "1" - elsif (!@npmIntegrationBasic.nil? && !@npmIntegrationBasic.empty?) - telemetryProps["int-npm-b"] = "1" - end + end + #telemetry for npm integration + if (!@npmIntegrationAdvanced.nil? && !@npmIntegrationAdvanced.empty?) + telemetryProps["int-npm-a"] = "1" + elsif (!@npmIntegrationBasic.nil? && !@npmIntegrationBasic.empty?) 
+ telemetryProps["int-npm-b"] = "1" + end + #telemetry for Container log schema version clusterContainerLogSchemaVersion + if (!@clusterContainerLogSchemaVersion.nil? && !@clusterContainerLogSchemaVersion.empty?) + telemetryProps["containerLogVer"] = @clusterContainerLogSchemaVersion + end ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) end end @@ -299,8 +309,10 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) end if !metricInfo.nil? metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryTotal", "containerGpumemoryTotalBytes", metricTime)) - metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed","containerGpumemoryUsedBytes", metricTime)) - metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle","containerGpuDutyCycle", metricTime)) + metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed", "containerGpumemoryUsedBytes", metricTime)) + metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle", "containerGpuDutyCycle", metricTime)) + + metricDataItems.concat(getPersistentVolumeMetrics(metricInfo, hostName, "usedBytes", Constants::PV_USED_BYTES, metricTime)) else @Log.warn("Couldn't get Insights metrics information for host: #{hostName} os:#{operatingSystem}") end @@ -311,6 +323,78 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) return metricDataItems end + def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime) + telemetryTimeDifference = (DateTime.now.to_time.to_i - @@telemetryPVKubeSystemMetricsTimeTracker).abs + telemetryTimeDifferenceInMinutes = telemetryTimeDifference / 60 + + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + clusterName = KubernetesApiClient.getClusterName + begin + metricInfo = 
metricJSON + metricInfo["pods"].each do |pod| + podNamespace = pod["podRef"]["namespace"] + excludeNamespace = false + if (podNamespace.downcase == "kube-system") && @pvKubeSystemCollectionMetricsEnabled == "false" + excludeNamespace = true + end + + if (!excludeNamespace && !pod["volume"].nil?) + pod["volume"].each do |volume| + if (!volume["pvcRef"].nil?) + pvcRef = volume["pvcRef"] + if (!pvcRef["name"].nil?) + + # A PVC exists on this volume + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] + pvcName = pvcRef["name"] + pvcNamespace = pvcRef["namespace"] + + metricItem = {} + metricItem["CollectionTime"] = metricPollTime + metricItem["Computer"] = hostName + metricItem["Name"] = metricNameToReturn + metricItem["Value"] = volume[metricNameToCollect] + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_UID] = podUid + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = podName + metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName + metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] = pvcNamespace + metricTags[Constants::INSIGHTSMETRICS_TAGS_VOLUME_NAME] = volume["name"] + metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) + end + end + end + end + end + rescue => errorStr + @Log.warn("getPersistentVolumeMetrics failed: #{errorStr} for metric #{metricNameToCollect}") + return metricItems + end + + # If kube-system metrics collection enabled, send telemetry + begin + if telemetryTimeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES && @pvKubeSystemCollectionMetricsEnabled == "true" + 
ApplicationInsightsUtility.sendCustomEvent(Constants::PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT, {}) + @@telemetryPVKubeSystemMetricsTimeTracker = DateTime.now.to_time.to_i + end + rescue => errorStr + @Log.warn("getPersistentVolumeMetrics kube-system metrics enabled telemetry failed: #{errorStr}") + end + + return metricItems + end + def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCollect, metricNametoReturn, metricPollTime) metricItems = [] clusterId = KubernetesApiClient.getClusterId @@ -330,18 +414,17 @@ def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCo if (!accelerator[metricNameToCollect].nil?) #empty check is invalid for non-strings containerName = container["name"] metricValue = accelerator[metricNameToCollect] - metricItem = {} metricItem["CollectionTime"] = metricPollTime metricItem["Computer"] = hostName metricItem["Name"] = metricNametoReturn metricItem["Value"] = metricValue - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE - + metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNameSpace @@ -357,9 +440,9 @@ def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCo if (!accelerator["id"].nil? && !accelerator["id"].empty?) 
metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_ID] = accelerator["id"] end - + metricItem["Tags"] = metricTags - + metricItems.push(metricItem) end end @@ -836,13 +919,13 @@ def getResponse(winNode, relativeUri) uri = URI.parse(cAdvisorUri) if isCAdvisorOnSecurePort() Net::HTTP.start(uri.host, uri.port, - :use_ssl => true, :open_timeout => 20, :read_timeout => 40, - :ca_file => "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt", - :verify_mode => OpenSSL::SSL::VERIFY_NONE) do |http| - cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) - cAdvisorApiRequest["Authorization"] = "Bearer #{bearerToken}" - response = http.request(cAdvisorApiRequest) - @Log.info "Got response code #{response.code} from #{uri.request_uri}" + :use_ssl => true, :open_timeout => 20, :read_timeout => 40, + :ca_file => "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt", + :verify_mode => OpenSSL::SSL::VERIFY_NONE) do |http| + cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) + cAdvisorApiRequest["Authorization"] = "Bearer #{bearerToken}" + response = http.request(cAdvisorApiRequest) + @Log.info "Got response code #{response.code} from #{uri.request_uri}" end else Net::HTTP.start(uri.host, uri.port, :use_ssl => false, :open_timeout => 20, :read_timeout => 40) do |http| @@ -855,19 +938,24 @@ def getResponse(winNode, relativeUri) rescue => error @Log.warn("CAdvisor api request for #{cAdvisorUri} failed: #{error}") telemetryProps = {} - telemetryProps["Computer"] = winNode["Hostname"] + if !winNode.nil? + hostName = winNode["Hostname"] + else + hostName = (OMS::Common.get_hostname) + end + telemetryProps["Computer"] = hostName ApplicationInsightsUtility.sendExceptionTelemetry(error, telemetryProps) end return response end def isCAdvisorOnSecurePort - cAdvisorSecurePort = false - # Check to see whether omsagent needs to use 10255(insecure) port or 10250(secure) port - if !@cAdvisorMetricsSecurePort.nil? 
&& @cAdvisorMetricsSecurePort == "true" - cAdvisorSecurePort = true - end - return cAdvisorSecurePort + cAdvisorSecurePort = false + # Check to see whether omsagent needs to use 10255(insecure) port or 10250(secure) port + if !@cAdvisorMetricsSecurePort.nil? && @cAdvisorMetricsSecurePort == "true" + cAdvisorSecurePort = true + end + return cAdvisorSecurePort end end end diff --git a/source/plugins/ruby/CustomMetricsUtils.rb b/source/plugins/ruby/CustomMetricsUtils.rb index a19580630..220313e6b 100644 --- a/source/plugins/ruby/CustomMetricsUtils.rb +++ b/source/plugins/ruby/CustomMetricsUtils.rb @@ -6,21 +6,15 @@ def initialize end class << self - def check_custom_metrics_availability(custom_metric_regions) + def check_custom_metrics_availability aks_region = ENV['AKS_REGION'] aks_resource_id = ENV['AKS_RESOURCE_ID'] + aks_cloud_environment = ENV['CLOUD_ENVIRONMENT'] if aks_region.to_s.empty? || aks_resource_id.to_s.empty? return false # This will also take care of AKS-Engine Scenario. AKS_REGION/AKS_RESOURCE_ID is not set for AKS-Engine. 
Only ACS_RESOURCE_NAME is set end - custom_metrics_regions_arr = custom_metric_regions.split(',') - custom_metrics_regions_hash = custom_metrics_regions_arr.map {|x| [x.downcase,true]}.to_h - - if custom_metrics_regions_hash.key?(aks_region.downcase) - true - else - false - end + return aks_cloud_environment.to_s.downcase == 'public' end end end \ No newline at end of file diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 36dcdd8c6..98347d272 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -31,6 +31,8 @@ class KubernetesApiClient @@TokenStr = nil @@NodeMetrics = Hash.new @@WinNodeArray = [] + @@telemetryTimeTracker = DateTime.now.to_time.to_i + @@resourceLimitsTelemetryHash = {} def initialize end @@ -172,6 +174,10 @@ def isAROV3Cluster return @@IsAROV3Cluster end + def isAROv3MasterOrInfraPod(nodeName) + return isAROV3Cluster() && (!nodeName.nil? && (nodeName.downcase.start_with?("infra-") || nodeName.downcase.start_with?("master-"))) + end + def isNodeMaster return @@IsNodeMaster if !@@IsNodeMaster.nil? 
@@IsNodeMaster = false @@ -276,7 +282,8 @@ def getPods(namespace) def getWindowsNodes winNodes = [] begin - resourceUri = getNodesResourceUri("nodes") + # get only windows nodes + resourceUri = getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows") nodeInventory = JSON.parse(getKubeResourceInfo(resourceUri).body) @Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api" # Resetting the windows node cache @@ -396,42 +403,97 @@ def getPodUid(podNameSpace, podMetadata) return podUid end - def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItems = [] + timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 begin clusterId = getClusterId - metricInfo = metricJSON - metricInfo["items"].each do |pod| - podNameSpace = pod["metadata"]["namespace"] - podUid = getPodUid(podNameSpace, pod["metadata"]) - if podUid.nil? - next - end - - # For ARO, skip the pods scheduled on to master or infra nodes to ingest - if isAROV3Cluster() && !pod["spec"].nil? && !pod["spec"]["nodeName"].nil? && - (pod["spec"]["nodeName"].downcase.start_with?("infra-") || - pod["spec"]["nodeName"].downcase.start_with?("master-")) - next - end + podNameSpace = pod["metadata"]["namespace"] + podName = pod["metadata"]["name"] + podUid = getPodUid(podNameSpace, pod["metadata"]) + if podUid.nil? + return metricItems + end - podContainers = [] - if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty? - podContainers = podContainers + pod["spec"]["containers"] - end - # Adding init containers to the record list as well. - if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty? 
- podContainers = podContainers + pod["spec"]["initContainers"] - end + nodeName = "" + #for unscheduled (non-started) pods nodeName does NOT exist + if !pod["spec"]["nodeName"].nil? + nodeName = pod["spec"]["nodeName"] + end + # For ARO, skip the pods scheduled on to master or infra nodes to ingest + if isAROv3MasterOrInfraPod(nodeName) + return metricItems + end - if (!podContainers.nil? && !podContainers.empty? && !pod["spec"]["nodeName"].nil?) - nodeName = pod["spec"]["nodeName"] - podContainers.each do |container| - containerName = container["name"] - #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) - metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) + podContainers = [] + if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty? + podContainers = podContainers + pod["spec"]["containers"] + end + # Adding init containers to the record list as well. + if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty? + podContainers = podContainers + pod["spec"]["initContainers"] + end + if (!podContainers.nil? && !podContainers.empty? && !pod["spec"]["nodeName"].nil?) + podContainers.each do |container| + containerName = container["name"] + #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) 
+ metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) + + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = nodeName + # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent + metricProps["Computer"] = nodeName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + #Telemetry about omsagent requests and limits + begin + if (podName.downcase.start_with?("omsagent-") && podNameSpace.eql?("kube-system") && containerName.downcase.start_with?("omsagent")) + nodePodContainerKey = [nodeName, podName, containerName, metricNametoReturn].join("~~") + @@resourceLimitsTelemetryHash[nodePodContainerKey] = metricValue + end + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + @@resourceLimitsTelemetryHash.each { |key, value| + keyElements = key.split("~~") + if keyElements.length != 4 + next + end + + # get dimension values by key + telemetryProps = {} + telemetryProps["Computer"] = keyElements[0] + telemetryProps["PodName"] = keyElements[1] + telemetryProps["ContainerName"] = keyElements[2] + metricNameFromKey = keyElements[3] + ApplicationInsightsUtility.sendMetricTelemetry(metricNameFromKey, value, telemetryProps) + } + @@telemetryTimeTracker = DateTime.now.to_time.to_i + @@resourceLimitsTelemetryHash = {} + end + rescue => errorStr + $log.warn("Exception while generating Telemetry from getContainerResourceRequestsAndLimits failed: #{errorStr} for metric #{metricNameToCollect}") + end + #No 
container level limit for the given metric, so default to node level limit + else + nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect + if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) + metricValue = @@NodeMetrics[nodeMetricsHashKey] + #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ") metricItem = {} metricItem["DataItems"] = [] @@ -451,32 +513,6 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) - #No container level limit for the given metric, so default to node level limit - else - nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect - if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) - metricValue = @@NodeMetrics[nodeMetricsHashKey] - #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ") - metricItem = {} - metricItem["DataItems"] = [] - - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = nodeName - # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent - metricProps["Computer"] = nodeName - metricProps["ObjectName"] = "K8SContainer" - metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName - - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) - metricItems.push(metricItem) - end end end end @@ -488,78 +524,74 @@ def 
getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName return metricItems end #getContainerResourceRequestAndLimits - def getContainerResourceRequestsAndLimitsAsInsightsMetrics(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + def getContainerResourceRequestsAndLimitsAsInsightsMetrics(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItems = [] begin clusterId = getClusterId clusterName = getClusterName - - metricInfo = metricJSON - metricInfo["items"].each do |pod| - podNameSpace = pod["metadata"]["namespace"] - if podNameSpace.eql?("kube-system") && !pod["metadata"].key?("ownerReferences") - # The above case seems to be the only case where you have horizontal scaling of pods - # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash - # instead of the actual poduid. Since this uid is not being surface into the UX - # its ok to use this. - # Use kubernetes.io/config.hash to be able to correlate with cadvisor data - if pod["metadata"]["annotations"].nil? - next - else - podUid = pod["metadata"]["annotations"]["kubernetes.io/config.hash"] - end + podNameSpace = pod["metadata"]["namespace"] + if podNameSpace.eql?("kube-system") && !pod["metadata"].key?("ownerReferences") + # The above case seems to be the only case where you have horizontal scaling of pods + # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash + # instead of the actual poduid. Since this uid is not being surface into the UX + # its ok to use this. + # Use kubernetes.io/config.hash to be able to correlate with cadvisor data + if pod["metadata"]["annotations"].nil? + return metricItems else - podUid = pod["metadata"]["uid"] + podUid = pod["metadata"]["annotations"]["kubernetes.io/config.hash"] end + else + podUid = pod["metadata"]["uid"] + end - podContainers = [] - if !pod["spec"]["containers"].nil? 
&& !pod["spec"]["containers"].empty? - podContainers = podContainers + pod["spec"]["containers"] - end - # Adding init containers to the record list as well. - if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty? - podContainers = podContainers + pod["spec"]["initContainers"] - end + podContainers = [] + if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty? + podContainers = podContainers + pod["spec"]["containers"] + end + # Adding init containers to the record list as well. + if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty? + podContainers = podContainers + pod["spec"]["initContainers"] + end - if (!podContainers.nil? && !podContainers.empty?) - if (!pod["spec"]["nodeName"].nil?) - nodeName = pod["spec"]["nodeName"] + if (!podContainers.nil? && !podContainers.empty?) + if (!pod["spec"]["nodeName"].nil?) + nodeName = pod["spec"]["nodeName"] + else + nodeName = "" #unscheduled pod. We still want to collect limits & requests for GPU + end + podContainers.each do |container| + metricValue = nil + containerName = container["name"] + #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) + metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) else - nodeName = "" #unscheduled pod. We still want to collect limits & requests for GPU - end - podContainers.each do |container| - metricValue = nil - containerName = container["name"] - #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) 
- metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) - else - #No container level limit for the given metric, so default to node level limit for non-gpu metrics - if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") - nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect - metricValue = @@NodeMetrics[nodeMetricsHashKey] - end - end - if (!metricValue.nil?) - metricItem = {} - metricItem["CollectionTime"] = metricTime - metricItem["Computer"] = nodeName - metricItem["Name"] = metricNametoReturn - metricItem["Value"] = metricValue - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE - - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName - #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNameSpace - - metricItem["Tags"] = metricTags - - metricItems.push(metricItem) + #No container level limit for the given metric, so default to node level limit for non-gpu metrics + if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") + nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect + metricValue = @@NodeMetrics[nodeMetricsHashKey] end end + if (!metricValue.nil?) 
+ metricItem = {} + metricItem["CollectionTime"] = metricTime + metricItem["Computer"] = nodeName + metricItem["Name"] = metricNametoReturn + metricItem["Value"] = metricValue + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName + #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNameSpace + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) + end end end rescue => error @@ -578,32 +610,9 @@ def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNamet #if we are coming up with the time it should be same for all nodes #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z metricInfo["items"].each do |node| - if (!node["status"][metricCategory].nil?) 
- - # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" - metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) - - metricItem = {} - metricItem["DataItems"] = [] - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = node["metadata"]["name"] - # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent - metricProps["Computer"] = node["metadata"]["name"] - metricProps["ObjectName"] = "K8SNode" - metricProps["InstanceName"] = clusterId + "/" + node["metadata"]["name"] - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) + metricItem = parseNodeLimitsFromNodeItem(node, metricCategory, metricNameToCollect, metricNametoReturn, metricTime) + if !metricItem.nil? && !metricItem.empty? metricItems.push(metricItem) - #push node level metrics to a inmem hash so that we can use it looking up at container level. 
- #Currently if container level cpu & memory limits are not defined we default to node level limits - @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue - #@Log.info ("Node metric hash: #{@@NodeMetrics}") end end rescue => error @@ -612,49 +621,82 @@ def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNamet return metricItems end #parseNodeLimits - def parseNodeLimitsAsInsightsMetrics(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) - metricItems = [] + def parseNodeLimitsFromNodeItem(node, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + metricItem = {} begin - metricInfo = metricJSON clusterId = getClusterId - clusterName = getClusterName #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics, #if we are coming up with the time it should be same for all nodes #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - metricInfo["items"].each do |node| - if (!node["status"][metricCategory].nil?) && (!node["status"][metricCategory][metricNameToCollect].nil?) 
- - # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" or "amd.com/gpu" or "nvidia.com/gpu" - metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) - - metricItem = {} - metricItem["CollectionTime"] = metricTime - metricItem["Computer"] = node["metadata"]["name"] - metricItem["Name"] = metricNametoReturn - metricItem["Value"] = metricValue - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE - - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = metricNameToCollect - - metricItem["Tags"] = metricTags + if (!node["status"][metricCategory].nil?) && (!node["status"][metricCategory][metricNameToCollect].nil?) + # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" + metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) + + metricItem["DataItems"] = [] + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = node["metadata"]["name"] + # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent + metricProps["Computer"] = node["metadata"]["name"] + metricProps["ObjectName"] = "K8SNode" + metricProps["InstanceName"] = clusterId + "/" + node["metadata"]["name"] + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + + #push node level metrics to a inmem hash so that we can use it looking up at container level. 
+ #Currently if container level cpu & memory limits are not defined we default to node level limits + @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue + #@Log.info ("Node metric hash: #{@@NodeMetrics}") + end + rescue => error + @Log.warn("parseNodeLimitsFromNodeItem failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") + end + return metricItem + end #parseNodeLimitsFromNodeItem - metricItems.push(metricItem) - #push node level metrics (except gpu ones) to a inmem hash so that we can use it looking up at container level. - #Currently if container level cpu & memory limits are not defined we default to node level limits - if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") - @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue - #@Log.info ("Node metric hash: #{@@NodeMetrics}") - end + def parseNodeLimitsAsInsightsMetrics(node, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + metricItem = {} + begin + #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics, + #if we are coming up with the time it should be same for all nodes + #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + if (!node["status"][metricCategory].nil?) && (!node["status"][metricCategory][metricNameToCollect].nil?) 
+ clusterId = getClusterId + clusterName = getClusterName + + # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" or "amd.com/gpu" or "nvidia.com/gpu" + metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) + + metricItem["CollectionTime"] = metricTime + metricItem["Computer"] = node["metadata"]["name"] + metricItem["Name"] = metricNametoReturn + metricItem["Value"] = metricValue + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = metricNameToCollect + + metricItem["Tags"] = metricTags + + #push node level metrics (except gpu ones) to a inmem hash so that we can use it looking up at container level. 
+ #Currently if container level cpu & memory limits are not defined we default to node level limits + if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") + @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue + #@Log.info ("Node metric hash: #{@@NodeMetrics}") end end rescue => error @Log.warn("parseNodeLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") end - return metricItems + return metricItem end def getMetricNumericValue(metricName, metricVal) @@ -719,6 +761,9 @@ def getMetricNumericValue(metricName, metricVal) if (metricValue.end_with?("m")) metricValue.chomp!("m") metricValue = Float(metricValue) * 1000.0 ** 2 + elsif (metricValue.end_with?("k")) + metricValue.chomp!("k") + metricValue = Float(metricValue) * 1000.0 else #assuming no units specified, it is cores that we are converting to nanocores (the below conversion will fail for other unsupported 'units') metricValue = Float(metricValue) * 1000.0 ** 3 end @@ -774,5 +819,32 @@ def getKubeAPIServerUrl end return apiServerUrl end + + def getKubeServicesInventoryRecords(serviceList, batchTime = Time.utc.iso8601) + kubeServiceRecords = [] + begin + if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].nil? && !serviceList["items"].empty?) 
+ servicesCount = serviceList["items"].length + @Log.info("KubernetesApiClient::getKubeServicesInventoryRecords : number of services in serviceList #{servicesCount} @ #{Time.now.utc.iso8601}") + serviceList["items"].each do |item| + kubeServiceRecord = {} + kubeServiceRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + kubeServiceRecord["ServiceName"] = item["metadata"]["name"] + kubeServiceRecord["Namespace"] = item["metadata"]["namespace"] + kubeServiceRecord["SelectorLabels"] = [item["spec"]["selector"]] + # added these before emit to avoid memory foot print + # kubeServiceRecord["ClusterId"] = KubernetesApiClient.getClusterId + # kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName + kubeServiceRecord["ClusterIP"] = item["spec"]["clusterIP"] + kubeServiceRecord["ServiceType"] = item["spec"]["type"] + kubeServiceRecords.push(kubeServiceRecord.dup) + end + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getKubeServicesInventoryRecords:Failed with an error : #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + return kubeServiceRecords + end end end diff --git a/source/plugins/ruby/MdmAlertTemplates.rb b/source/plugins/ruby/MdmAlertTemplates.rb index 2e516a99d..e889c3f09 100644 --- a/source/plugins/ruby/MdmAlertTemplates.rb +++ b/source/plugins/ruby/MdmAlertTemplates.rb @@ -28,7 +28,7 @@ class MdmAlertTemplates } }' - Stable_job_metrics_template = ' + Stable_job_metrics_template = ' { "time": "%{timestamp}", "data": { @@ -45,7 +45,7 @@ class MdmAlertTemplates "dimValues": [ "%{controllerNameDimValue}", "%{namespaceDimValue}", - "6" + "%{jobCompletionThreshold}" ], "min": %{containerCountMetricValue}, "max": %{containerCountMetricValue}, @@ -90,6 +90,105 @@ class MdmAlertTemplates } }' + Container_resource_threshold_violation_template = ' + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": 
"insights.container/containers", + "dimNames": [ + "containerName", + "podName", + "controllerName", + "Kubernetes namespace", + "thresholdPercentage" + ], + "series": [ + { + "dimValues": [ + "%{containerNameDimValue}", + "%{podNameDimValue}", + "%{controllerNameDimValue}", + "%{namespaceDimValue}", + "%{thresholdPercentageDimValue}" + ], + "min": %{containerResourceThresholdViolated}, + "max": %{containerResourceThresholdViolated}, + "sum": %{containerResourceThresholdViolated}, + "count": 1 + } + ] + } + } + }' + + PV_resource_utilization_template = ' + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "insights.container/persistentvolumes", + "dimNames": [ + "podName", + "node", + "kubernetesNamespace", + "volumeName", + "thresholdPercentage" + ], + "series": [ + { + "dimValues": [ + "%{podNameDimValue}", + "%{computerNameDimValue}", + "%{namespaceDimValue}", + "%{volumeNameDimValue}", + "%{thresholdPercentageDimValue}" + ], + "min": %{pvResourceUtilizationPercentage}, + "max": %{pvResourceUtilizationPercentage}, + "sum": %{pvResourceUtilizationPercentage}, + "count": 1 + } + ] + } + } + }' + + PV_resource_threshold_violation_template = ' + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "insights.container/persistentvolumes", + "dimNames": [ + "podName", + "node", + "kubernetesNamespace", + "volumeName", + "thresholdPercentage" + ], + "series": [ + { + "dimValues": [ + "%{podNameDimValue}", + "%{computerNameDimValue}", + "%{namespaceDimValue}", + "%{volumeNameDimValue}", + "%{thresholdPercentageDimValue}" + ], + "min": %{pvResourceThresholdViolated}, + "max": %{pvResourceThresholdViolated}, + "sum": %{pvResourceThresholdViolated}, + "count": 1 + } + ] + } + } + }' + Node_resource_metrics_template = ' { "time": "%{timestamp}", diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 3d75dc6f4..6641456af 100644 
--- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -8,9 +8,11 @@ class MdmMetricsGenerator require_relative "MdmAlertTemplates" require_relative "ApplicationInsightsUtility" require_relative "constants" + require_relative "oms_common" @log_path = "/var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log" @log = Logger.new(@log_path, 1, 5000000) + @@hostName = (OMS::Common.get_hostname) @oom_killed_container_count_hash = {} @container_restart_count_hash = {} @@ -37,8 +39,24 @@ class MdmMetricsGenerator Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC, } + @@container_metric_name_metric_threshold_violated_hash = { + Constants::CPU_USAGE_MILLI_CORES => Constants::MDM_CONTAINER_CPU_THRESHOLD_VIOLATED_METRIC, + Constants::CPU_USAGE_NANO_CORES => Constants::MDM_CONTAINER_CPU_THRESHOLD_VIOLATED_METRIC, + Constants::MEMORY_RSS_BYTES => Constants::MDM_CONTAINER_MEMORY_RSS_THRESHOLD_VIOLATED_METRIC, + Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_CONTAINER_MEMORY_WORKING_SET_THRESHOLD_VIOLATED_METRIC, + } + + @@pod_metric_name_metric_percentage_name_hash = { + Constants::PV_USED_BYTES => Constants::MDM_PV_UTILIZATION_METRIC, + } + + @@pod_metric_name_metric_threshold_violated_hash = { + Constants::PV_USED_BYTES => Constants::MDM_PV_THRESHOLD_VIOLATED_METRIC, + } + # Setting this to true since we need to send zero filled metrics at startup. 
If metrics are absent alert creation fails @sendZeroFilledMetrics = true + @zeroFilledMetricsTimeTracker = DateTime.now.to_time.to_i def initialize end @@ -89,13 +107,28 @@ def appendPodMetrics(records, metricName, metricHash, batch_time, metricsTemplat podControllerNameDimValue = key_elements[0] podNamespaceDimValue = key_elements[1] - record = metricsTemplate % { - timestamp: batch_time, - metricName: metricName, - controllerNameDimValue: podControllerNameDimValue, - namespaceDimValue: podNamespaceDimValue, - containerCountMetricValue: value, - } + # Special handling for jobs since we need to send the threshold as a dimension as it is configurable + if metricName == Constants::MDM_STALE_COMPLETED_JOB_COUNT + metric_threshold_hash = getContainerResourceUtilizationThresholds + #Converting this to hours since we already have olderThanHours dimension. + jobCompletionThresholdHours = (metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60.0).round(2) + record = metricsTemplate % { + timestamp: batch_time, + metricName: metricName, + controllerNameDimValue: podControllerNameDimValue, + namespaceDimValue: podNamespaceDimValue, + containerCountMetricValue: value, + jobCompletionThreshold: jobCompletionThresholdHours, + } + else + record = metricsTemplate % { + timestamp: batch_time, + metricName: metricName, + controllerNameDimValue: podControllerNameDimValue, + namespaceDimValue: podNamespaceDimValue, + containerCountMetricValue: value, + } + end records.push(Yajl::Parser.parse(StringIO.new(record))) } else @@ -122,9 +155,11 @@ def flushPodMdmMetricTelemetry staleJobHashValues = @stale_job_count_hash.values staleJobMetricCount = staleJobHashValues.inject(0) { |sum, x| sum + x } + metric_threshold_hash = getContainerResourceUtilizationThresholds properties["ContainerRestarts"] = containerRestartMetricCount properties["OomKilledContainers"] = oomKilledContainerMetricCount properties["OldCompletedJobs"] = staleJobMetricCount + 
properties["JobCompletionThesholdTimeInMinutes"] = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_METRICS_HEART_BEAT_EVENT, properties) ApplicationInsightsUtility.sendCustomEvent(Constants::POD_READY_PERCENTAGE_HEART_BEAT_EVENT, {}) rescue => errorStr @@ -151,29 +186,63 @@ def zeroFillMetricRecords(records, batch_time) metric_threshold_hash = getContainerResourceUtilizationThresholds container_zero_fill_dims = [Constants::OMSAGENT_ZERO_FILL, Constants::OMSAGENT_ZERO_FILL, Constants::OMSAGENT_ZERO_FILL, Constants::KUBESYSTEM_NAMESPACE_ZERO_FILL].join("~~") - containerCpuRecord = getContainerResourceUtilMetricRecords(batch_time, - Constants::CPU_USAGE_NANO_CORES, - 0, - container_zero_fill_dims, - metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES]) - if !containerCpuRecord.nil? && !containerCpuRecord.empty? && !containerCpuRecord[0].nil? && !containerCpuRecord[0].empty? - records.push(containerCpuRecord[0]) + containerCpuRecords = getContainerResourceUtilMetricRecords(batch_time, + Constants::CPU_USAGE_NANO_CORES, + 0, + container_zero_fill_dims, + metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES], + true) + if !containerCpuRecords.nil? && !containerCpuRecords.empty? + containerCpuRecords.each { |cpuRecord| + if !cpuRecord.nil? && !cpuRecord.empty? + records.push(cpuRecord) + end + } end - containerMemoryRssRecord = getContainerResourceUtilMetricRecords(batch_time, - Constants::MEMORY_RSS_BYTES, - 0, - container_zero_fill_dims, - metric_threshold_hash[Constants::MEMORY_RSS_BYTES]) - if !containerMemoryRssRecord.nil? && !containerMemoryRssRecord.empty? && !containerMemoryRssRecord[0].nil? && !containerMemoryRssRecord[0].empty? 
- records.push(containerMemoryRssRecord[0]) + containerMemoryRssRecords = getContainerResourceUtilMetricRecords(batch_time, + Constants::MEMORY_RSS_BYTES, + 0, + container_zero_fill_dims, + metric_threshold_hash[Constants::MEMORY_RSS_BYTES], + true) + if !containerMemoryRssRecords.nil? && !containerMemoryRssRecords.empty? + containerMemoryRssRecords.each { |memoryRssRecord| + if !memoryRssRecord.nil? && !memoryRssRecord.empty? + records.push(memoryRssRecord) + end + } end - containerMemoryWorkingSetRecord = getContainerResourceUtilMetricRecords(batch_time, - Constants::MEMORY_WORKING_SET_BYTES, - 0, - container_zero_fill_dims, - metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES]) - if !containerMemoryWorkingSetRecord.nil? && !containerMemoryWorkingSetRecord.empty? && !containerMemoryWorkingSetRecord[0].nil? && !containerMemoryWorkingSetRecord[0].empty? - records.push(containerMemoryWorkingSetRecord[0]) + containerMemoryWorkingSetRecords = getContainerResourceUtilMetricRecords(batch_time, + Constants::MEMORY_WORKING_SET_BYTES, + 0, + container_zero_fill_dims, + metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES], + true) + if !containerMemoryWorkingSetRecords.nil? && !containerMemoryWorkingSetRecords.empty? + containerMemoryWorkingSetRecords.each { |workingSetRecord| + if !workingSetRecord.nil? && !workingSetRecord.empty? + records.push(workingSetRecord) + end + } + end + + pvZeroFillDims = {} + pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] = Constants::KUBESYSTEM_NAMESPACE_ZERO_FILL + pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = Constants::OMSAGENT_ZERO_FILL + pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_VOLUME_NAME] = Constants::VOLUME_NAME_ZERO_FILL + pvResourceUtilMetricRecords = getPVResourceUtilMetricRecords(batch_time, + Constants::PV_USED_BYTES, + @@hostName, + 0, + pvZeroFillDims, + metric_threshold_hash[Constants::PV_USED_BYTES], + true) + if !pvResourceUtilMetricRecords.nil? 
&& !pvResourceUtilMetricRecords.empty? + pvResourceUtilMetricRecords.each { |pvRecord| + if !pvRecord.nil? && !pvRecord.empty? + records.push(pvRecord) + end + } end rescue => errorStr @log.info "Error in zeroFillMetricRecords: #{errorStr}" @@ -185,10 +254,13 @@ def zeroFillMetricRecords(records, batch_time) def appendAllPodMetrics(records, batch_time) begin @log.info "in appendAllPodMetrics..." - if @sendZeroFilledMetrics == true + timeDifference = (DateTime.now.to_time.to_i - @zeroFilledMetricsTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if @sendZeroFilledMetrics == true || (timeDifferenceInMinutes >= Constants::ZERO_FILL_METRICS_INTERVAL_IN_MINUTES) records = zeroFillMetricRecords(records, batch_time) # Setting it to false after startup @sendZeroFilledMetrics = false + @zeroFilledMetricsTimeTracker = DateTime.now.to_time.to_i end records = appendPodMetrics(records, Constants::MDM_OOM_KILLED_CONTAINER_COUNT, @@ -223,7 +295,7 @@ def appendAllPodMetrics(records, batch_time) return records end - def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentageMetricValue, dims, thresholdPercentage) + def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentageMetricValue, dims, thresholdPercentage, isZeroFill = false) records = [] begin if dims.nil? 
@@ -252,6 +324,19 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag thresholdPercentageDimValue: thresholdPercentage, } records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) + + # Adding another metric for threshold violation + resourceThresholdViolatedRecord = MdmAlertTemplates::Container_resource_threshold_violation_template % { + timestamp: recordTimeStamp, + metricName: @@container_metric_name_metric_threshold_violated_hash[metricName], + containerNameDimValue: containerName, + podNameDimValue: podName, + controllerNameDimValue: controllerName, + namespaceDimValue: podNamespace, + containerResourceThresholdViolated: isZeroFill ? 0 : 1, + thresholdPercentageDimValue: thresholdPercentage, + } + records.push(Yajl::Parser.parse(StringIO.new(resourceThresholdViolatedRecord))) rescue => errorStr @log.info "Error in getContainerResourceUtilMetricRecords: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) @@ -259,6 +344,46 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag return records end + def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percentageMetricValue, dims, thresholdPercentage, isZeroFill = false) + records = [] + begin + containerName = dims[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] + pvcNamespace = dims[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] + podName = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] + podUid = dims[Constants::INSIGHTSMETRICS_TAGS_POD_UID] + volumeName = dims[Constants::INSIGHTSMETRICS_TAGS_VOLUME_NAME] + + resourceUtilRecord = MdmAlertTemplates::PV_resource_utilization_template % { + timestamp: recordTimeStamp, + metricName: @@pod_metric_name_metric_percentage_name_hash[metricName], + podNameDimValue: podName, + computerNameDimValue: computer, + namespaceDimValue: pvcNamespace, + volumeNameDimValue: volumeName, + pvResourceUtilizationPercentage: percentageMetricValue, + thresholdPercentageDimValue: 
thresholdPercentage, + } + records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) + + # Adding another metric for threshold violation + resourceThresholdViolatedRecord = MdmAlertTemplates::PV_resource_threshold_violation_template % { + timestamp: recordTimeStamp, + metricName: @@pod_metric_name_metric_threshold_violated_hash[metricName], + podNameDimValue: podName, + computerNameDimValue: computer, + namespaceDimValue: pvcNamespace, + volumeNameDimValue: volumeName, + pvResourceThresholdViolated: isZeroFill ? 0 : 1, + thresholdPercentageDimValue: thresholdPercentage, + } + records.push(Yajl::Parser.parse(StringIO.new(resourceThresholdViolatedRecord))) + rescue => errorStr + @log.info "Error in getPVResourceUtilMetricRecords: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + return records + end + def getDiskUsageMetricRecords(record) records = [] usedPercent = nil @@ -296,22 +421,22 @@ def getMetricRecords(record) begin dimNames = String.new "" #mutable string dimValues = String.new "" - noDimVal ="-" + noDimVal = "-" metricValue = 0 if !record["tags"].nil? - dimCount = 0 - record["tags"].each { |k, v| - dimCount = dimCount+1 - if (dimCount <= 10) #MDM = 10 dims - dimNames.concat("\"#{k}\"") - dimNames.concat(",") - if !v.nil? && v.length >0 - dimValues.concat("\"#{v}\"") - else - dimValues.concat("\"#{noDimVal}\"") - end - dimValues.concat(",") + dimCount = 0 + record["tags"].each { |k, v| + dimCount = dimCount + 1 + if (dimCount <= 10) #MDM = 10 dims + dimNames.concat("\"#{k}\"") + dimNames.concat(",") + if !v.nil? && v.length > 0 + dimValues.concat("\"#{v}\"") + else + dimValues.concat("\"#{noDimVal}\"") end + dimValues.concat(",") + end } if (dimNames.end_with?(",")) dimNames.chomp!(",") @@ -324,19 +449,19 @@ def getMetricRecords(record) convertedTimestamp = Time.at(timestamp.to_i).utc.iso8601 if !record["fields"].nil? 
record["fields"].each { |k, v| - if is_numeric(v) - metricRecord = MdmAlertTemplates::Generic_metric_template % { - timestamp: convertedTimestamp, - metricName: k, - namespaceSuffix: record["name"], - dimNames: dimNames, - dimValues: dimValues, - metricValue: v, - } - records.push(Yajl::Parser.parse(StringIO.new(metricRecord))) - #@log.info "pushed mdmgenericmetric: #{k},#{v}" - end - } + if is_numeric(v) + metricRecord = MdmAlertTemplates::Generic_metric_template % { + timestamp: convertedTimestamp, + metricName: k, + namespaceSuffix: record["name"], + dimNames: dimNames, + dimValues: dimValues, + metricValue: v, + } + records.push(Yajl::Parser.parse(StringIO.new(metricRecord))) + #@log.info "pushed mdmgenericmetric: #{k},#{v}" + end + } end rescue => errorStr @log.info "getMetricRecords:Error: #{errorStr} for record #{record}" @@ -346,7 +471,7 @@ def getMetricRecords(record) end def is_numeric(o) - true if Float(o) rescue false + true if Float(o) rescue false end def getContainerResourceUtilizationThresholds @@ -356,6 +481,8 @@ def getContainerResourceUtilizationThresholds metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES] = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD metric_threshold_hash[Constants::MEMORY_RSS_BYTES] = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD + metric_threshold_hash[Constants::PV_USED_BYTES] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD + metric_threshold_hash[Constants::JOB_COMPLETION_TIME] = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES cpuThreshold = ENV["AZMON_ALERT_CONTAINER_CPU_THRESHOLD"] if !cpuThreshold.nil? && !cpuThreshold.empty? 
@@ -375,6 +502,18 @@ def getContainerResourceUtilizationThresholds memoryWorkingSetThresholdFloat = (memoryWorkingSetThreshold.to_f).round(2) metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = memoryWorkingSetThresholdFloat end + + pvUsagePercentageThreshold = ENV["AZMON_ALERT_PV_USAGE_THRESHOLD"] + if !pvUsagePercentageThreshold.nil? && !pvUsagePercentageThreshold.empty? + pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2) + metric_threshold_hash[Constants::PV_USED_BYTES] = pvUsagePercentageThresholdFloat + end + + jobCompletionTimeThreshold = ENV["AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD"] + if !jobCompletionTimeThreshold.nil? && !jobCompletionTimeThreshold.empty? + jobCompletionTimeThresholdInt = jobCompletionTimeThreshold.to_i + metric_threshold_hash[Constants::JOB_COMPLETION_TIME] = jobCompletionTimeThresholdInt + end rescue => errorStr @log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/plugins/ruby/arc_k8s_cluster_identity.rb b/source/plugins/ruby/arc_k8s_cluster_identity.rb index ef55c3257..552dafb1f 100644 --- a/source/plugins/ruby/arc_k8s_cluster_identity.rb +++ b/source/plugins/ruby/arc_k8s_cluster_identity.rb @@ -18,7 +18,7 @@ class ArcK8sClusterIdentity @@crd_resource_uri_template = "%{kube_api_server_url}/apis/%{cluster_config_crd_api_version}/namespaces/%{cluster_identity_resource_namespace}/azureclusteridentityrequests/%{cluster_identity_resource_name}" @@secret_resource_uri_template = "%{kube_api_server_url}/api/v1/namespaces/%{cluster_identity_token_secret_namespace}/secrets/%{token_secret_name}" @@azure_monitor_custom_metrics_audience = "https://monitoring.azure.com/" - @@cluster_identity_request_kind = "AzureClusterIdentityRequest" + @@cluster_identity_request_kind = "AzureClusterIdentityRequest" def initialize @LogPath = "/var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log" @@ -26,6 +26,7 @@ 
def initialize @log.info "initialize start @ #{Time.now.utc.iso8601}" @token_expiry_time = Time.now @cached_access_token = String.new + @isLastTokenRenewalUpdatePending = false @token_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" @cert_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" @kube_api_server_url = KubernetesApiClient.getKubeAPIServerUrl @@ -33,20 +34,28 @@ def initialize @log.warn "got api server url nil from KubernetesApiClient.getKubeAPIServerUrl @ #{Time.now.utc.iso8601}" end @http_client = get_http_client - @service_account_token = get_service_account_token + @service_account_token = get_service_account_token + @extensionName = ENV["ARC_K8S_EXTENSION_NAME"] + @log.info "extension name:#{@extensionName} @ #{Time.now.utc.iso8601}" @log.info "initialize complete @ #{Time.now.utc.iso8601}" end def get_cluster_identity_token() begin - # get the cluster msi identity token either if its empty or near expirty. Token is valid 24 hrs. + # get the cluster msi identity token if it is either empty or near expiry. Token is valid for 24 hrs. if @cached_access_token.to_s.empty? || (Time.now + 60 * 60 > @token_expiry_time) # Refresh token 1 hr from expiration # renew the token if its near expiry if !@cached_access_token.to_s.empty? 
&& (Time.now + 60 * 60 > @token_expiry_time) - @log.info "renewing the token since its near expiry @ #{Time.now.utc.iso8601}" - renew_near_expiry_token - # sleep 60 seconds to get the renewed token available - sleep 60 + if !@isLastTokenRenewalUpdatePending + @log.info "token expiry - @ #{@token_expiry_time}" + @log.info "renewing the token since token has near expiry @ #{Time.now.utc.iso8601}" + renew_near_expiry_token + # sleep 60 seconds to get the renewed token available + sleep 60 + @isLastTokenRenewalUpdatePending = true + else + @log.warn "last token renewal update still pending @ #{Time.now.utc.iso8601}" + end end @log.info "get token reference from crd @ #{Time.now.utc.iso8601}" tokenReference = get_token_reference_from_crd @@ -59,6 +68,7 @@ def get_cluster_identity_token() token = get_token_from_secret(token_secret_name, token_secret_data_name) if !token.nil? @cached_access_token = token + @isLastTokenRenewalUpdatePending = false else @log.warn "got token nil from secret: #{@token_secret_name}" end @@ -121,7 +131,17 @@ def get_token_reference_from_crd() tokenReference["expirationTime"] = status["expirationTime"] tokenReference["secretName"] = status["tokenReference"]["secretName"] tokenReference["dataName"] = status["tokenReference"]["dataName"] - end + elsif get_response.code.to_i == 404 # this might happen if the crd resource was accidentally deleted by the user + @log.info "since crd resource doesnt exist hence creating crd resource : #{@@cluster_identity_resource_name} @ #{Time.now.utc.iso8601}" + crd_request_body = get_crd_request_body + crd_request_body_json = crd_request_body.to_json + create_request = Net::HTTP::Post.new(crd_request_uri) + create_request["Content-Type"] = "application/json" + create_request["Authorization"] = "Bearer #{@service_account_token}" + create_request.body = crd_request_body_json + create_response = @http_client.request(create_request) + @log.info "Got response of #{create_response.code} for POST #{crd_request_uri} @ 
#{Time.now.utc.iso8601}" + end rescue => err @log.warn "get_token_reference_from_crd call failed: #{err}" ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" }) @@ -139,20 +159,23 @@ def renew_near_expiry_token() cluster_identity_resource_namespace: @@cluster_identity_resource_namespace, cluster_identity_resource_name: @@cluster_identity_resource_name, } - crd_request_body = get_crd_request_body - crd_request_body_json = crd_request_body.to_json - update_request = Net::HTTP::Patch.new(crd_request_uri) + update_crd_request_body = { 'status': {'expirationTime': ''} } + update_crd_request_body_json = update_crd_request_body.to_json + update_crd_request_uri = crd_request_uri + "/status" + update_request = Net::HTTP::Patch.new(update_crd_request_uri) update_request["Content-Type"] = "application/merge-patch+json" update_request["Authorization"] = "Bearer #{@service_account_token}" - update_request.body = crd_request_body_json + update_request.body = update_crd_request_body_json update_response = @http_client.request(update_request) - @log.info "Got response of #{update_response.code} for PATCH #{crd_request_uri} @ #{Time.now.utc.iso8601}" + @log.info "Got response of #{update_response.code} for PATCH #{update_crd_request_uri} @ #{Time.now.utc.iso8601}" if update_response.code.to_i == 404 - @log.info "since crd resource doesnt exist since creating crd resource : #{@@cluster_identity_resource_name} @ #{Time.now.utc.iso8601}" + @log.info "since crd resource doesnt exist hence creating crd resource : #{@@cluster_identity_resource_name} @ #{Time.now.utc.iso8601}" create_request = Net::HTTP::Post.new(crd_request_uri) create_request["Content-Type"] = "application/json" create_request["Authorization"] = "Bearer #{@service_account_token}" - create_request.body = crd_request_body_json + create_crd_request_body = get_crd_request_body + create_crd_request_body_json = create_crd_request_body.to_json + create_request.body = create_crd_request_body_json 
create_response = @http_client.request(create_request) @log.info "Got response of #{create_response.code} for POST #{crd_request_uri} @ #{Time.now.utc.iso8601}" end @@ -211,6 +234,9 @@ def get_crd_request_body body["metadata"]["namespace"] = @@cluster_identity_resource_namespace body["spec"] = {} body["spec"]["audience"] = @@azure_monitor_custom_metrics_audience + if !@extensionName.nil? && !@extensionName.empty? + body["spec"]["resourceId"] = @extensionName + end return body end end diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index dd1ba24b3..906019b95 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -1,77 +1,107 @@ # frozen_string_literal: true class Constants - INSIGHTSMETRICS_TAGS_ORIGIN = "container.azm.ms" - INSIGHTSMETRICS_TAGS_CLUSTERID = "container.azm.ms/clusterId" - INSIGHTSMETRICS_TAGS_CLUSTERNAME = "container.azm.ms/clusterName" - INSIGHTSMETRICS_TAGS_GPU_VENDOR = "gpuVendor" - INSIGHTSMETRICS_TAGS_GPU_NAMESPACE = "container.azm.ms/gpu" - INSIGHTSMETRICS_TAGS_GPU_MODEL = "gpuModel" - INSIGHTSMETRICS_TAGS_GPU_ID = "gpuId" - INSIGHTSMETRICS_TAGS_CONTAINER_NAME = "containerName" - INSIGHTSMETRICS_TAGS_CONTAINER_ID = "containerName" - INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace" - INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName" - INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" - INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics" - REASON_OOM_KILLED = "oomkilled" - #Kubestate (common) - INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE = "container.azm.ms/kubestate" - INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME = "creationTime" - #Kubestate (deployments) - INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_DEPLOYMENT_STATE = "kube_deployment_status_replicas_ready" - INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_NAME = "deployment" - INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_CREATIONTIME = "creationTime" - INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STRATEGY = "deploymentStrategy" 
- INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_SPEC_REPLICAS = "spec_replicas" - INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_UPDATED = "status_replicas_updated" - INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_AVAILABLE = "status_replicas_available" - #Kubestate (HPA) - INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_HPA_STATE = "kube_hpa_status_current_replicas" - INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_NAME = "hpa" - INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MAX_REPLICAS = "spec_max_replicas" - INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MIN_REPLICAS = "spec_min_replicas" - INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_KIND = "targetKind" - INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_NAME = "targetName" - INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_DESIRED_REPLICAS = "status_desired_replicas" - - INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_LAST_SCALE_TIME = "lastScaleTime" - # MDM Metric names - MDM_OOM_KILLED_CONTAINER_COUNT = "oomKilledContainerCount" - MDM_CONTAINER_RESTART_COUNT = "restartingContainerCount" - MDM_POD_READY_PERCENTAGE = "podReadyPercentage" - MDM_STALE_COMPLETED_JOB_COUNT = "completedJobsCount" - MDM_DISK_USED_PERCENTAGE = "diskUsedPercentage" - MDM_CONTAINER_CPU_UTILIZATION_METRIC = "cpuExceededPercentage" - MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC = "memoryRssExceededPercentage" - MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC = "memoryWorkingSetExceededPercentage" - MDM_NODE_CPU_USAGE_PERCENTAGE = "cpuUsagePercentage" - MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage" - MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage" + INSIGHTSMETRICS_TAGS_ORIGIN = "container.azm.ms" + INSIGHTSMETRICS_TAGS_CLUSTERID = "container.azm.ms/clusterId" + INSIGHTSMETRICS_TAGS_CLUSTERNAME = "container.azm.ms/clusterName" + INSIGHTSMETRICS_TAGS_GPU_VENDOR = "gpuVendor" + INSIGHTSMETRICS_TAGS_GPU_NAMESPACE = "container.azm.ms/gpu" + INSIGHTSMETRICS_TAGS_GPU_MODEL = "gpuModel" + INSIGHTSMETRICS_TAGS_GPU_ID = 
"gpuId" + INSIGHTSMETRICS_TAGS_CONTAINER_NAME = "containerName" + INSIGHTSMETRICS_TAGS_CONTAINER_ID = "containerName" + INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace" + INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName" + INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" + INSIGHTSMETRICS_TAGS_POD_UID = "podUid" + INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv" + INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName" + INSIGHTSMETRICS_TAGS_PVC_NAMESPACE = "pvcNamespace" + INSIGHTSMETRICS_TAGS_POD_NAME = "podName" + INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes" + INSIGHTSMETRICS_TAGS_VOLUME_NAME = "volumeName" + INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics" + REASON_OOM_KILLED = "oomkilled" + #Kubestate (common) + INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE = "container.azm.ms/kubestate" + INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME = "creationTime" + #Kubestate (deployments) + INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_DEPLOYMENT_STATE = "kube_deployment_status_replicas_ready" + INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_NAME = "deployment" + INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_CREATIONTIME = "creationTime" + INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STRATEGY = "deploymentStrategy" + INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_SPEC_REPLICAS = "spec_replicas" + INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_UPDATED = "status_replicas_updated" + INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_AVAILABLE = "status_replicas_available" + #Kubestate (HPA) + INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_HPA_STATE = "kube_hpa_status_current_replicas" + INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_NAME = "hpa" + INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MAX_REPLICAS = "spec_max_replicas" + INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MIN_REPLICAS = "spec_min_replicas" + INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_KIND = "targetKind" + INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_NAME = "targetName" + 
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_DESIRED_REPLICAS = "status_desired_replicas" - CONTAINER_TERMINATED_RECENTLY_IN_MINUTES = 5 - OBJECT_NAME_K8S_CONTAINER = "K8SContainer" - OBJECT_NAME_K8S_NODE = "K8SNode" - CPU_USAGE_NANO_CORES = "cpuUsageNanoCores" - CPU_USAGE_MILLI_CORES = "cpuUsageMillicores" - MEMORY_WORKING_SET_BYTES= "memoryWorkingSetBytes" - MEMORY_RSS_BYTES = "memoryRssBytes" - DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 - DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 - DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 - CONTROLLER_KIND_JOB = "job" - CONTAINER_TERMINATION_REASON_COMPLETED = "completed" - CONTAINER_STATE_TERMINATED = "terminated" - STALE_JOB_TIME_IN_MINUTES = 360 - TELEGRAF_DISK_METRICS = "container.azm.ms/disk" - OMSAGENT_ZERO_FILL = "omsagent" - KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system" + INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_LAST_SCALE_TIME = "lastScaleTime" + # MDM Metric names + MDM_OOM_KILLED_CONTAINER_COUNT = "oomKilledContainerCount" + MDM_CONTAINER_RESTART_COUNT = "restartingContainerCount" + MDM_POD_READY_PERCENTAGE = "podReadyPercentage" + MDM_STALE_COMPLETED_JOB_COUNT = "completedJobsCount" + MDM_DISK_USED_PERCENTAGE = "diskUsedPercentage" + MDM_CONTAINER_CPU_UTILIZATION_METRIC = "cpuExceededPercentage" + MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC = "memoryRssExceededPercentage" + MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC = "memoryWorkingSetExceededPercentage" + MDM_PV_UTILIZATION_METRIC = "pvUsageExceededPercentage" + MDM_CONTAINER_CPU_THRESHOLD_VIOLATED_METRIC = "cpuThresholdViolated" + MDM_CONTAINER_MEMORY_RSS_THRESHOLD_VIOLATED_METRIC = "memoryRssThresholdViolated" + MDM_CONTAINER_MEMORY_WORKING_SET_THRESHOLD_VIOLATED_METRIC = "memoryWorkingSetThresholdViolated" + MDM_PV_THRESHOLD_VIOLATED_METRIC = "pvUsageThresholdViolated" + MDM_NODE_CPU_USAGE_PERCENTAGE = "cpuUsagePercentage" + MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage" + MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = 
"memoryWorkingSetPercentage" - #Telemetry constants - CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" - POD_READY_PERCENTAGE_HEART_BEAT_EVENT = "PodReadyPercentageMdmHeartBeatEvent" - CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent" - TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10 - KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15 - MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour" -end \ No newline at end of file + CONTAINER_TERMINATED_RECENTLY_IN_MINUTES = 5 + OBJECT_NAME_K8S_CONTAINER = "K8SContainer" + OBJECT_NAME_K8S_NODE = "K8SNode" + CPU_USAGE_NANO_CORES = "cpuUsageNanoCores" + CPU_USAGE_MILLI_CORES = "cpuUsageMillicores" + MEMORY_WORKING_SET_BYTES = "memoryWorkingSetBytes" + MEMORY_RSS_BYTES = "memoryRssBytes" + PV_USED_BYTES = "pvUsedBytes" + JOB_COMPLETION_TIME = "completedJobTimeMinutes" + DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 + DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 + DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 + DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0 + DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES = 360 + CONTROLLER_KIND_JOB = "job" + CONTAINER_TERMINATION_REASON_COMPLETED = "completed" + CONTAINER_STATE_TERMINATED = "terminated" + TELEGRAF_DISK_METRICS = "container.azm.ms/disk" + OMSAGENT_ZERO_FILL = "omsagent" + KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system" + VOLUME_NAME_ZERO_FILL = "-" + PV_TYPES = ["awsElasticBlockStore", "azureDisk", "azureFile", "cephfs", "cinder", "csi", "fc", "flexVolume", + "flocker", "gcePersistentDisk", "glusterfs", "hostPath", "iscsi", "local", "nfs", + "photonPersistentDisk", "portworxVolume", "quobyte", "rbd", "scaleIO", "storageos", "vsphereVolume"] + + #Telemetry constants + CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" + POD_READY_PERCENTAGE_HEART_BEAT_EVENT = "PodReadyPercentageMdmHeartBeatEvent" + CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent" + 
PV_USAGE_HEART_BEAT_EVENT = "PVUsageMdmHeartBeatEvent" + PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT = "CollectPVKubeSystemMetricsEnabled" + PV_INVENTORY_HEART_BEAT_EVENT = "KubePVInventoryHeartBeatEvent" + TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10 + KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15 + ZERO_FILL_METRICS_INTERVAL_IN_MINUTES = 30 + MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour" + MDM_EXCEPTION_TELEMETRY_METRIC = "AKSCustomMetricsMdmExceptions" + MDM_EXCEPTIONS_METRIC_FLUSH_INTERVAL = 30 + + #Pod Statuses + POD_STATUS_TERMINATING = "Terminating" + + AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2_FILENAME = "/opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2" +end diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index fd43ef98b..659e3000c 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -9,14 +9,14 @@ module Fluent require_relative "CustomMetricsUtils" require_relative "kubelet_utils" require_relative "MdmMetricsGenerator" + require_relative "in_kube_nodes" class CAdvisor2MdmFilter < Filter Fluent::Plugin.register_filter("filter_cadvisor2mdm", self) config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log" - config_param :custom_metrics_azure_regions, :string - config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES" + config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES,Constants::PV_USED_BYTES" @@hostName = (OMS::Common.get_hostname) @@ -24,6 +24,7 @@ class CAdvisor2MdmFilter < Filter @metrics_to_collect_hash = {} @@metric_threshold_hash = {} + @@controller_type = "" def initialize super @@ -42,15 +43,17 @@ def configure(conf) def start super 
begin - @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability(@custom_metrics_azure_regions) + @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability @metrics_to_collect_hash = build_metrics_hash @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i + @@pvUsageTelemetryTimeTracker = DateTime.now.to_time.to_i # These variables keep track if any resource utilization threshold exceeded in the last 10 minutes @containersExceededCpuThreshold = false @containersExceededMemRssThreshold = false @containersExceededMemWorkingSetThreshold = false + @pvExceededUsageThreshold = false # initialize cpu and memory limit if @process_incoming_stream @@ -60,7 +63,9 @@ def start @containerCpuLimitHash = {} @containerMemoryLimitHash = {} @containerResourceDimensionHash = {} + @pvUsageHash = {} @@metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds + @NodeCache = Fluent::NodeStatsCache.new() end rescue => e @log.info "Error initializing plugin #{e}" @@ -87,6 +92,8 @@ def setThresholdExceededTelemetry(metricName) @containersExceededMemRssThreshold = true elsif metricName == Constants::MEMORY_WORKING_SET_BYTES @containersExceededMemWorkingSetThreshold = true + elsif metricName == Constants::PV_USED_BYTES + @pvExceededUsageThreshold = true end rescue => errorStr @log.info "Error in setThresholdExceededTelemetry: #{errorStr}" @@ -109,13 +116,30 @@ def flushMetricTelemetry properties["MemRssThresholdExceededInLastFlushInterval"] = @containersExceededMemRssThreshold properties["MemWSetThresholdExceededInLastFlushInterval"] = @containersExceededMemWorkingSetThreshold ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT, properties) - @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i @containersExceededCpuThreshold = false 
@containersExceededMemRssThreshold = false @containersExceededMemWorkingSetThreshold = false + @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i + end + rescue => errorStr + @log.info "Error in flushMetricTelemetry: #{errorStr} for container resource util telemetry" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + + # Also send for PV usage metrics + begin + pvTimeDifference = (DateTime.now.to_time.to_i - @@pvUsageTelemetryTimeTracker).abs + pvTimeDifferenceInMinutes = pvTimeDifference / 60 + if (pvTimeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + pvProperties = {} + pvProperties["PVUsageThresholdPercentage"] = @@metric_threshold_hash[Constants::PV_USED_BYTES] + pvProperties["PVUsageThresholdExceededInLastFlushInterval"] = @pvExceededUsageThreshold + ApplicationInsightsUtility.sendCustomEvent(Constants::PV_USAGE_HEART_BEAT_EVENT, pvProperties) + @pvExceededUsageThreshold = false + @@pvUsageTelemetryTimeTracker = DateTime.now.to_time.to_i end rescue => errorStr - @log.info "Error in flushMetricTelemetry: #{errorStr}" + @log.info "Error in flushMetricTelemetry: #{errorStr} for PV usage telemetry" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end @@ -123,6 +147,13 @@ def flushMetricTelemetry def filter(tag, time, record) begin if @process_incoming_stream + + # Check if insights metrics for PV metrics + data_type = record["DataType"] + if data_type == "INSIGHTS_METRICS_BLOB" + return filterPVInsightsMetrics(record) + end + object_name = record["DataItems"][0]["ObjectName"] counter_name = record["DataItems"][0]["Collections"][0]["CounterName"] percentage_metric_value = 0.0 @@ -133,19 +164,40 @@ def filter(tag, time, record) if counter_name == Constants::CPU_USAGE_NANO_CORES metric_name = Constants::CPU_USAGE_MILLI_CORES metric_value /= 1000000 #cadvisor record is in nanocores. 
Convert to mc - @log.info "Metric_value: #{metric_value} CPU Capacity #{@cpu_capacity}" - if @cpu_capacity != 0.0 - percentage_metric_value = (metric_value) * 100 / @cpu_capacity + if @@controller_type.downcase == "replicaset" + target_node_cpu_capacity_mc = @NodeCache.cpu.get_capacity(record["DataItems"][0]["Host"]) / 1000000 + else + target_node_cpu_capacity_mc = @cpu_capacity + end + @log.info "Metric_value: #{metric_value} CPU Capacity #{target_node_cpu_capacity_mc}" + if target_node_cpu_capacity_mc != 0.0 + percentage_metric_value = (metric_value) * 100 / target_node_cpu_capacity_mc end end if counter_name.start_with?("memory") metric_name = counter_name - if @memory_capacity != 0.0 - percentage_metric_value = metric_value * 100 / @memory_capacity + if @@controller_type.downcase == "replicaset" + target_node_mem_capacity = @NodeCache.mem.get_capacity(record["DataItems"][0]["Host"]) + else + target_node_mem_capacity = @memory_capacity + end + @log.info "Metric_value: #{metric_value} Memory Capacity #{target_node_mem_capacity}" + if target_node_mem_capacity != 0.0 + percentage_metric_value = metric_value * 100 / target_node_mem_capacity end + end + @log.info "percentage_metric_value for metric: #{metric_name} for instance: #{record["DataItems"][0]["Host"]} percentage: #{percentage_metric_value}" + + # do some sanity checking. Do we want this? 
+ if percentage_metric_value > 100.0 or percentage_metric_value < 0.0 + telemetryProperties = {} + telemetryProperties["Computer"] = record["DataItems"][0]["Host"] + telemetryProperties["MetricName"] = metric_name + telemetryProperties["MetricPercentageValue"] = percentage_metric_value + ApplicationInsightsUtility.sendCustomEvent("ErrorPercentageOutOfBounds", telemetryProperties) end - # return get_metric_records(record, metric_name, metric_value, percentage_metric_value) + return MdmMetricsGenerator.getNodeResourceMetricRecords(record, metric_name, metric_value, percentage_metric_value) elsif object_name == Constants::OBJECT_NAME_K8S_CONTAINER && @metrics_to_collect_hash.key?(counter_name.downcase) instanceName = record["DataItems"][0]["InstanceName"] @@ -204,14 +256,55 @@ def filter(tag, time, record) end end + def filterPVInsightsMetrics(record) + begin + mdmMetrics = [] + record["DataItems"].each do |dataItem| + + if dataItem["Name"] == Constants::PV_USED_BYTES && @metrics_to_collect_hash.key?(dataItem["Name"].downcase) + metricName = dataItem["Name"] + usage = dataItem["Value"] + capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] + if capacity != 0 + percentage_metric_value = (usage * 100.0) / capacity + end + @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" + @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" + + computer = dataItem["Computer"] + resourceDimensions = dataItem["Tags"] + thresholdPercentage = @@metric_threshold_hash[metricName] + + flushMetricTelemetry + if percentage_metric_value >= thresholdPercentage + setThresholdExceededTelemetry(metricName) + return MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], + metricName, + computer, + percentage_metric_value, + resourceDimensions, + thresholdPercentage) + else + return [] + end # end if block for percentage metric > configured threshold % check + 
end # end if block for dataItem name check + end # end for block of looping through data items + return [] + rescue Exception => e + @log.info "Error processing cadvisor insights metrics record Exception: #{e.class} Message: #{e.message}" + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) + return [] #return empty array if we ran into any errors + end + end + def ensure_cpu_memory_capacity_set if @cpu_capacity != 0.0 && @memory_capacity != 0.0 @log.info "CPU And Memory Capacity are already set" return end - controller_type = ENV["CONTROLLER_TYPE"] - if controller_type.downcase == "replicaset" + @@controller_type = ENV["CONTROLLER_TYPE"] + if @@controller_type.downcase == "replicaset" @log.info "ensure_cpu_memory_capacity_set @cpu_capacity #{@cpu_capacity} @memory_capacity #{@memory_capacity}" begin @@ -237,10 +330,18 @@ def ensure_cpu_memory_capacity_set @log.info "Error getting memory_capacity" end end - elsif controller_type.downcase == "daemonset" + elsif @@controller_type.downcase == "daemonset" capacity_from_kubelet = KubeletUtils.get_node_capacity - @cpu_capacity = capacity_from_kubelet[0] - @memory_capacity = capacity_from_kubelet[1] + + # Error handling in case /metrics/cadvsior endpoint fails + if !capacity_from_kubelet.nil? 
&& capacity_from_kubelet.length > 1 + @cpu_capacity = capacity_from_kubelet[0] + @memory_capacity = capacity_from_kubelet[1] + else + # cpu_capacity and memory_capacity keep initialized value of 0.0 + @log.error "Error getting capacity_from_kubelet: cpu_capacity and memory_capacity" + end + end end diff --git a/source/plugins/ruby/filter_inventory2mdm.rb b/source/plugins/ruby/filter_inventory2mdm.rb index b5ef587ff..38ccab885 100644 --- a/source/plugins/ruby/filter_inventory2mdm.rb +++ b/source/plugins/ruby/filter_inventory2mdm.rb @@ -13,7 +13,6 @@ class Inventory2MdmFilter < Filter config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log' - config_param :custom_metrics_azure_regions, :string @@node_count_metric_name = 'nodesCount' @@pod_count_metric_name = 'podCount' @@ -98,7 +97,7 @@ def configure(conf) def start super - @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability(@custom_metrics_azure_regions) + @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" end diff --git a/source/plugins/ruby/filter_telegraf2mdm.rb b/source/plugins/ruby/filter_telegraf2mdm.rb index 98d258ea5..88ae428d1 100644 --- a/source/plugins/ruby/filter_telegraf2mdm.rb +++ b/source/plugins/ruby/filter_telegraf2mdm.rb @@ -15,7 +15,6 @@ class Telegraf2MdmFilter < Filter config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_telegraf2mdm.log" - config_param :custom_metrics_azure_regions, :string @process_incoming_stream = true @@ -36,7 +35,7 @@ def configure(conf) def start super begin - @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability(@custom_metrics_azure_regions) + @process_incoming_stream = 
CustomMetricsUtils.check_custom_metrics_availability @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" rescue => errorStr @log.info "Error initializing plugin #{errorStr}" diff --git a/source/plugins/ruby/in_cadvisor_perf.rb b/source/plugins/ruby/in_cadvisor_perf.rb index a44365e9d..b706ff00a 100644 --- a/source/plugins/ruby/in_cadvisor_perf.rb +++ b/source/plugins/ruby/in_cadvisor_perf.rb @@ -88,6 +88,7 @@ def enumerate() end router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) $log.info("cAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") diff --git a/source/plugins/ruby/in_kube_events.rb b/source/plugins/ruby/in_kube_events.rb index 6f59a3fc1..f50019a01 100644 --- a/source/plugins/ruby/in_kube_events.rb +++ b/source/plugins/ruby/in_kube_events.rb @@ -17,8 +17,9 @@ def initialize require_relative "omslog" require_relative "ApplicationInsightsUtility" - # 30000 events account to approximately 5MB - @EVENTS_CHUNK_SIZE = 30000 + # refer tomlparser-agent-config for defaults + # this configurable via configmap + @EVENTS_CHUNK_SIZE = 0 # Initializing events count for telemetry @eventsCount = 0 @@ -36,6 +37,15 @@ def configure(conf) def start if @run_interval + if !ENV["EVENTS_CHUNK_SIZE"].nil? && !ENV["EVENTS_CHUNK_SIZE"].empty? 
&& ENV["EVENTS_CHUNK_SIZE"].to_i > 0 + @EVENTS_CHUNK_SIZE = ENV["EVENTS_CHUNK_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_events::start: setting to default value since got EVENTS_CHUNK_SIZE nil or empty") + @EVENTS_CHUNK_SIZE = 4000 + end + $log.info("in_kube_events::start : EVENTS_CHUNK_SIZE @ #{@EVENTS_CHUNK_SIZE}") + @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -82,6 +92,8 @@ def enumerate end $log.info("in_kube_events::enumerate : Done getting events from Kube API @ #{Time.now.utc.iso8601}") if (!eventList.nil? && !eventList.empty? && eventList.key?("items") && !eventList["items"].nil? && !eventList["items"].empty?) + eventsCount = eventList["items"].length + $log.info "in_kube_events::enumerate:Received number of events in eventList is #{eventsCount} @ #{Time.now.utc.iso8601}" newEventQueryState = parse_and_emit_records(eventList, eventQueryState, newEventQueryState, batchTime) else $log.warn "in_kube_events::enumerate:Received empty eventList" @@ -91,6 +103,8 @@ def enumerate while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, eventList = KubernetesApiClient.getResourcesAndContinuationToken("events?fieldSelector=type!=Normal&limit=#{@EVENTS_CHUNK_SIZE}&continue=#{continuationToken}") if (!eventList.nil? && !eventList.empty? && eventList.key?("items") && !eventList["items"].nil? && !eventList["items"].empty?) 
+ eventsCount = eventList["items"].length + $log.info "in_kube_events::enumerate:Received number of events in eventList is #{eventsCount} @ #{Time.now.utc.iso8601}" newEventQueryState = parse_and_emit_records(eventList, eventQueryState, newEventQueryState, batchTime) else $log.warn "in_kube_events::enumerate:Received empty eventList" @@ -115,6 +129,7 @@ def enumerate def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = currentTime.to_f + @@istestvar = ENV["ISTEST"] begin eventStream = MultiEventStream.new events["items"].each do |items| @@ -157,6 +172,9 @@ def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTim @eventsCount += 1 end router.emit_stream(@tag, eventStream) if eventStream + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeEventsInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end rescue => errorStr $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 4d58382f5..4993edd7b 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -9,6 +9,7 @@ class Kube_nodeInventory_Input < Input @@MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" @@configMapMountPath = "/etc/config/settings/log-data-collection-settings" @@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings" + @@osmConfigMountPath = "/etc/config/osm-settings/osm-metric-collection-configuration" @@AzStackCloudFileName = "/etc/kubernetes/host/azurestackcloud.json" @@kubeperfTag = "oms.api.KubePerf" @@ -19,7 +20,10 @@ class Kube_nodeInventory_Input < Input @@rsPromUrlCount = ENV["TELEMETRY_RS_PROM_URLS_LENGTH"] @@rsPromMonitorPods = ENV["TELEMETRY_RS_PROM_MONITOR_PODS"] @@rsPromMonitorPodsNamespaceLength = 
ENV["TELEMETRY_RS_PROM_MONITOR_PODS_NS_LENGTH"] + @@rsPromMonitorPodsLabelSelectorLength = ENV["TELEMETRY_RS_PROM_LABEL_SELECTOR_LENGTH"] + @@rsPromMonitorPodsFieldSelectorLength = ENV["TELEMETRY_RS_PROM_FIELD_SELECTOR_LENGTH"] @@collectAllKubeEvents = ENV["AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS"] + @@osmNamespaceCount = ENV["TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT"] def initialize super @@ -32,8 +36,15 @@ def initialize require_relative "ApplicationInsightsUtility" require_relative "oms_common" require_relative "omslog" - @NODES_CHUNK_SIZE = "400" + # refer tomlparser-agent-config for the defaults + @NODES_CHUNK_SIZE = 0 + @NODES_EMIT_STREAM_BATCH_SIZE = 0 + + @nodeInventoryE2EProcessingLatencyMs = 0 + @nodesAPIE2ELatencyMs = 0 require_relative "constants" + + @NodeCache = NodeStatsCache.new() end config_param :run_interval, :time, :default => 60 @@ -45,11 +56,30 @@ def configure(conf) def start if @run_interval + if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? && ENV["NODES_CHUNK_SIZE"].to_i > 0 + @NODES_CHUNK_SIZE = ENV["NODES_CHUNK_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_nodes::start: setting to default value since got NODES_CHUNK_SIZE nil or empty") + @NODES_CHUNK_SIZE = 250 + end + $log.info("in_kube_nodes::start : NODES_CHUNK_SIZE @ #{@NODES_CHUNK_SIZE}") + + if !ENV["NODES_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["NODES_EMIT_STREAM_BATCH_SIZE"].empty? 
&& ENV["NODES_EMIT_STREAM_BATCH_SIZE"].to_i > 0 + @NODES_EMIT_STREAM_BATCH_SIZE = ENV["NODES_EMIT_STREAM_BATCH_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_nodes::start: setting to default value since got NODES_EMIT_STREAM_BATCH_SIZE nil or empty") + @NODES_EMIT_STREAM_BATCH_SIZE = 100 + end + $log.info("in_kube_nodes::start : NODES_EMIT_STREAM_BATCH_SIZE @ #{@NODES_EMIT_STREAM_BATCH_SIZE}") + @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i + @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i end end @@ -69,14 +99,20 @@ def enumerate currentTime = Time.now batchTime = currentTime.utc.iso8601 + @nodesAPIE2ELatencyMs = 0 + @nodeInventoryE2EProcessingLatencyMs = 0 + nodeInventoryStartTime = (Time.now.to_f * 1000).to_i + nodesAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing continuation token to nil continuationToken = nil $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) - $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i + @nodesAPIE2ELatencyMs = (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) 
+ $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(nodeInventory, batchTime) else $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" @@ -84,14 +120,26 @@ def enumerate #If we receive a continuation token, make calls, process and flush data until we have processed all data while (!continuationToken.nil? && !continuationToken.empty?) + nodesAPIChunkStartTime = (Time.now.to_f * 1000).to_i continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") + nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i + @nodesAPIE2ELatencyMs = @nodesAPIE2ELatencyMs + (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(nodeInventory, batchTime) else $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" end end + @nodeInventoryE2EProcessingLatencyMs = ((Time.now.to_f * 1000).to_i - nodeInventoryStartTime) + timeDifference = (DateTime.now.to_time.to_i - @@nodeInventoryLatencyTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + ApplicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, {}) + ApplicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, {}) + @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i + end # Setting this to nil so that we dont hold memory until GC kicks in nodeInventory = nil rescue => errorStr @@ -109,77 +157,32 @@ def 
parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) eventStream = MultiEventStream.new containerNodeInventoryEventStream = MultiEventStream.new insightsMetricsEventStream = MultiEventStream.new + kubePerfEventStream = MultiEventStream.new @@istestvar = ENV["ISTEST"] #get node inventory - nodeInventory["items"].each do |items| - record = {} - # Sending records for ContainerNodeInventory - containerNodeInventoryRecord = {} - containerNodeInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - containerNodeInventoryRecord["Computer"] = items["metadata"]["name"] - - record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - record["Computer"] = items["metadata"]["name"] - record["ClusterName"] = KubernetesApiClient.getClusterName - record["ClusterId"] = KubernetesApiClient.getClusterId - record["CreationTimeStamp"] = items["metadata"]["creationTimestamp"] - record["Labels"] = [items["metadata"]["labels"]] - record["Status"] = "" - - if !items["spec"]["providerID"].nil? && !items["spec"]["providerID"].empty? - if File.file?(@@AzStackCloudFileName) # existence of this file indicates agent running on azstack - record["KubernetesProviderID"] = "azurestack" - else - #Multicluster kusto query is filtering after splitting by ":" to the left, so do the same here - #https://msazure.visualstudio.com/One/_git/AzureUX-Monitoring?path=%2Fsrc%2FMonitoringExtension%2FClient%2FInfraInsights%2FData%2FQueryTemplates%2FMultiClusterKustoQueryTemplate.ts&_a=contents&version=GBdev - provider = items["spec"]["providerID"].split(":")[0] - if !provider.nil? && !provider.empty? - record["KubernetesProviderID"] = provider - else - record["KubernetesProviderID"] = items["spec"]["providerID"] - end - end - else - record["KubernetesProviderID"] = "onprem" - end - - # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. 
- # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we - # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" - # implying that the node is ready for hosting pods, however its out of disk. - - if items["status"].key?("conditions") && !items["status"]["conditions"].empty? - allNodeConditions = "" - items["status"]["conditions"].each do |condition| - if condition["status"] == "True" - if !allNodeConditions.empty? - allNodeConditions = allNodeConditions + "," + condition["type"] - else - allNodeConditions = condition["type"] - end - end - #collect last transition to/from ready (no matter ready is true/false) - if condition["type"] == "Ready" && !condition["lastTransitionTime"].nil? - record["LastTransitionTimeReady"] = condition["lastTransitionTime"] - end - end - if !allNodeConditions.empty? - record["Status"] = allNodeConditions + nodeInventory["items"].each do |item| + # node inventory + nodeInventoryRecord = getNodeInventoryRecord(item, batchTime) + wrapper = { + "DataType" => "KUBE_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [nodeInventoryRecord.each { |k, v| nodeInventoryRecord[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper + if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@tag, eventStream) if eventStream + $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream + + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end + eventStream = MultiEventStream.new end - nodeInfo = items["status"]["nodeInfo"] - record["KubeletVersion"] = nodeInfo["kubeletVersion"] - record["KubeProxyVersion"] = nodeInfo["kubeProxyVersion"] - containerNodeInventoryRecord["OperatingSystem"] = nodeInfo["osImage"] - containerRuntimeVersion = nodeInfo["containerRuntimeVersion"] - if containerRuntimeVersion.downcase.start_with?("docker://") - containerNodeInventoryRecord["DockerVersion"] = containerRuntimeVersion.split("//")[1] - else - # using containerRuntimeVersion as DockerVersion as is for non docker runtimes - containerNodeInventoryRecord["DockerVersion"] = containerRuntimeVersion - end - # ContainerNodeInventory data for docker version and operating system. + # container node inventory + containerNodeInventoryRecord = getContainerNodeInventoryRecord(item, batchTime) containerNodeInventoryWrapper = { "DataType" => "CONTAINER_NODE_INVENTORY_BLOB", "IPName" => "ContainerInsights", @@ -187,33 +190,107 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) } containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper - wrapper = { - "DataType" => "KUBE_NODE_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper + if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && containerNodeInventoryEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream + containerNodeInventoryEventStream = MultiEventStream.new + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("containerNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + end + + # Only CPU and Memory capacity for windows nodes get added to the cache (at end of file) + is_windows_node = false + if !item["status"].nil? && !item["status"]["nodeInfo"].nil? && !item["status"]["nodeInfo"]["operatingSystem"].nil? + operatingSystem = item["status"]["nodeInfo"]["operatingSystem"] + if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) + is_windows_node = true + end + end + + # node metrics records + nodeMetricRecords = [] + nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "allocatable", "cpu", "cpuAllocatableNanoCores", batchTime) + if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? + nodeMetricRecords.push(nodeMetricRecord) + end + nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "allocatable", "memory", "memoryAllocatableBytes", batchTime) + if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? + nodeMetricRecords.push(nodeMetricRecord) + end + nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "capacity", "cpu", "cpuCapacityNanoCores", batchTime) + if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? + nodeMetricRecords.push(nodeMetricRecord) + # add data to the cache so filter_cadvisor2mdm.rb can use it + if is_windows_node + @NodeCache.cpu.set_capacity(nodeMetricRecord["DataItems"][0]["Host"], nodeMetricRecord["DataItems"][0]["Collections"][0]["Value"]) + end + end + nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "capacity", "memory", "memoryCapacityBytes", batchTime) + if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? 
+ nodeMetricRecords.push(nodeMetricRecord) + # add data to the cache so filter_cadvisor2mdm.rb can use it + if is_windows_node + @NodeCache.mem.set_capacity(nodeMetricRecord["DataItems"][0]["Host"], nodeMetricRecord["DataItems"][0]["Collections"][0]["Value"]) + end + end + nodeMetricRecords.each do |metricRecord| + metricRecord["DataType"] = "LINUX_PERF_BLOB" + metricRecord["IPName"] = "LogManagement" + kubePerfEventStream.add(emitTime, metricRecord) if metricRecord + end + if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + kubePerfEventStream = MultiEventStream.new + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeNodePerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + end + + # node GPU metrics record + nodeGPUInsightsMetricsRecords = [] + insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "allocatable", "nvidia.com/gpu", "nodeGpuAllocatable", batchTime) + if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? + nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) + end + insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "capacity", "nvidia.com/gpu", "nodeGpuCapacity", batchTime) + if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? + nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) + end + insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "allocatable", "amd.com/gpu", "nodeGpuAllocatable", batchTime) + if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? 
+ nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) + end + insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "capacity", "amd.com/gpu", "nodeGpuCapacity", batchTime) + if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? + nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) + end + nodeGPUInsightsMetricsRecords.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(emitTime, wrapper) if wrapper + end + if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + insightsMetricsEventStream = MultiEventStream.new + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + end # Adding telemetry to send node telemetry every 10 minutes timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) - properties = {} - properties["Computer"] = record["Computer"] - properties["KubeletVersion"] = record["KubeletVersion"] - properties["OperatingSystem"] = nodeInfo["operatingSystem"] - # DockerVersion field holds docker version if runtime is docker/moby else :// - if containerRuntimeVersion.downcase.start_with?("docker://") - properties["DockerVersion"] = containerRuntimeVersion.split("//")[1] - else - properties["DockerVersion"] = containerRuntimeVersion - end - properties["KubernetesProviderID"] = record["KubernetesProviderID"] - properties["KernelVersion"] = nodeInfo["kernelVersion"] - properties["OSImage"] = nodeInfo["osImage"] + properties = getNodeTelemetryProps(item) + properties["KubernetesProviderID"] = nodeInventoryRecord["KubernetesProviderID"] + capacityInfo = item["status"]["capacity"] - capacityInfo = items["status"]["capacity"] ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) - begin if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) 
properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] @@ -242,77 +319,55 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) properties["rsPromUrl"] = @@rsPromUrlCount properties["rsPromMonPods"] = @@rsPromMonitorPods properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength + properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength + properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength + end + # telemetry about osm metric settings for replicaset + if (File.file?(@@osmConfigMountPath)) + properties["osmNamespaceCount"] = @@osmNamespaceCount end ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) telemetrySent = true end end - router.emit_stream(@tag, eventStream) if eventStream - router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream - router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream if telemetrySent == true @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i end - - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) - $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + if eventStream.count > 0 + $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@tag, eventStream) if eventStream + $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + eventStream = nil end - #:optimize:kubeperf merge - begin - #if(!nodeInventory.empty?) - nodeMetricDataItems = [] - #allocatable metrics @ node level - nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "allocatable", "cpu", "cpuAllocatableNanoCores", batchTime)) - nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "allocatable", "memory", "memoryAllocatableBytes", batchTime)) - #capacity metrics @ node level - nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "cpu", "cpuCapacityNanoCores", batchTime)) - nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "memory", "memoryCapacityBytes", batchTime)) - - kubePerfEventStream = MultiEventStream.new - - nodeMetricDataItems.each do |record| - record["DataType"] = "LINUX_PERF_BLOB" - record["IPName"] = "LogManagement" - kubePerfEventStream.add(emitTime, record) if record + if containerNodeInventoryEventStream.count > 0 + $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{containerNodeInventoryEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream + containerNodeInventoryEventStream = nil + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("containerNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - #end - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - - #start GPU InsightsMetrics items - begin - nodeGPUInsightsMetricsDataItems = [] - nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "allocatable", "nvidia.com/gpu", "nodeGpuAllocatable", batchTime)) - nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "capacity", "nvidia.com/gpu", "nodeGpuCapacity", batchTime)) - - nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "allocatable", "amd.com/gpu", "nodeGpuAllocatable", batchTime)) - nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "capacity", "amd.com/gpu", "nodeGpuCapacity", batchTime)) - - nodeGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(emitTime, wrapper) if wrapper - end + end - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) - $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - rescue => errorStr - $log.warn "Failed when processing GPU metrics in_kube_nodes : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + if kubePerfEventStream.count > 0 + $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + kubePerfEventStream = nil + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeNodePerfInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + end + if insightsMetricsEventStream.count > 0 + $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + insightsMetricsEventStream = nil + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - #end GPU InsightsMetrics items - rescue => errorStr - $log.warn "Failed in enumerate for KubePerf from in_kube_nodes : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end - #:optimize:end kubeperf merge - rescue => errorStr $log.warn "Failed to retrieve node inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) @@ -352,5 +407,185 @@ def run_periodic end @mutex.unlock end + + # TODO - move this method to KubernetesClient or helper class + def getNodeInventoryRecord(item, batchTime = Time.utc.iso8601) + record = {} + begin + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + record["Computer"] = item["metadata"]["name"] + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ClusterId"] = KubernetesApiClient.getClusterId + record["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] + record["Labels"] = [item["metadata"]["labels"]] + record["Status"] = "" + + if !item["spec"]["providerID"].nil? && !item["spec"]["providerID"].empty? + if File.file?(@@AzStackCloudFileName) # existence of this file indicates agent running on azstack + record["KubernetesProviderID"] = "azurestack" + else + #Multicluster kusto query is filtering after splitting by ":" to the left, so do the same here + #https://msazure.visualstudio.com/One/_git/AzureUX-Monitoring?path=%2Fsrc%2FMonitoringExtension%2FClient%2FInfraInsights%2FData%2FQueryTemplates%2FMultiClusterKustoQueryTemplate.ts&_a=contents&version=GBdev + provider = item["spec"]["providerID"].split(":")[0] + if !provider.nil? && !provider.empty? 
+ record["KubernetesProviderID"] = provider + else + record["KubernetesProviderID"] = item["spec"]["providerID"] + end + end + else + record["KubernetesProviderID"] = "onprem" + end + + # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. + # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we + # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" + # implying that the node is ready for hosting pods, however its out of disk. + if item["status"].key?("conditions") && !item["status"]["conditions"].empty? + allNodeConditions = "" + item["status"]["conditions"].each do |condition| + if condition["status"] == "True" + if !allNodeConditions.empty? + allNodeConditions = allNodeConditions + "," + condition["type"] + else + allNodeConditions = condition["type"] + end + end + #collect last transition to/from ready (no matter ready is true/false) + if condition["type"] == "Ready" && !condition["lastTransitionTime"].nil? + record["LastTransitionTimeReady"] = condition["lastTransitionTime"] + end + end + if !allNodeConditions.empty? 
+ record["Status"] = allNodeConditions + end + end + nodeInfo = item["status"]["nodeInfo"] + record["KubeletVersion"] = nodeInfo["kubeletVersion"] + record["KubeProxyVersion"] = nodeInfo["kubeProxyVersion"] + rescue => errorStr + $log.warn "in_kube_nodes::getNodeInventoryRecord:Failed: #{errorStr}" + end + return record + end + + # TODO - move this method to KubernetesClient or helper class + def getContainerNodeInventoryRecord(item, batchTime = Time.utc.iso8601) + containerNodeInventoryRecord = {} + begin + containerNodeInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + containerNodeInventoryRecord["Computer"] = item["metadata"]["name"] + nodeInfo = item["status"]["nodeInfo"] + containerNodeInventoryRecord["OperatingSystem"] = nodeInfo["osImage"] + containerRuntimeVersion = nodeInfo["containerRuntimeVersion"] + if containerRuntimeVersion.downcase.start_with?("docker://") + containerNodeInventoryRecord["DockerVersion"] = containerRuntimeVersion.split("//")[1] + else + # using containerRuntimeVersion as DockerVersion as is for non docker runtimes + containerNodeInventoryRecord["DockerVersion"] = containerRuntimeVersion + end + rescue => errorStr + $log.warn "in_kube_nodes::getContainerNodeInventoryRecord:Failed: #{errorStr}" + end + return containerNodeInventoryRecord + end + + # TODO - move this method to KubernetesClient or helper class + def getNodeTelemetryProps(item) + properties = {} + begin + properties["Computer"] = item["metadata"]["name"] + nodeInfo = item["status"]["nodeInfo"] + properties["KubeletVersion"] = nodeInfo["kubeletVersion"] + properties["OperatingSystem"] = nodeInfo["operatingSystem"] + properties["KernelVersion"] = nodeInfo["kernelVersion"] + properties["OSImage"] = nodeInfo["osImage"] + containerRuntimeVersion = nodeInfo["containerRuntimeVersion"] + if containerRuntimeVersion.downcase.start_with?("docker://") + properties["DockerVersion"] = containerRuntimeVersion.split("//")[1] + else + 
# using containerRuntimeVersion as DockerVersion as is for non docker runtimes + properties["DockerVersion"] = containerRuntimeVersion + end + properties["NODES_CHUNK_SIZE"] = @NODES_CHUNK_SIZE + properties["NODES_EMIT_STREAM_BATCH_SIZE"] = @NODES_EMIT_STREAM_BATCH_SIZE + rescue => errorStr + $log.warn "in_kube_nodes::getNodeTelemetryProps:Failed: #{errorStr}" + end + return properties + end end # Kube_Node_Input + + + class NodeStatsCache + # inner class for caching implementation (CPU and memory caching is handled the exact same way, so logic to do so is moved to a private inner class) + # (to reduce code duplication) + class NodeCache + + @@RECORD_TIME_TO_LIVE = 60*20 # units are seconds, so clear the cache every 20 minutes. + + def initialize + @cacheHash = {} + @timeAdded = {} # records when an entry was last added + @lock = Mutex.new + @lastCacheClearTime = 0 + + @cacheHash.default = 0.0 + @lastCacheClearTime = DateTime.now.to_time.to_i + end + + def get_capacity(node_name) + @lock.synchronize do + retval = @cacheHash[node_name] + return retval + end + end + + def set_capacity(host, val) + # check here if the cache has not been cleaned in a while. This way calling code doesn't have to remember to clean the cache + current_time = DateTime.now.to_time.to_i + if current_time - @lastCacheClearTime > @@RECORD_TIME_TO_LIVE + clean_cache + @lastCacheClearTime = current_time + end + + @lock.synchronize do + @cacheHash[host] = val + @timeAdded[host] = current_time + end + end + + def clean_cache() + $log.info "in_kube_nodes::clean_cache: cleaning node cpu/mem cache" + cacheClearTime = DateTime.now.to_time.to_i + @lock.synchronize do + nodes_to_remove = [] # first make a list of nodes to remove, then remove them. This intermediate + # list is used so that we aren't modifying a hash while iterating through it. 
+ @cacheHash.each do |key, val| + if cacheClearTime - @timeAdded[key] > @@RECORD_TIME_TO_LIVE + nodes_to_remove.push(key) + end + end + + nodes_to_remove.each do |node_name| + @cacheHash.delete(node_name) + @timeAdded.delete(node_name) + end + end + end + end # NodeCache + + + @@cpuCache = NodeCache.new + @@memCache = NodeCache.new + + def cpu() + return @@cpuCache + end + + def mem() + return @@memCache + end + end + end # module diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index bffa725ee..5256eb159 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -2,7 +2,7 @@ # frozen_string_literal: true module Fluent - require_relative "podinventory_to_mdm" + require_relative "podinventory_to_mdm" class Kube_PodInventory_Input < Input Plugin.register_input("kubepodinventory", self) @@ -19,7 +19,7 @@ def initialize require "yajl" require "set" require "time" - + require_relative "kubernetes_container_inventory" require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" @@ -27,24 +27,48 @@ def initialize require_relative "omslog" require_relative "constants" - @PODS_CHUNK_SIZE = "1500" + # refer tomlparser-agent-config for updating defaults + # this configurable via configmap + @PODS_CHUNK_SIZE = 0 + @PODS_EMIT_STREAM_BATCH_SIZE = 0 + @podCount = 0 + @serviceCount = 0 @controllerSet = Set.new [] @winContainerCount = 0 @controllerData = {} + @podInventoryE2EProcessingLatencyMs = 0 + @podsAPIE2ELatencyMs = 0 end config_param :run_interval, :time, :default => 60 config_param :tag, :string, :default => "oms.containerinsights.KubePodInventory" - config_param :custom_metrics_azure_regions, :string def configure(conf) super - @inventoryToMdmConvertor = Inventory2MdmConvertor.new(@custom_metrics_azure_regions) + @inventoryToMdmConvertor = Inventory2MdmConvertor.new() end def start if @run_interval + if !ENV["PODS_CHUNK_SIZE"].nil? 
&& !ENV["PODS_CHUNK_SIZE"].empty? && ENV["PODS_CHUNK_SIZE"].to_i > 0 + @PODS_CHUNK_SIZE = ENV["PODS_CHUNK_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_podinventory::start: setting to default value since got PODS_CHUNK_SIZE nil or empty") + @PODS_CHUNK_SIZE = 1000 + end + $log.info("in_kube_podinventory::start : PODS_CHUNK_SIZE @ #{@PODS_CHUNK_SIZE}") + + if !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].empty? && ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i > 0 + @PODS_EMIT_STREAM_BATCH_SIZE = ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_podinventory::start: setting to default value since got PODS_EMIT_STREAM_BATCH_SIZE nil or empty") + @PODS_EMIT_STREAM_BATCH_SIZE = 200 + end + $log.info("in_kube_podinventory::start : PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") + @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -68,12 +92,15 @@ def enumerate(podList = nil) podInventory = podList telemetryFlush = false @podCount = 0 + @serviceCount = 0 @controllerSet = Set.new [] @winContainerCount = 0 @controllerData = {} currentTime = Time.now batchTime = currentTime.utc.iso8601 - + serviceRecords = [] + @podInventoryE2EProcessingLatencyMs = 0 + podInventoryStartTime = (Time.now.to_f * 1000).to_i # Get services first so that we dont need to make a call for very chunk $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") @@ -85,32 +112,48 @@ def enumerate(podList = nil) serviceList = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) $log.info("in_kube_podinventory::enumerate:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}") serviceInfo = nil + # service inventory records much smaller and fixed size compared to serviceList + 
serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceList, batchTime) + # updating for telemetry + @serviceCount += serviceRecords.length + serviceList = nil end + # to track e2e processing latency + @podsAPIE2ELatencyMs = 0 + podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing continuation token to nil continuationToken = nil $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i + @podsAPIE2ELatencyMs = (podsAPIChunkEndTime - podsAPIChunkStartTime) if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime) + $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end #If we receive a continuation token, make calls, process and flush data until we have processed all data while (!continuationToken.nil? && !continuationToken.empty?) + podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") + podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i + @podsAPIE2ELatencyMs = @podsAPIE2ELatencyMs + (podsAPIChunkEndTime - podsAPIChunkStartTime) if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? 
&& !podInventory["items"].empty?) - parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime) + $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end end + @podInventoryE2EProcessingLatencyMs = ((Time.now.to_f * 1000).to_i - podInventoryStartTime) # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil - serviceList = nil + serviceRecords = nil # Adding telemetry to send pod telemetry every 5 minutes timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs @@ -123,14 +166,19 @@ def enumerate(podList = nil) if telemetryFlush == true telemetryProperties = {} telemetryProperties["Computer"] = @@hostName + telemetryProperties["PODS_CHUNK_SIZE"] = @PODS_CHUNK_SIZE + telemetryProperties["PODS_EMIT_STREAM_BATCH_SIZE"] = @PODS_EMIT_STREAM_BATCH_SIZE ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {}) + ApplicationInsightsUtility.sendMetricTelemetry("ServiceCount", @serviceCount, {}) telemetryProperties["ControllerData"] = @controllerData.to_json ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", @controllerSet.length, telemetryProperties) if @winContainerCount > 0 telemetryProperties["ClusterWideWindowsContainersCount"] = @winContainerCount ApplicationInsightsUtility.sendCustomEvent("WindowsContainerInventoryEvent", telemetryProperties) end + ApplicationInsightsUtility.sendMetricTelemetry("PodInventoryE2EProcessingLatencyMs", @podInventoryE2EProcessingLatencyMs, telemetryProperties) + ApplicationInsightsUtility.sendMetricTelemetry("PodsAPIE2ELatencyMs", @podsAPIE2ELatencyMs, telemetryProperties) @@podTelemetryTimeTracker 
= DateTime.now.to_time.to_i end rescue => errorStr @@ -138,257 +186,144 @@ def enumerate(podList = nil) $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end - end + end - def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime = Time.utc.iso8601) + def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = currentTime.to_f #batchTime = currentTime.utc.iso8601 eventStream = MultiEventStream.new + kubePerfEventStream = MultiEventStream.new + insightsMetricsEventStream = MultiEventStream.new @@istestvar = ENV["ISTEST"] begin #begin block start # Getting windows nodes from kubeapi winNodes = KubernetesApiClient.getWindowsNodesArray - - podInventory["items"].each do |items| #podInventory block start - containerInventoryRecords = [] - records = [] - record = {} - record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - record["Name"] = items["metadata"]["name"] - podNameSpace = items["metadata"]["namespace"] - - # For ARO v3 cluster, skip the pods scheduled on to master or infra nodes - if KubernetesApiClient.isAROV3Cluster && !items["spec"].nil? && !items["spec"]["nodeName"].nil? && - (items["spec"]["nodeName"].downcase.start_with?("infra-") || - items["spec"]["nodeName"].downcase.start_with?("master-")) - next - end - - podUid = KubernetesApiClient.getPodUid(podNameSpace, items["metadata"]) - if podUid.nil? - next - end - record["PodUid"] = podUid - record["PodLabel"] = [items["metadata"]["labels"]] - record["Namespace"] = podNameSpace - record["PodCreationTimeStamp"] = items["metadata"]["creationTimestamp"] - #for unscheduled (non-started) pods startTime does NOT exist - if !items["status"]["startTime"].nil? 
- record["PodStartTime"] = items["status"]["startTime"] - else - record["PodStartTime"] = "" - end - #podStatus - # the below is for accounting 'NodeLost' scenario, where-in the pod(s) in the lost node is still being reported as running - podReadyCondition = true - if !items["status"]["reason"].nil? && items["status"]["reason"] == "NodeLost" && !items["status"]["conditions"].nil? - items["status"]["conditions"].each do |condition| - if condition["type"] == "Ready" && condition["status"] == "False" - podReadyCondition = false - break - end + podInventory["items"].each do |item| #podInventory block start + # pod inventory records + podInventoryRecords = getPodInventoryRecords(item, serviceRecords, batchTime) + podInventoryRecords.each do |record| + if !record.nil? + wrapper = { + "DataType" => "KUBE_POD_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper + @inventoryToMdmConvertor.process_pod_inventory_record(wrapper) end end - - if podReadyCondition == false - record["PodStatus"] = "Unknown" - else - record["PodStatus"] = items["status"]["phase"] - end - #for unscheduled (non-started) pods podIP does NOT exist - if !items["status"]["podIP"].nil? - record["PodIp"] = items["status"]["podIP"] - else - record["PodIp"] = "" - end - #for unscheduled (non-started) pods nodeName does NOT exist - if !items["spec"]["nodeName"].nil? - record["Computer"] = items["spec"]["nodeName"] - else - record["Computer"] = "" - end - # Setting this flag to true so that we can send ContainerInventory records for containers # on windows nodes and parse environment variables for these containers if winNodes.length > 0 - if (!record["Computer"].empty? && (winNodes.include? record["Computer"])) + nodeName = "" + if !item["spec"]["nodeName"].nil? + nodeName = item["spec"]["nodeName"] + end + if (!nodeName.empty? && (winNodes.include? 
nodeName)) clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel - containerInventoryRecordsInPodItem = KubernetesContainerInventory.getContainerInventoryRecords(items, batchTime, clusterCollectEnvironmentVar, true) - containerInventoryRecordsInPodItem.each do |containerRecord| - containerInventoryRecords.push(containerRecord) - end + containerInventoryRecords = KubernetesContainerInventory.getContainerInventoryRecords(item, batchTime, clusterCollectEnvironmentVar, true) + # Send container inventory records for containers on windows nodes + @winContainerCount += containerInventoryRecords.length + containerInventoryRecords.each do |cirecord| + if !cirecord.nil? + ciwrapper = { + "DataType" => "CONTAINER_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [cirecord.each { |k, v| cirecord[k] = v }], + } + eventStream.add(emitTime, ciwrapper) if ciwrapper + end + end end end - record["ClusterId"] = KubernetesApiClient.getClusterId - record["ClusterName"] = KubernetesApiClient.getClusterName - record["ServiceName"] = getServiceNameFromLabels(items["metadata"]["namespace"], items["metadata"]["labels"], serviceList) - - if !items["metadata"]["ownerReferences"].nil? - record["ControllerKind"] = items["metadata"]["ownerReferences"][0]["kind"] - record["ControllerName"] = items["metadata"]["ownerReferences"][0]["name"] - @controllerSet.add(record["ControllerKind"] + record["ControllerName"]) - #Adding controller kind to telemetry ro information about customer workload - if (@controllerData[record["ControllerKind"]].nil?) 
- @controllerData[record["ControllerKind"]] = 1 - else - controllerValue = @controllerData[record["ControllerKind"]] - @controllerData[record["ControllerKind"]] += 1 + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end + router.emit_stream(@tag, eventStream) if eventStream + eventStream = MultiEventStream.new end - podRestartCount = 0 - record["PodRestartCount"] = 0 - #Invoke the helper method to compute ready/not ready mdm metric - @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], items["status"]["conditions"]) + #container perf records + containerMetricDataItems = [] + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", batchTime)) - podContainers = [] - if items["status"].key?("containerStatuses") && !items["status"]["containerStatuses"].empty? - podContainers = podContainers + items["status"]["containerStatuses"] - end - # Adding init containers to the record list as well. - if items["status"].key?("initContainerStatuses") && !items["status"]["initContainerStatuses"].empty? 
- podContainers = podContainers + items["status"]["initContainerStatuses"] + containerMetricDataItems.each do |record| + record["DataType"] = "LINUX_PERF_BLOB" + record["IPName"] = "LogManagement" + kubePerfEventStream.add(emitTime, record) if record end - # if items["status"].key?("containerStatuses") && !items["status"]["containerStatuses"].empty? #container status block start - if !podContainers.empty? #container status block start - podContainers.each do |container| - containerRestartCount = 0 - lastFinishedTime = nil - # Need this flag to determine if we need to process container data for mdm metrics like oomkilled and container restart - #container Id is of the form - #docker://dfd9da983f1fd27432fb2c1fe3049c0a1d25b1c697b2dc1a530c986e58b16527 - if !container["containerID"].nil? - record["ContainerID"] = container["containerID"].split("//")[1] - else - # for containers that have image issues (like invalid image/tag etc..) this will be empty. do not make it all 0 - record["ContainerID"] = "" - end - #keeping this as which is same as InstanceName in perf table - if podUid.nil? || container["name"].nil? - next - else - record["ContainerName"] = podUid + "/" + container["name"] - end - #Pod restart count is a sumtotal of restart counts of individual containers - #within the pod. The restart count of a container is maintained by kubernetes - #itself in the form of a container label. 
- containerRestartCount = container["restartCount"] - record["ContainerRestartCount"] = containerRestartCount - - containerStatus = container["state"] - record["ContainerStatusReason"] = "" - # state is of the following form , so just picking up the first key name - # "state": { - # "waiting": { - # "reason": "CrashLoopBackOff", - # "message": "Back-off 5m0s restarting failed container=metrics-server pod=metrics-server-2011498749-3g453_kube-system(5953be5f-fcae-11e7-a356-000d3ae0e432)" - # } - # }, - # the below is for accounting 'NodeLost' scenario, where-in the containers in the lost node/pod(s) is still being reported as running - if podReadyCondition == false - record["ContainerStatus"] = "Unknown" - else - record["ContainerStatus"] = containerStatus.keys[0] - end - #TODO : Remove ContainerCreationTimeStamp from here since we are sending it as a metric - #Picking up both container and node start time from cAdvisor to be consistent - if containerStatus.keys[0] == "running" - record["ContainerCreationTimeStamp"] = container["state"]["running"]["startedAt"] - else - if !containerStatus[containerStatus.keys[0]]["reason"].nil? && !containerStatus[containerStatus.keys[0]]["reason"].empty? - record["ContainerStatusReason"] = containerStatus[containerStatus.keys[0]]["reason"] - end - # Process the record to see if job was completed 6 hours ago. If so, send metric to mdm - if !record["ControllerKind"].nil? && record["ControllerKind"].downcase == Constants::CONTROLLER_KIND_JOB - @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerStatus) - end - end - - # Record the last state of the container. This may have information on why a container was killed. - begin - if !container["lastState"].nil? 
&& container["lastState"].keys.length == 1 - lastStateName = container["lastState"].keys[0] - lastStateObject = container["lastState"][lastStateName] - if !lastStateObject.is_a?(Hash) - raise "expected a hash object. This could signify a bug or a kubernetes API change" - end - - if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") - newRecord = Hash.new - newRecord["lastState"] = lastStateName # get the name of the last state (ex: terminated) - lastStateReason = lastStateObject["reason"] - # newRecord["reason"] = lastStateObject["reason"] # (ex: OOMKilled) - newRecord["reason"] = lastStateReason # (ex: OOMKilled) - newRecord["startedAt"] = lastStateObject["startedAt"] # (ex: 2019-07-02T14:58:51Z) - lastFinishedTime = lastStateObject["finishedAt"] - newRecord["finishedAt"] = lastFinishedTime # (ex: 2019-07-02T14:58:52Z) - - # only write to the output field if everything previously ran without error - record["ContainerLastStatus"] = newRecord - - #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled - if lastStateReason.downcase == Constants::REASON_OOM_KILLED - @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) - end - lastStateReason = nil - else - record["ContainerLastStatus"] = Hash.new - end - else - record["ContainerLastStatus"] = Hash.new - end - - #Populate mdm metric for container restart count if greater than 0 - if (!containerRestartCount.nil? && (containerRestartCount.is_a? 
Integer) && containerRestartCount > 0) - @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) - end - rescue => errorStr - $log.warn "Failed in parse_and_emit_record pod inventory while processing ContainerLastStatus: #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - record["ContainerLastStatus"] = Hash.new - end - - podRestartCount += containerRestartCount - records.push(record.dup) - end - else # for unscheduled pods there are no status.containerStatuses, in this case we still want the pod - records.push(record) - end #container status block end - records.each do |record| - if !record.nil? - record["PodRestartCount"] = podRestartCount - wrapper = { - "DataType" => "KUBE_POD_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper - @inventoryToMdmConvertor.process_pod_inventory_record(wrapper) + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") end + kubePerfEventStream = MultiEventStream.new end - # Send container inventory records for containers on windows nodes - @winContainerCount += containerInventoryRecords.length - containerInventoryRecords.each do |cirecord| - if !cirecord.nil? 
- ciwrapper = { - "DataType" => "CONTAINER_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [cirecord.each { |k, v| cirecord[k] = v }], - } - eventStream.add(emitTime, ciwrapper) if ciwrapper + + # container GPU records + containerGPUInsightsMetricsDataItems = [] + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", batchTime)) + containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(emitTime, wrapper) if wrapper + end + + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + insightsMetricsEventStream = MultiEventStream.new end end #podInventory block end - router.emit_stream(@tag, eventStream) if eventStream + if eventStream.count > 0 + $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@tag, eventStream) if eventStream + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + eventStream = nil + end + + if kubePerfEventStream.count > 0 + $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + kubePerfEventStream = nil + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + end + + if insightsMetricsEventStream.count > 0 + $log.info("in_kube_podinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + insightsMetricsEventStream = nil + end - if continuationToken.nil? #no more chunks in this batch to be sent, get all pod inventory records to send + if continuationToken.nil? 
#no more chunks in this batch to be sent, get all mdm pod inventory records to send @log.info "Sending pod inventory mdm records to out_mdm" pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) @log.info "pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" @@ -399,101 +334,42 @@ def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTi router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es end - #:optimize:kubeperf merge - begin - #if(!podInventory.empty?) - containerMetricDataItems = [] - #hostName = (OMS::Common.get_hostname) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "requests", "cpu", "cpuRequestNanoCores", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "requests", "memory", "memoryRequestBytes", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "limits", "cpu", "cpuLimitNanoCores", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "limits", "memory", "memoryLimitBytes", batchTime)) - - kubePerfEventStream = MultiEventStream.new - insightsMetricsEventStream = MultiEventStream.new - - containerMetricDataItems.each do |record| - record["DataType"] = "LINUX_PERF_BLOB" - record["IPName"] = "LogManagement" - kubePerfEventStream.add(emitTime, record) if record - end - #end - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - - begin - #start GPU InsightsMetrics items - - containerGPUInsightsMetricsDataItems = [] - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "requests", "nvidia.com/gpu", "containerGpuRequests", batchTime)) - 
containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime)) - - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "requests", "amd.com/gpu", "containerGpuRequests", batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "limits", "amd.com/gpu", "containerGpuLimits", batchTime)) - - containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(emitTime, wrapper) if wrapper - - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) - $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - end - - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - #end GPU InsightsMetrics items - rescue => errorStr - $log.warn "Failed when processing GPU metrics in_kube_podinventory : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - rescue => errorStr - $log.warn "Failed in parse_and_emit_record for KubePerf from in_kube_podinventory : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - #:optimize:end kubeperf merge - - #:optimize:start kubeservices merge - begin - if (!serviceList.nil? && !serviceList.empty?) 
- kubeServicesEventStream = MultiEventStream.new - serviceList["items"].each do |items| - kubeServiceRecord = {} - kubeServiceRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - kubeServiceRecord["ServiceName"] = items["metadata"]["name"] - kubeServiceRecord["Namespace"] = items["metadata"]["namespace"] - kubeServiceRecord["SelectorLabels"] = [items["spec"]["selector"]] + if continuationToken.nil? # sending kube services inventory records + kubeServicesEventStream = MultiEventStream.new + serviceRecords.each do |kubeServiceRecord| + if !kubeServiceRecord.nil? + # adding before emit to reduce memory foot print kubeServiceRecord["ClusterId"] = KubernetesApiClient.getClusterId kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName - kubeServiceRecord["ClusterIP"] = items["spec"]["clusterIP"] - kubeServiceRecord["ServiceType"] = items["spec"]["type"] - # : Add ports and status fields kubeServicewrapper = { "DataType" => "KUBE_SERVICES_BLOB", "IPName" => "ContainerInsights", "DataItems" => [kubeServiceRecord.each { |k, v| kubeServiceRecord[k] = v }], } kubeServicesEventStream.add(emitTime, kubeServicewrapper) if kubeServicewrapper + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubeServicesEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_podinventory::parse_and_emit_records: number of service records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream + kubeServicesEventStream = MultiEventStream.new + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubeServicesEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + end end + end + + if kubeServicesEventStream.count > 0 + $log.info("in_kube_podinventory::parse_and_emit_records : number of service records emitted #{kubeServicesEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeServicesEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end - rescue => errorStr - $log.warn "Failed in parse_and_emit_record for KubeServices from in_kube_podinventory : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + kubeServicesEventStream = nil end - #:optimize:end kubeservices merge #Updating value for AppInsights telemetry @podCount += podInventory["items"].length - - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) - $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end rescue => errorStr $log.warn "Failed in parse_and_emit_record pod inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) @@ -533,25 +409,238 @@ def run_periodic @mutex.unlock end - def getServiceNameFromLabels(namespace, labels, serviceList) + # TODO - move this method to KubernetesClient or helper class + def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) + records = [] + record = {} + + begin + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + record["Name"] = item["metadata"]["name"] + podNameSpace = item["metadata"]["namespace"] + podUid = KubernetesApiClient.getPodUid(podNameSpace, item["metadata"]) + if podUid.nil? 
+ return records + end + + nodeName = "" + #for unscheduled (non-started) pods nodeName does NOT exist + if !item["spec"]["nodeName"].nil? + nodeName = item["spec"]["nodeName"] + end + # For ARO v3 cluster, skip the pods scheduled on to master or infra nodes + if KubernetesApiClient.isAROv3MasterOrInfraPod(nodeName) + return records + end + + record["PodUid"] = podUid + record["PodLabel"] = [item["metadata"]["labels"]] + record["Namespace"] = podNameSpace + record["PodCreationTimeStamp"] = item["metadata"]["creationTimestamp"] + #for unscheduled (non-started) pods startTime does NOT exist + if !item["status"]["startTime"].nil? + record["PodStartTime"] = item["status"]["startTime"] + else + record["PodStartTime"] = "" + end + #podStatus + # the below is for accounting 'NodeLost' scenario, where-in the pod(s) in the lost node is still being reported as running + podReadyCondition = true + if !item["status"]["reason"].nil? && item["status"]["reason"] == "NodeLost" && !item["status"]["conditions"].nil? + item["status"]["conditions"].each do |condition| + if condition["type"] == "Ready" && condition["status"] == "False" + podReadyCondition = false + break + end + end + end + if podReadyCondition == false + record["PodStatus"] = "Unknown" + # ICM - https://portal.microsofticm.com/imp/v3/incidents/details/187091803/home + elsif !item["metadata"]["deletionTimestamp"].nil? && !item["metadata"]["deletionTimestamp"].empty? + record["PodStatus"] = Constants::POD_STATUS_TERMINATING + else + record["PodStatus"] = item["status"]["phase"] + end + #for unscheduled (non-started) pods podIP does NOT exist + if !item["status"]["podIP"].nil? 
+ record["PodIp"] = item["status"]["podIP"] + else + record["PodIp"] = "" + end + + record["Computer"] = nodeName + record["ClusterId"] = KubernetesApiClient.getClusterId + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ServiceName"] = getServiceNameFromLabels(item["metadata"]["namespace"], item["metadata"]["labels"], serviceRecords) + + if !item["metadata"]["ownerReferences"].nil? + record["ControllerKind"] = item["metadata"]["ownerReferences"][0]["kind"] + record["ControllerName"] = item["metadata"]["ownerReferences"][0]["name"] + @controllerSet.add(record["ControllerKind"] + record["ControllerName"]) + #Adding controller kind to telemetry ro information about customer workload + if (@controllerData[record["ControllerKind"]].nil?) + @controllerData[record["ControllerKind"]] = 1 + else + controllerValue = @controllerData[record["ControllerKind"]] + @controllerData[record["ControllerKind"]] += 1 + end + end + podRestartCount = 0 + record["PodRestartCount"] = 0 + + #Invoke the helper method to compute ready/not ready mdm metric + @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], item["status"]["conditions"]) + + podContainers = [] + if item["status"].key?("containerStatuses") && !item["status"]["containerStatuses"].empty? + podContainers = podContainers + item["status"]["containerStatuses"] + end + # Adding init containers to the record list as well. + if item["status"].key?("initContainerStatuses") && !item["status"]["initContainerStatuses"].empty? + podContainers = podContainers + item["status"]["initContainerStatuses"] + end + # if items["status"].key?("containerStatuses") && !items["status"]["containerStatuses"].empty? #container status block start + if !podContainers.empty? 
#container status block start + podContainers.each do |container| + containerRestartCount = 0 + lastFinishedTime = nil + # Need this flag to determine if we need to process container data for mdm metrics like oomkilled and container restart + #container Id is of the form + #docker://dfd9da983f1fd27432fb2c1fe3049c0a1d25b1c697b2dc1a530c986e58b16527 + if !container["containerID"].nil? + record["ContainerID"] = container["containerID"].split("//")[1] + else + # for containers that have image issues (like invalid image/tag etc..) this will be empty. do not make it all 0 + record["ContainerID"] = "" + end + #keeping this as which is same as InstanceName in perf table + if podUid.nil? || container["name"].nil? + next + else + record["ContainerName"] = podUid + "/" + container["name"] + end + #Pod restart count is a sumtotal of restart counts of individual containers + #within the pod. The restart count of a container is maintained by kubernetes + #itself in the form of a container label. + containerRestartCount = container["restartCount"] + record["ContainerRestartCount"] = containerRestartCount + + containerStatus = container["state"] + record["ContainerStatusReason"] = "" + # state is of the following form , so just picking up the first key name + # "state": { + # "waiting": { + # "reason": "CrashLoopBackOff", + # "message": "Back-off 5m0s restarting failed container=metrics-server pod=metrics-server-2011498749-3g453_kube-system(5953be5f-fcae-11e7-a356-000d3ae0e432)" + # } + # }, + # the below is for accounting 'NodeLost' scenario, where-in the containers in the lost node/pod(s) is still being reported as running + if podReadyCondition == false + record["ContainerStatus"] = "Unknown" + else + record["ContainerStatus"] = containerStatus.keys[0] + end + #TODO : Remove ContainerCreationTimeStamp from here since we are sending it as a metric + #Picking up both container and node start time from cAdvisor to be consistent + if containerStatus.keys[0] == "running" + 
record["ContainerCreationTimeStamp"] = container["state"]["running"]["startedAt"] + else + if !containerStatus[containerStatus.keys[0]]["reason"].nil? && !containerStatus[containerStatus.keys[0]]["reason"].empty? + record["ContainerStatusReason"] = containerStatus[containerStatus.keys[0]]["reason"] + end + # Process the record to see if job was completed 6 hours ago. If so, send metric to mdm + if !record["ControllerKind"].nil? && record["ControllerKind"].downcase == Constants::CONTROLLER_KIND_JOB + @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerStatus) + end + end + + # Record the last state of the container. This may have information on why a container was killed. + begin + if !container["lastState"].nil? && container["lastState"].keys.length == 1 + lastStateName = container["lastState"].keys[0] + lastStateObject = container["lastState"][lastStateName] + if !lastStateObject.is_a?(Hash) + raise "expected a hash object. 
This could signify a bug or a kubernetes API change" + end + + if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") + newRecord = Hash.new + newRecord["lastState"] = lastStateName # get the name of the last state (ex: terminated) + lastStateReason = lastStateObject["reason"] + # newRecord["reason"] = lastStateObject["reason"] # (ex: OOMKilled) + newRecord["reason"] = lastStateReason # (ex: OOMKilled) + newRecord["startedAt"] = lastStateObject["startedAt"] # (ex: 2019-07-02T14:58:51Z) + lastFinishedTime = lastStateObject["finishedAt"] + newRecord["finishedAt"] = lastFinishedTime # (ex: 2019-07-02T14:58:52Z) + + # only write to the output field if everything previously ran without error + record["ContainerLastStatus"] = newRecord + + #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled + if lastStateReason.downcase == Constants::REASON_OOM_KILLED + @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + lastStateReason = nil + else + record["ContainerLastStatus"] = Hash.new + end + else + record["ContainerLastStatus"] = Hash.new + end + + #Populate mdm metric for container restart count if greater than 0 + if (!containerRestartCount.nil? && (containerRestartCount.is_a? 
Integer) && containerRestartCount > 0) + @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + rescue => errorStr + $log.warn "Failed in parse_and_emit_record pod inventory while processing ContainerLastStatus: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + record["ContainerLastStatus"] = Hash.new + end + + podRestartCount += containerRestartCount + records.push(record.dup) + end + else # for unscheduled pods there are no status.containerStatuses, in this case we still want the pod + records.push(record) + end #container status block end + + records.each do |record| + if !record.nil? + record["PodRestartCount"] = podRestartCount + end + end + rescue => error + $log.warn("getPodInventoryRecords failed: #{error}") + end + return records + end + + # TODO - move this method to KubernetesClient or helper class + def getServiceNameFromLabels(namespace, labels, serviceRecords) serviceName = "" begin if !labels.nil? && !labels.empty? - if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].empty?) - serviceList["items"].each do |item| - found = 0 - if !item["spec"].nil? && !item["spec"]["selector"].nil? && item["metadata"]["namespace"] == namespace - selectorLabels = item["spec"]["selector"] - if !selectorLabels.empty? - selectorLabels.each do |key, value| - if !(labels.select { |k, v| k == key && v == value }.length > 0) - break - end - found = found + 1 + serviceRecords.each do |kubeServiceRecord| + found = 0 + if kubeServiceRecord["Namespace"] == namespace + selectorLabels = {} + # selector labels wrapped in array in kube service records so unwrapping here + if !kubeServiceRecord["SelectorLabels"].nil? && kubeServiceRecord["SelectorLabels"].length > 0 + selectorLabels = kubeServiceRecord["SelectorLabels"][0] + end + if !selectorLabels.nil? && !selectorLabels.empty? 
+ selectorLabels.each do |key, value| + if !(labels.select { |k, v| k == key && v == value }.length > 0) + break end + found = found + 1 end + # service can have no selectors if found == selectorLabels.length - return item["metadata"]["name"] + return kubeServiceRecord["ServiceName"] end end end diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb new file mode 100644 index 000000000..4efe86f61 --- /dev/null +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -0,0 +1,256 @@ +module Fluent + class Kube_PVInventory_Input < Input + Plugin.register_input("kubepvinventory", self) + + @@hostName = (OMS::Common.get_hostname) + + def initialize + super + require "yaml" + require "yajl/json_gem" + require "yajl" + require "time" + require_relative "KubernetesApiClient" + require_relative "ApplicationInsightsUtility" + require_relative "oms_common" + require_relative "omslog" + require_relative "constants" + + # Response size is around 1500 bytes per PV + @PV_CHUNK_SIZE = "5000" + @pvTypeToCountHash = {} + end + + config_param :run_interval, :time, :default => 60 + config_param :tag, :string, :default => "oms.containerinsights.KubePVInventory" + + def configure(conf) + super + end + + def start + if @run_interval + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + @@pvTelemetryTimeTracker = DateTime.now.to_time.to_i + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + end + end + + def enumerate + begin + pvInventory = nil + telemetryFlush = false + @pvTypeToCountHash = {} + currentTime = Time.now + batchTime = currentTime.utc.iso8601 + + continuationToken = nil + $log.info("in_kube_pvinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") + continuationToken, pvInventory = 
KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PV_CHUNK_SIZE}") + $log.info("in_kube_pvinventory::enumerate : Done getting PVs from Kube API @ #{Time.now.utc.iso8601}") + + if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) + parse_and_emit_records(pvInventory, batchTime) + else + $log.warn "in_kube_pvinventory::enumerate:Received empty pvInventory" + end + + # If we receive a continuation token, make calls, process and flush data until we have processed all data + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PV_CHUNK_SIZE}&continue=#{continuationToken}") + if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) + parse_and_emit_records(pvInventory, batchTime) + else + $log.warn "in_kube_pvinventory::enumerate:Received empty pvInventory" + end + end + + # Setting this to nil so that we dont hold memory until GC kicks in + pvInventory = nil + + # Adding telemetry to send pod telemetry every 10 minutes + timeDifference = (DateTime.now.to_time.to_i - @@pvTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + telemetryFlush = true + end + + # Flush AppInsights telemetry once all the processing is done + if telemetryFlush == true + telemetryProperties = {} + telemetryProperties["CountsOfPVTypes"] = @pvTypeToCountHash.to_json + ApplicationInsightsUtility.sendCustomEvent(Constants::PV_INVENTORY_HEART_BEAT_EVENT, telemetryProperties) + @@pvTelemetryTimeTracker = DateTime.now.to_time.to_i + end + + rescue => errorStr + $log.warn "in_kube_pvinventory::enumerate:Failed in enumerate: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + 
ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end # end enumerate + + def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) + currentTime = Time.now + emitTime = currentTime.to_f + eventStream = MultiEventStream.new + @@istestvar = ENV["ISTEST"] + begin + records = [] + pvInventory["items"].each do |item| + + # Node, pod, & usage info can be found by joining with pvUsedBytes metric using PVCNamespace/PVCName + record = {} + record["CollectionTime"] = batchTime + record["ClusterId"] = KubernetesApiClient.getClusterId + record["ClusterName"] = KubernetesApiClient.getClusterName + record["PVName"] = item["metadata"]["name"] + record["PVStatus"] = item["status"]["phase"] + record["PVAccessModes"] = item["spec"]["accessModes"].join(', ') + record["PVStorageClassName"] = item["spec"]["storageClassName"] + record["PVCapacityBytes"] = KubernetesApiClient.getMetricNumericValue("memory", item["spec"]["capacity"]["storage"]) + record["PVCreationTimeStamp"] = item["metadata"]["creationTimestamp"] + + # Optional values + pvcNamespace, pvcName = getPVCInfo(item) + type, typeInfo = getTypeInfo(item) + record["PVCNamespace"] = pvcNamespace + record["PVCName"] = pvcName + record["PVType"] = type + record["PVTypeInfo"] = typeInfo + + records.push(record) + + # Record telemetry + if type == nil + type = "empty" + end + if (@pvTypeToCountHash.has_key? type) + @pvTypeToCountHash[type] += 1 + else + @pvTypeToCountHash[type] = 1 + end + end + + records.each do |record| + if !record.nil? + wrapper = { + "DataType" => "KUBE_PV_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper + end + end + + router.emit_stream(@tag, eventStream) if eventStream + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubePVInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + + rescue => errorStr + $log.warn "Failed in parse_and_emit_record for in_kube_pvinventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def getPVCInfo(item) + begin + if !item["spec"].nil? && !item["spec"]["claimRef"].nil? + claimRef = item["spec"]["claimRef"] + pvcNamespace = claimRef["namespace"] + pvcName = claimRef["name"] + return pvcNamespace, pvcName + end + rescue => errorStr + $log.warn "Failed in getPVCInfo for in_kube_pvinventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + + # No PVC or an error + return nil, nil + end + + def getTypeInfo(item) + begin + if !item["spec"].nil? + (Constants::PV_TYPES).each do |pvType| + + # PV is this type + if !item["spec"][pvType].nil? + + # Get additional info if azure disk/file + typeInfo = {} + if pvType == "azureDisk" + azureDisk = item["spec"]["azureDisk"] + typeInfo["DiskName"] = azureDisk["diskName"] + typeInfo["DiskUri"] = azureDisk["diskURI"] + elsif pvType == "azureFile" + typeInfo["FileShareName"] = item["spec"]["azureFile"]["shareName"] + end + + # Can only have one type: return right away when found + return pvType, typeInfo + + end + end + end + rescue => errorStr + $log.warn "Failed in getTypeInfo for in_kube_pvinventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + + # No matches from list of types or an error + return nil, {} + end + + + def run_periodic + @mutex.lock + done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval + until done + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now 
+ end + @condition.wait(@mutex, @waitTimeout) + done = @finished + @mutex.unlock + if !done + begin + $log.info("in_kube_pvinventory::run_periodic.enumerate.start #{Time.now.utc.iso8601}") + enumerate + $log.info("in_kube_pvinventory::run_periodic.enumerate.end #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn "in_kube_pvinventory::run_periodic: enumerate Failed to retrieve pod inventory: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + @mutex.lock + end + @mutex.unlock + end + + end # Kube_PVInventory_Input +end # module \ No newline at end of file diff --git a/source/plugins/ruby/in_kubestate_deployments.rb b/source/plugins/ruby/in_kubestate_deployments.rb index bcf397150..27e4709a2 100644 --- a/source/plugins/ruby/in_kubestate_deployments.rb +++ b/source/plugins/ruby/in_kubestate_deployments.rb @@ -2,230 +2,238 @@ # frozen_string_literal: true module Fluent - class Kube_Kubestate_Deployments_Input < Input - Plugin.register_input("kubestatedeployments", self) - @@istestvar = ENV["ISTEST"] - # telemetry - To keep telemetry cost reasonable, we keep track of the max deployments over a period of 15m - @@deploymentsCount = 0 - - - - def initialize - super - require "yajl/json_gem" - require "yajl" - require "date" - require "time" - - require_relative "KubernetesApiClient" - require_relative "oms_common" - require_relative "omslog" - require_relative "ApplicationInsightsUtility" - require_relative "constants" - - # roughly each deployment is 8k - # 1000 deployments account to approximately 8MB - @DEPLOYMENTS_CHUNK_SIZE = 1000 - @DEPLOYMENTS_API_GROUP = "apps" - @@telemetryLastSentTime = DateTime.now.to_time.to_i - - - @deploymentsRunningTotal = 0 - - @NodeName = OMS::Common.get_hostname - @ClusterId = KubernetesApiClient.getClusterId - @ClusterName = KubernetesApiClient.getClusterName - end - - config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => 
Constants::INSIGHTSMETRICS_FLUENT_TAG - - def configure(conf) - super - end - - def start - if @run_interval - @finished = false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) + class Kube_Kubestate_Deployments_Input < Input + Plugin.register_input("kubestatedeployments", self) + @@istestvar = ENV["ISTEST"] + # telemetry - To keep telemetry cost reasonable, we keep track of the max deployments over a period of 15m + @@deploymentsCount = 0 + + def initialize + super + require "yajl/json_gem" + require "yajl" + require "date" + require "time" + + require_relative "KubernetesApiClient" + require_relative "oms_common" + require_relative "omslog" + require_relative "ApplicationInsightsUtility" + require_relative "constants" + + # refer tomlparser-agent-config for defaults + # this configurable via configmap + @DEPLOYMENTS_CHUNK_SIZE = 0 + + @DEPLOYMENTS_API_GROUP = "apps" + @@telemetryLastSentTime = DateTime.now.to_time.to_i + + @deploymentsRunningTotal = 0 + + @NodeName = OMS::Common.get_hostname + @ClusterId = KubernetesApiClient.getClusterId + @ClusterName = KubernetesApiClient.getClusterName + end + + config_param :run_interval, :time, :default => 60 + config_param :tag, :string, :default => Constants::INSIGHTSMETRICS_FLUENT_TAG + + def configure(conf) + super + end + + def start + if @run_interval + if !ENV["DEPLOYMENTS_CHUNK_SIZE"].nil? && !ENV["DEPLOYMENTS_CHUNK_SIZE"].empty? 
&& ENV["DEPLOYMENTS_CHUNK_SIZE"].to_i > 0 + @DEPLOYMENTS_CHUNK_SIZE = ENV["DEPLOYMENTS_CHUNK_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kubestate_deployments::start: setting to default value since got DEPLOYMENTS_CHUNK_SIZE nil or empty") + @DEPLOYMENTS_CHUNK_SIZE = 500 end + $log.info("in_kubestate_deployments::start : DEPLOYMENTS_CHUNK_SIZE @ #{@DEPLOYMENTS_CHUNK_SIZE}") + + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) end - - def shutdown - if @run_interval - @mutex.synchronize { - @finished = true - @condition.signal - } - @thread.join - end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join end - - def enumerate - begin - deploymentList = nil - currentTime = Time.now - batchTime = currentTime.utc.iso8601 - - #set the running total for this batch to 0 - @deploymentsRunningTotal = 0 - - # Initializing continuation token to nil - continuationToken = nil - $log.info("in_kubestate_deployments::enumerate : Getting deployments from Kube API @ #{Time.now.utc.iso8601}") - continuationToken, deploymentList = KubernetesApiClient.getResourcesAndContinuationToken("deployments?limit=#{@DEPLOYMENTS_CHUNK_SIZE}", api_group: @DEPLOYMENTS_API_GROUP) - $log.info("in_kubestate_deployments::enumerate : Done getting deployments from Kube API @ #{Time.now.utc.iso8601}") + end + + def enumerate + begin + deploymentList = nil + currentTime = Time.now + batchTime = currentTime.utc.iso8601 + + #set the running total for this batch to 0 + @deploymentsRunningTotal = 0 + + # Initializing continuation token to nil + continuationToken = nil + $log.info("in_kubestate_deployments::enumerate : Getting deployments from Kube API @ #{Time.now.utc.iso8601}") + continuationToken, deploymentList = KubernetesApiClient.getResourcesAndContinuationToken("deployments?limit=#{@DEPLOYMENTS_CHUNK_SIZE}", 
api_group: @DEPLOYMENTS_API_GROUP) + $log.info("in_kubestate_deployments::enumerate : Done getting deployments from Kube API @ #{Time.now.utc.iso8601}") + if (!deploymentList.nil? && !deploymentList.empty? && deploymentList.key?("items") && !deploymentList["items"].nil? && !deploymentList["items"].empty?) + $log.info("in_kubestate_deployments::enumerate : number of deployment items :#{deploymentList["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + parse_and_emit_records(deploymentList, batchTime) + else + $log.warn "in_kubestate_deployments::enumerate:Received empty deploymentList" + end + + #If we receive a continuation token, make calls, process and flush data until we have processed all data + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, deploymentList = KubernetesApiClient.getResourcesAndContinuationToken("deployments?limit=#{@DEPLOYMENTS_CHUNK_SIZE}&continue=#{continuationToken}", api_group: @DEPLOYMENTS_API_GROUP) if (!deploymentList.nil? && !deploymentList.empty? && deploymentList.key?("items") && !deploymentList["items"].nil? && !deploymentList["items"].empty?) + $log.info("in_kubestate_deployments::enumerate : number of deployment items :#{deploymentList["items"].length} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(deploymentList, batchTime) else $log.warn "in_kubestate_deployments::enumerate:Received empty deploymentList" end - - #If we receive a continuation token, make calls, process and flush data until we have processed all data - while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, deploymentList = KubernetesApiClient.getResourcesAndContinuationToken("deployments?limit=#{@DEPLOYMENTS_CHUNK_SIZE}&continue=#{continuationToken}", api_group: @DEPLOYMENTS_API_GROUP) - if (!deploymentList.nil? && !deploymentList.empty? && deploymentList.key?("items") && !deploymentList["items"].nil? && !deploymentList["items"].empty?) 
- parse_and_emit_records(deploymentList, batchTime) - else - $log.warn "in_kubestate_deployments::enumerate:Received empty deploymentList" - end + end + + # Setting this to nil so that we dont hold memory until GC kicks in + deploymentList = nil + + $log.info("successfully emitted a total of #{@deploymentsRunningTotal} kube_state_deployment metrics") + # Flush AppInsights telemetry once all the processing is done, only if the number of events flushed is greater than 0 + if (@deploymentsRunningTotal > @@deploymentsCount) + @@deploymentsCount = @deploymentsRunningTotal + end + if (((DateTime.now.to_time.to_i - @@telemetryLastSentTime).abs) / 60) >= Constants::KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES + #send telemetry + $log.info "sending deployemt telemetry..." + ApplicationInsightsUtility.sendMetricTelemetry("MaxDeploymentCount", @@deploymentsCount, {}) + #reset last sent value & time + @@deploymentsCount = 0 + @@telemetryLastSentTime = DateTime.now.to_time.to_i + end + rescue => errorStr + $log.warn "in_kubestate_deployments::enumerate:Failed in enumerate: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::enumerate:Failed in enumerate: #{errorStr}") + end + end # end enumerate + + def parse_and_emit_records(deployments, batchTime = Time.utc.iso8601) + metricItems = [] + insightsMetricsEventStream = MultiEventStream.new + begin + metricInfo = deployments + metricInfo["items"].each do |deployment| + deploymentName = deployment["metadata"]["name"] + deploymentNameSpace = deployment["metadata"]["namespace"] + deploymentCreatedTime = "" + if !deployment["metadata"]["creationTimestamp"].nil? + deploymentCreatedTime = deployment["metadata"]["creationTimestamp"] + end + deploymentStrategy = "RollingUpdate" #default when not specified as per spec + if !deployment["spec"]["strategy"].nil? && !deployment["spec"]["strategy"]["type"].nil? 
+ deploymentStrategy = deployment["spec"]["strategy"]["type"] end - - # Setting this to nil so that we dont hold memory until GC kicks in - deploymentList = nil - - $log.info("successfully emitted a total of #{@deploymentsRunningTotal} kube_state_deployment metrics") - # Flush AppInsights telemetry once all the processing is done, only if the number of events flushed is greater than 0 - if (@deploymentsRunningTotal > @@deploymentsCount) - @@deploymentsCount = @deploymentsRunningTotal + deploymentSpecReplicas = 1 #default is 1 as per k8s spec + if !deployment["spec"]["replicas"].nil? + deploymentSpecReplicas = deployment["spec"]["replicas"] end - if (((DateTime.now.to_time.to_i - @@telemetryLastSentTime).abs)/60 ) >= Constants::KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES - #send telemetry - $log.info "sending deployemt telemetry..." - ApplicationInsightsUtility.sendMetricTelemetry("MaxDeploymentCount", @@deploymentsCount, {}) - #reset last sent value & time - @@deploymentsCount = 0 - @@telemetryLastSentTime = DateTime.now.to_time.to_i + deploymentStatusReadyReplicas = 0 + if !deployment["status"]["readyReplicas"].nil? + deploymentStatusReadyReplicas = deployment["status"]["readyReplicas"] end - rescue => errorStr - $log.warn "in_kubestate_deployments::enumerate:Failed in enumerate: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::enumerate:Failed in enumerate: #{errorStr}") + deploymentStatusUpToDateReplicas = 0 + if !deployment["status"]["updatedReplicas"].nil? + deploymentStatusUpToDateReplicas = deployment["status"]["updatedReplicas"] + end + deploymentStatusAvailableReplicas = 0 + if !deployment["status"]["availableReplicas"].nil? 
+ deploymentStatusAvailableReplicas = deployment["status"]["availableReplicas"] + end + + metricItem = {} + metricItem["CollectionTime"] = batchTime + metricItem["Computer"] = @NodeName + metricItem["Name"] = Constants::INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_DEPLOYMENT_STATE + metricItem["Value"] = deploymentStatusReadyReplicas + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = @ClusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = @ClusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_NAME] = deploymentName + metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = deploymentNameSpace + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STRATEGY] = deploymentStrategy + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME] = deploymentCreatedTime + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_SPEC_REPLICAS] = deploymentSpecReplicas + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_UPDATED] = deploymentStatusUpToDateReplicas + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_AVAILABLE] = deploymentStatusAvailableReplicas + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) end - end # end enumerate - - def parse_and_emit_records(deployments, batchTime = Time.utc.iso8601) - metricItems = [] - insightsMetricsEventStream = MultiEventStream.new - begin - metricInfo = deployments - metricInfo["items"].each do |deployment| - deploymentName = deployment["metadata"]["name"] - deploymentNameSpace = deployment["metadata"]["namespace"] - deploymentCreatedTime = "" - if !deployment["metadata"]["creationTimestamp"].nil? 
- deploymentCreatedTime = deployment["metadata"]["creationTimestamp"] - end - deploymentStrategy = "RollingUpdate" #default when not specified as per spec - if !deployment["spec"]["strategy"].nil? && !deployment["spec"]["strategy"]["type"].nil? - deploymentStrategy = deployment["spec"]["strategy"]["type"] - end - deploymentSpecReplicas = 1 #default is 1 as per k8s spec - if !deployment["spec"]["replicas"].nil? - deploymentSpecReplicas = deployment["spec"]["replicas"] - end - deploymentStatusReadyReplicas = 0 - if !deployment["status"]["readyReplicas"].nil? - deploymentStatusReadyReplicas = deployment["status"]["readyReplicas"] - end - deploymentStatusUpToDateReplicas = 0 - if !deployment["status"]["updatedReplicas"].nil? - deploymentStatusUpToDateReplicas = deployment["status"]["updatedReplicas"] - end - deploymentStatusAvailableReplicas = 0 - if !deployment["status"]["availableReplicas"].nil? - deploymentStatusAvailableReplicas = deployment["status"]["availableReplicas"] - end - - metricItem = {} - metricItem["CollectionTime"] = batchTime - metricItem["Computer"] = @NodeName - metricItem["Name"] = Constants::INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_DEPLOYMENT_STATE - metricItem["Value"] = deploymentStatusReadyReplicas - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE - - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = @ClusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = @ClusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_NAME] = deploymentName - metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = deploymentNameSpace - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STRATEGY ] = deploymentStrategy - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME] = deploymentCreatedTime - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_SPEC_REPLICAS] = 
deploymentSpecReplicas - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_UPDATED] = deploymentStatusUpToDateReplicas - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_AVAILABLE] = deploymentStatusAvailableReplicas - - - metricItem["Tags"] = metricTags - - metricItems.push(metricItem) - end - - time = Time.now.to_f - metricItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(time, wrapper) if wrapper - end - - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - $log.info("successfully emitted #{metricItems.length()} kube_state_deployment metrics") - @deploymentsRunningTotal = @deploymentsRunningTotal + metricItems.length() - if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) - $log.info("kubestatedeploymentsInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - rescue => error - $log.warn("in_kubestate_deployments::parse_and_emit_records failed: #{error} ") - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::parse_and_emit_records failed: #{error}") + + time = Time.now.to_f + metricItems.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(time, wrapper) if wrapper + end + + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + $log.info("successfully emitted #{metricItems.length()} kube_state_deployment metrics") + + @deploymentsRunningTotal = @deploymentsRunningTotal + metricItems.length() + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) + $log.info("kubestatedeploymentsInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - + rescue => error + $log.warn("in_kubestate_deployments::parse_and_emit_records failed: #{error} ") + ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::parse_and_emit_records failed: #{error}") end - - def run_periodic - @mutex.lock + end + + def run_periodic + @mutex.lock + done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval + until done + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) done = @finished - @nextTimeToRun = Time.now - @waitTimeout = @run_interval - until done - @nextTimeToRun = @nextTimeToRun + @run_interval - @now = Time.now - if @nextTimeToRun <= @now - @waitTimeout = 1 - @nextTimeToRun = @now - else - @waitTimeout = @nextTimeToRun - @now - end - @condition.wait(@mutex, @waitTimeout) - done = @finished - @mutex.unlock - if !done - begin - $log.info("in_kubestate_deployments::run_periodic.enumerate.start @ #{Time.now.utc.iso8601}") - enumerate - $log.info("in_kubestate_deployments::run_periodic.enumerate.end @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn "in_kubestate_deployments::run_periodic: enumerate Failed to retrieve kube deployments: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::run_periodic: enumerate Failed to retrieve kube deployments: #{errorStr}") - end + @mutex.unlock + if !done + begin + $log.info("in_kubestate_deployments::run_periodic.enumerate.start @ #{Time.now.utc.iso8601}") + enumerate + $log.info("in_kubestate_deployments::run_periodic.enumerate.end @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn "in_kubestate_deployments::run_periodic: enumerate Failed 
to retrieve kube deployments: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::run_periodic: enumerate Failed to retrieve kube deployments: #{errorStr}") end - @mutex.lock end - @mutex.unlock + @mutex.lock end + @mutex.unlock end -end \ No newline at end of file + end +end diff --git a/source/plugins/ruby/in_kubestate_hpa.rb b/source/plugins/ruby/in_kubestate_hpa.rb index 3ce63a75a..afecf8e3b 100644 --- a/source/plugins/ruby/in_kubestate_hpa.rb +++ b/source/plugins/ruby/in_kubestate_hpa.rb @@ -2,231 +2,236 @@ # frozen_string_literal: true module Fluent - class Kube_Kubestate_HPA_Input < Input - Plugin.register_input("kubestatehpa", self) - @@istestvar = ENV["ISTEST"] - - - def initialize - super - require "yajl/json_gem" - require "yajl" - require "time" - - require_relative "KubernetesApiClient" - require_relative "oms_common" - require_relative "omslog" - require_relative "ApplicationInsightsUtility" - require_relative "constants" - - # roughly each HPA is 3k - # 2000 HPAs account to approximately 6-7MB - @HPA_CHUNK_SIZE = 2000 - @HPA_API_GROUP = "autoscaling" - - # telemetry - @hpaCount = 0 - - @NodeName = OMS::Common.get_hostname - @ClusterId = KubernetesApiClient.getClusterId - @ClusterName = KubernetesApiClient.getClusterName - end - - config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => Constants::INSIGHTSMETRICS_FLUENT_TAG - - def configure(conf) - super - end - - def start - if @run_interval - @finished = false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) + class Kube_Kubestate_HPA_Input < Input + Plugin.register_input("kubestatehpa", self) + @@istestvar = ENV["ISTEST"] + + def initialize + super + require "yajl/json_gem" + require "yajl" + require "time" + + require_relative "KubernetesApiClient" + require_relative "oms_common" + require_relative "omslog" + require_relative "ApplicationInsightsUtility" + 
require_relative "constants" + + # refer tomlparser-agent-config for defaults + # this configurable via configmap + @HPA_CHUNK_SIZE = 0 + + @HPA_API_GROUP = "autoscaling" + + # telemetry + @hpaCount = 0 + + @NodeName = OMS::Common.get_hostname + @ClusterId = KubernetesApiClient.getClusterId + @ClusterName = KubernetesApiClient.getClusterName + end + + config_param :run_interval, :time, :default => 60 + config_param :tag, :string, :default => Constants::INSIGHTSMETRICS_FLUENT_TAG + + def configure(conf) + super + end + + def start + if @run_interval + if !ENV["HPA_CHUNK_SIZE"].nil? && !ENV["HPA_CHUNK_SIZE"].empty? && ENV["HPA_CHUNK_SIZE"].to_i > 0 + @HPA_CHUNK_SIZE = ENV["HPA_CHUNK_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kubestate_hpa::start: setting to default value since got HPA_CHUNK_SIZE nil or empty") + @HPA_CHUNK_SIZE = 2000 end + $log.info("in_kubestate_hpa::start : HPA_CHUNK_SIZE @ #{@HPA_CHUNK_SIZE}") + + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) end - - def shutdown - if @run_interval - @mutex.synchronize { - @finished = true - @condition.signal - } - @thread.join - end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join end - - def enumerate - begin - hpaList = nil - currentTime = Time.now - batchTime = currentTime.utc.iso8601 - - @hpaCount = 0 - - # Initializing continuation token to nil - continuationToken = nil - $log.info("in_kubestate_hpa::enumerate : Getting HPAs from Kube API @ #{Time.now.utc.iso8601}") - continuationToken, hpaList = KubernetesApiClient.getResourcesAndContinuationToken("horizontalpodautoscalers?limit=#{@HPA_CHUNK_SIZE}", api_group: @HPA_API_GROUP) - $log.info("in_kubestate_hpa::enumerate : Done getting HPAs from Kube API @ #{Time.now.utc.iso8601}") + end + + def enumerate + begin + hpaList = nil + currentTime = Time.now + 
batchTime = currentTime.utc.iso8601 + + @hpaCount = 0 + + # Initializing continuation token to nil + continuationToken = nil + $log.info("in_kubestate_hpa::enumerate : Getting HPAs from Kube API @ #{Time.now.utc.iso8601}") + continuationToken, hpaList = KubernetesApiClient.getResourcesAndContinuationToken("horizontalpodautoscalers?limit=#{@HPA_CHUNK_SIZE}", api_group: @HPA_API_GROUP) + $log.info("in_kubestate_hpa::enumerate : Done getting HPAs from Kube API @ #{Time.now.utc.iso8601}") + if (!hpaList.nil? && !hpaList.empty? && hpaList.key?("items") && !hpaList["items"].nil? && !hpaList["items"].empty?) + parse_and_emit_records(hpaList, batchTime) + else + $log.warn "in_kubestate_hpa::enumerate:Received empty hpaList" + end + + #If we receive a continuation token, make calls, process and flush data until we have processed all data + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, hpaList = KubernetesApiClient.getResourcesAndContinuationToken("horizontalpodautoscalers?limit=#{@HPA_CHUNK_SIZE}&continue=#{continuationToken}", api_group: @HPA_API_GROUP) if (!hpaList.nil? && !hpaList.empty? && hpaList.key?("items") && !hpaList["items"].nil? && !hpaList["items"].empty?) parse_and_emit_records(hpaList, batchTime) else $log.warn "in_kubestate_hpa::enumerate:Received empty hpaList" end - - #If we receive a continuation token, make calls, process and flush data until we have processed all data - while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, hpaList = KubernetesApiClient.getResourcesAndContinuationToken("horizontalpodautoscalers?limit=#{@HPA_CHUNK_SIZE}&continue=#{continuationToken}", api_group: @HPA_API_GROUP) - if (!hpaList.nil? && !hpaList.empty? && hpaList.key?("items") && !hpaList["items"].nil? && !hpaList["items"].empty?) 
- parse_and_emit_records(hpaList, batchTime) - else - $log.warn "in_kubestate_hpa::enumerate:Received empty hpaList" + end + + # Setting this to nil so that we dont hold memory until GC kicks in + hpaList = nil + + # Flush AppInsights telemetry once all the processing is done, only if the number of events flushed is greater than 0 + if (@hpaCount > 0) + # this will not be a useful telemetry, as hpa counts will not be huge, just log for now + $log.info("in_kubestate_hpa::hpaCount= #{hpaCount}") + #ApplicationInsightsUtility.sendMetricTelemetry("HPACount", @hpaCount, {}) + end + rescue => errorStr + $log.warn "in_kubestate_hpa::enumerate:Failed in enumerate: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::enumerate:Failed in enumerate: #{errorStr}") + end + end # end enumerate + + def parse_and_emit_records(hpas, batchTime = Time.utc.iso8601) + metricItems = [] + insightsMetricsEventStream = MultiEventStream.new + begin + metricInfo = hpas + metricInfo["items"].each do |hpa| + hpaName = hpa["metadata"]["name"] + hpaNameSpace = hpa["metadata"]["namespace"] + hpaCreatedTime = "" + if !hpa["metadata"]["creationTimestamp"].nil? + hpaCreatedTime = hpa["metadata"]["creationTimestamp"] + end + hpaSpecMinReplicas = 1 #default is 1 as per k8s spec + if !hpa["spec"]["minReplicas"].nil? + hpaSpecMinReplicas = hpa["spec"]["minReplicas"] + end + hpaSpecMaxReplicas = 0 + if !hpa["spec"]["maxReplicas"].nil? + hpaSpecMaxReplicas = hpa["spec"]["maxReplicas"] + end + hpaSpecScaleTargetKind = "" + hpaSpecScaleTargetName = "" + if !hpa["spec"]["scaleTargetRef"].nil? + if !hpa["spec"]["scaleTargetRef"]["kind"].nil? + hpaSpecScaleTargetKind = hpa["spec"]["scaleTargetRef"]["kind"] + end + if !hpa["spec"]["scaleTargetRef"]["name"].nil? 
+ hpaSpecScaleTargetName = hpa["spec"]["scaleTargetRef"]["name"] end end - - # Setting this to nil so that we dont hold memory until GC kicks in - hpaList = nil - - # Flush AppInsights telemetry once all the processing is done, only if the number of events flushed is greater than 0 - if (@hpaCount > 0) - # this will not be a useful telemetry, as hpa counts will not be huge, just log for now - $log.info("in_kubestate_hpa::hpaCount= #{hpaCount}") - #ApplicationInsightsUtility.sendMetricTelemetry("HPACount", @hpaCount, {}) + hpaStatusCurrentReplicas = 0 + if !hpa["status"]["currentReplicas"].nil? + hpaStatusCurrentReplicas = hpa["status"]["currentReplicas"] end - rescue => errorStr - $log.warn "in_kubestate_hpa::enumerate:Failed in enumerate: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::enumerate:Failed in enumerate: #{errorStr}") + hpaStatusDesiredReplicas = 0 + if !hpa["status"]["desiredReplicas"].nil? + hpaStatusDesiredReplicas = hpa["status"]["desiredReplicas"] + end + + hpaStatuslastScaleTime = "" + if !hpa["status"]["lastScaleTime"].nil? 
+ hpaStatuslastScaleTime = hpa["status"]["lastScaleTime"] + end + + metricItem = {} + metricItem["CollectionTime"] = batchTime + metricItem["Computer"] = @NodeName + metricItem["Name"] = Constants::INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_HPA_STATE + metricItem["Value"] = hpaStatusCurrentReplicas + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = @ClusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = @ClusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_NAME] = hpaName + metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = hpaNameSpace + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME] = hpaCreatedTime + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MIN_REPLICAS] = hpaSpecMinReplicas + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MAX_REPLICAS] = hpaSpecMaxReplicas + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_KIND] = hpaSpecScaleTargetKind + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_NAME] = hpaSpecScaleTargetName + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_DESIRED_REPLICAS] = hpaStatusDesiredReplicas + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_LAST_SCALE_TIME] = hpaStatuslastScaleTime + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) end - end # end enumerate - - def parse_and_emit_records(hpas, batchTime = Time.utc.iso8601) - metricItems = [] - insightsMetricsEventStream = MultiEventStream.new - begin - metricInfo = hpas - metricInfo["items"].each do |hpa| - hpaName = hpa["metadata"]["name"] - hpaNameSpace = hpa["metadata"]["namespace"] - hpaCreatedTime = "" - if !hpa["metadata"]["creationTimestamp"].nil? 
- hpaCreatedTime = hpa["metadata"]["creationTimestamp"] - end - hpaSpecMinReplicas = 1 #default is 1 as per k8s spec - if !hpa["spec"]["minReplicas"].nil? - hpaSpecMinReplicas = hpa["spec"]["minReplicas"] - end - hpaSpecMaxReplicas = 0 - if !hpa["spec"]["maxReplicas"].nil? - hpaSpecMaxReplicas = hpa["spec"]["maxReplicas"] - end - hpaSpecScaleTargetKind = "" - hpaSpecScaleTargetName = "" - if !hpa["spec"]["scaleTargetRef"].nil? - if !hpa["spec"]["scaleTargetRef"]["kind"].nil? - hpaSpecScaleTargetKind = hpa["spec"]["scaleTargetRef"]["kind"] - end - if !hpa["spec"]["scaleTargetRef"]["name"].nil? - hpaSpecScaleTargetName = hpa["spec"]["scaleTargetRef"]["name"] - end - - end - hpaStatusCurrentReplicas = 0 - if !hpa["status"]["currentReplicas"].nil? - hpaStatusCurrentReplicas = hpa["status"]["currentReplicas"] - end - hpaStatusDesiredReplicas = 0 - if !hpa["status"]["desiredReplicas"].nil? - hpaStatusDesiredReplicas = hpa["status"]["desiredReplicas"] - end - - hpaStatuslastScaleTime = "" - if !hpa["status"]["lastScaleTime"].nil? 
- hpaStatuslastScaleTime = hpa["status"]["lastScaleTime"] - end - - - metricItem = {} - metricItem["CollectionTime"] = batchTime - metricItem["Computer"] = @NodeName - metricItem["Name"] = Constants::INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_HPA_STATE - metricItem["Value"] = hpaStatusCurrentReplicas - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE - - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = @ClusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = @ClusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_NAME] = hpaName - metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = hpaNameSpace - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME] = hpaCreatedTime - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MIN_REPLICAS] = hpaSpecMinReplicas - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MAX_REPLICAS] = hpaSpecMaxReplicas - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_KIND] = hpaSpecScaleTargetKind - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_NAME] = hpaSpecScaleTargetName - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_DESIRED_REPLICAS] = hpaStatusDesiredReplicas - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_LAST_SCALE_TIME] = hpaStatuslastScaleTime - - - metricItem["Tags"] = metricTags - - metricItems.push(metricItem) - end - time = Time.now.to_f - metricItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(time, wrapper) if wrapper - end - - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - 
$log.info("successfully emitted #{metricItems.length()} kube_state_hpa metrics") - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) - $log.info("kubestatehpaInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - rescue => error - $log.warn("in_kubestate_hpa::parse_and_emit_records failed: #{error} ") - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::parse_and_emit_records failed: #{error}") + time = Time.now.to_f + metricItems.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(time, wrapper) if wrapper + end + + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + $log.info("successfully emitted #{metricItems.length()} kube_state_hpa metrics") + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) + $log.info("kubestatehpaInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - + rescue => error + $log.warn("in_kubestate_hpa::parse_and_emit_records failed: #{error} ") + ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::parse_and_emit_records failed: #{error}") end - - def run_periodic - @mutex.lock + end + + def run_periodic + @mutex.lock + done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval + until done + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) done = @finished - @nextTimeToRun = Time.now - @waitTimeout = @run_interval - until done - @nextTimeToRun = @nextTimeToRun + @run_interval - @now = Time.now - if @nextTimeToRun <= @now - @waitTimeout = 1 - @nextTimeToRun = @now - else - @waitTimeout = @nextTimeToRun - @now - end - @condition.wait(@mutex, @waitTimeout) - done = @finished - @mutex.unlock - if !done - begin - $log.info("in_kubestate_hpa::run_periodic.enumerate.start @ #{Time.now.utc.iso8601}") - enumerate - $log.info("in_kubestate_hpa::run_periodic.enumerate.end @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn "in_kubestate_hpa::run_periodic: enumerate Failed to retrieve kube hpas: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::run_periodic: enumerate Failed to retrieve kube hpas: #{errorStr}") - end + @mutex.unlock + if !done + begin + $log.info("in_kubestate_hpa::run_periodic.enumerate.start @ #{Time.now.utc.iso8601}") + enumerate + $log.info("in_kubestate_hpa::run_periodic.enumerate.end @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn "in_kubestate_hpa::run_periodic: enumerate Failed to retrieve kube hpas: #{errorStr}" + 
ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::run_periodic: enumerate Failed to retrieve kube hpas: #{errorStr}") end - @mutex.lock end - @mutex.unlock + @mutex.lock end + @mutex.unlock end -end \ No newline at end of file + end +end diff --git a/source/plugins/ruby/in_win_cadvisor_perf.rb b/source/plugins/ruby/in_win_cadvisor_perf.rb index 38868f2f5..9c267cf4f 100644 --- a/source/plugins/ruby/in_win_cadvisor_perf.rb +++ b/source/plugins/ruby/in_win_cadvisor_perf.rb @@ -10,7 +10,7 @@ class Win_CAdvisor_Perf_Input < Input def initialize super require "yaml" - require 'yajl/json_gem' + require "yajl/json_gem" require "time" require_relative "CAdvisorMetricsAPIClient" @@ -52,8 +52,6 @@ def shutdown def enumerate() time = Time.now.to_f begin - eventStream = MultiEventStream.new - insightsMetricsEventStream = MultiEventStream.new timeDifference = (DateTime.now.to_time.to_i - @@winNodeQueryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 @@istestvar = ENV["ISTEST"] @@ -70,6 +68,7 @@ def enumerate() @@winNodeQueryTimeTracker = DateTime.now.to_time.to_i end @@winNodes.each do |winNode| + eventStream = MultiEventStream.new metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: winNode, metricTime: Time.now.utc.iso8601) metricData.each do |record| if !record.empty? @@ -81,7 +80,6 @@ def enumerate() router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@mdmtag, eventStream) if eventStream - if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("winCAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -90,6 +88,7 @@ def enumerate() begin containerGPUusageInsightsMetricsDataItems = [] containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: winNode, metricTime: Time.now.utc.iso8601)) + insightsMetricsEventStream = MultiEventStream.new containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| wrapper = { @@ -101,14 +100,15 @@ def enumerate() end router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) $log.info("winCAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end + end rescue => errorStr $log.warn "Failed when processing GPU Usage metrics in_win_cadvisor_perf : #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end + end #end GPU InsightsMetrics items end diff --git a/source/plugins/ruby/kubelet_utils.rb b/source/plugins/ruby/kubelet_utils.rb index 599640d8f..e2c731b79 100644 --- a/source/plugins/ruby/kubelet_utils.rb +++ b/source/plugins/ruby/kubelet_utils.rb @@ -21,9 +21,11 @@ def get_node_capacity response = CAdvisorMetricsAPIClient.getAllMetricsCAdvisor(winNode: nil) if !response.nil? && !response.body.nil? 
all_metrics = response.body.split("\n") - cpu_capacity = all_metrics.select{|m| m.start_with?('machine_cpu_cores') && m.split.first.strip == 'machine_cpu_cores' }.first.split.last.to_f * 1000 + #cadvisor machine metrics can exist with (>=1.19) or without dimensions (<1.19) + #so just checking startswith of metric name would be good enough to pick the metric value from exposition format + cpu_capacity = all_metrics.select { |m| m.start_with?("machine_cpu_cores") }.first.split.last.to_f * 1000 @log.info "CPU Capacity #{cpu_capacity}" - memory_capacity_e = all_metrics.select{|m| m.start_with?('machine_memory_bytes') && m.split.first.strip == 'machine_memory_bytes' }.first.split.last + memory_capacity_e = all_metrics.select { |m| m.start_with?("machine_memory_bytes") }.first.split.last memory_capacity = BigDecimal(memory_capacity_e).to_f @log.info "Memory Capacity #{memory_capacity}" return [cpu_capacity, memory_capacity] @@ -87,9 +89,9 @@ def get_all_container_limits @log.info "cpuLimit: #{cpuLimit}" @log.info "memoryLimit: #{memoryLimit}" # Get cpu limit in nanocores - containerCpuLimitHash[key] = !cpuLimit.nil? ? KubernetesApiClient.getMetricNumericValue("cpu", cpuLimit) : 0 + containerCpuLimitHash[key] = !cpuLimit.nil? ? KubernetesApiClient.getMetricNumericValue("cpu", cpuLimit) : nil # Get memory limit in bytes - containerMemoryLimitHash[key] = !memoryLimit.nil? ? KubernetesApiClient.getMetricNumericValue("memory", memoryLimit) : 0 + containerMemoryLimitHash[key] = !memoryLimit.nil? ? KubernetesApiClient.getMetricNumericValue("memory", memoryLimit) : nil end end end diff --git a/source/plugins/ruby/kubernetes_container_inventory.rb b/source/plugins/ruby/kubernetes_container_inventory.rb index 4fe728579..82e36c8cc 100644 --- a/source/plugins/ruby/kubernetes_container_inventory.rb +++ b/source/plugins/ruby/kubernetes_container_inventory.rb @@ -50,30 +50,7 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa if !atLocation.nil? 
containerInventoryRecord["ImageId"] = imageIdValue[(atLocation + 1)..-1] end - end - # image is of the format - repository/image:imagetag - imageValue = containerStatus["image"] - if !imageValue.nil? && !imageValue.empty? - # Find delimiters in the string of format repository/image:imagetag - slashLocation = imageValue.index("/") - colonLocation = imageValue.index(":") - if !colonLocation.nil? - if slashLocation.nil? - # image:imagetag - containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] - else - # repository/image:imagetag - containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] - containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..(colonLocation - 1)] - end - containerInventoryRecord["ImageTag"] = imageValue[(colonLocation + 1)..-1] - end - elsif !imageIdValue.nil? && !imageIdValue.empty? - # Getting repo information from imageIdValue when no tag in ImageId - if !atLocation.nil? - containerInventoryRecord["Repository"] = imageIdValue[0..(atLocation - 1)] - end - end + end containerInventoryRecord["ExitCode"] = 0 isContainerTerminated = false isContainerWaiting = false @@ -107,6 +84,51 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa end containerInfoMap = containersInfoMap[containerName] + # image can be in any one of below format in spec + # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image + imageValue = containerInfoMap["image"] + if !imageValue.nil? && !imageValue.empty? + # Find delimiters in image format + atLocation = imageValue.index("@") + isDigestSpecified = false + if !atLocation.nil? + # repository/image@digest or repository/image:imagetag@digest, image@digest + imageValue = imageValue[0..(atLocation - 1)] + # Use Digest from the spec's image in case when the status doesnt get populated i.e. container in pending or image pull back etc. + if containerInventoryRecord["ImageId"].nil? 
|| containerInventoryRecord["ImageId"].empty? + containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1] + end + isDigestSpecified = true + end + slashLocation = imageValue.index("/") + colonLocation = imageValue.index(":") + if !colonLocation.nil? + if slashLocation.nil? + # image:imagetag + containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] + else + # repository/image:imagetag + containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] + containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..(colonLocation - 1)] + end + containerInventoryRecord["ImageTag"] = imageValue[(colonLocation + 1)..-1] + else + if slashLocation.nil? + # image + containerInventoryRecord["Image"] = imageValue + else + # repo/image + containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] + containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..-1] + end + # if no tag specified, k8s assumes latest as imagetag and this is same behavior from docker API and from status. 
+ # Ref - https://kubernetes.io/docs/concepts/containers/images/#image-names + if isDigestSpecified == false + containerInventoryRecord["ImageTag"] = "latest" + end + end + end + podName = containerInfoMap["PodName"] namespace = containerInfoMap["Namespace"] # containername in the format what docker sees @@ -165,6 +187,7 @@ def getContainersInfoMap(podItem, isWindows) podContainers.each do |container| containerInfoMap = {} containerName = container["name"] + containerInfoMap["image"] = container["image"] containerInfoMap["ElementName"] = containerName containerInfoMap["Computer"] = nodeName containerInfoMap["PodName"] = podName @@ -189,34 +212,47 @@ def getContainersInfoMap(podItem, isWindows) return containersInfoMap end - def obtainContainerEnvironmentVars(containerId) - $log.info("KubernetesContainerInventory::obtainContainerEnvironmentVars @ #{Time.now.utc.iso8601}") + def obtainContainerEnvironmentVars(containerId) envValueString = "" begin - unless @@containerCGroupCache.has_key?(containerId) - $log.info("KubernetesContainerInventory::obtainContainerEnvironmentVars fetching cGroup parent pid @ #{Time.now.utc.iso8601} for containerId: #{containerId}") + isCGroupPidFetchRequired = false + if !@@containerCGroupCache.has_key?(containerId) + isCGroupPidFetchRequired = true + else + cGroupPid = @@containerCGroupCache[containerId] + if cGroupPid.nil? || cGroupPid.empty? + isCGroupPidFetchRequired = true + @@containerCGroupCache.delete(containerId) + elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ") + isCGroupPidFetchRequired = true + @@containerCGroupCache.delete(containerId) + end + end + + if isCGroupPidFetchRequired Dir["/hostfs/proc/*/cgroup"].each do |filename| begin - if File.file?(filename) && File.foreach(filename).grep(/#{containerId}/).any? + if File.file?(filename) && File.exist?(filename) && File.foreach(filename).grep(/#{containerId}/).any? 
# file full path is /hostfs/proc//cgroup - cGroupPid = filename.split("/")[3] - if @@containerCGroupCache.has_key?(containerId) - tempCGroupPid = @@containerCGroupCache[containerId] - if tempCGroupPid > cGroupPid + cGroupPid = filename.split("/")[3] + if is_number?(cGroupPid) + if @@containerCGroupCache.has_key?(containerId) + tempCGroupPid = @@containerCGroupCache[containerId] + if tempCGroupPid.to_i > cGroupPid.to_i + @@containerCGroupCache[containerId] = cGroupPid + end + else @@containerCGroupCache[containerId] = cGroupPid - end - else - @@containerCGroupCache[containerId] = cGroupPid + end end end - rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read - end - end + rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read + end + end end cGroupPid = @@containerCGroupCache[containerId] if !cGroupPid.nil? && !cGroupPid.empty? - environFilePath = "/hostfs/proc/#{cGroupPid}/environ" - $log.info("KubernetesContainerInventory::obtainContainerEnvironmentVars cGroupPid: #{cGroupPid} environFilePath: #{environFilePath} for containerId: #{containerId}") + environFilePath = "/hostfs/proc/#{cGroupPid}/environ" if File.exist?(environFilePath) # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE # Check to see if the environment variable collection is disabled for this container. @@ -229,8 +265,7 @@ def obtainContainerEnvironmentVars(containerId) if !envVars.nil? && !envVars.empty? envVars = envVars.split("\0") envValueString = envVars.to_json - envValueStringLength = envValueString.length - $log.info("KubernetesContainerInventory::environment vars filename @ #{environFilePath} envVars size @ #{envValueStringLength}") + envValueStringLength = envValueString.length if envValueStringLength >= 200000 lastIndex = envValueString.rindex("\",") if !lastIndex.nil? 
@@ -341,5 +376,8 @@ def deleteCGroupCacheEntryForDeletedContainer(containerId) ApplicationInsightsUtility.sendExceptionTelemetry(error) end end + def is_number?(value) + true if Integer(value) rescue false + end end end diff --git a/source/plugins/ruby/lib/application_insights/channel/sender_base.rb b/source/plugins/ruby/lib/application_insights/channel/sender_base.rb index 33ac49286..bedbae4ee 100644 --- a/source/plugins/ruby/lib/application_insights/channel/sender_base.rb +++ b/source/plugins/ruby/lib/application_insights/channel/sender_base.rb @@ -66,12 +66,12 @@ def send(data_to_send) request.body = compressed_data if @proxy.nil? || @proxy.empty? http = Net::HTTP.new uri.hostname, uri.port - else + else http = Net::HTTP.new(uri.hostname, uri.port, @proxy[:addr], @proxy[:port], @proxy[:user], @proxy[:pass]) end if uri.scheme.downcase == 'https' http.use_ssl = true - http.verify_mode = OpenSSL::SSL::VERIFY_NONE + http.verify_mode = OpenSSL::SSL::VERIFY_PEER end response = http.request(request) diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index db3d2ff9e..6238eb51a 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -50,6 +50,10 @@ def initialize @cluster_identity = nil @isArcK8sCluster = false @get_access_token_backoff_expiry = Time.now + + @mdm_exceptions_hash = {} + @mdm_exceptions_count = 0 + @mdm_exception_telemetry_time_tracker = DateTime.now.to_time.to_i end def configure(conf) @@ -67,7 +71,11 @@ def start if aks_resource_id.to_s.empty? @log.info "Environment Variable AKS_RESOURCE_ID is not set.. " @can_send_data_to_mdm = false + elsif !aks_resource_id.downcase.include?("/microsoft.containerservice/managedclusters/") && !aks_resource_id.downcase.include?("/microsoft.kubernetes/connectedclusters/") + @log.info "MDM Metris not supported for this cluster type resource: #{aks_resource_id}" + @can_send_data_to_mdm = false end + if aks_region.to_s.empty? 
@log.info "Environment Variable AKS_REGION is not set.. " @can_send_data_to_mdm = false @@ -217,10 +225,49 @@ def format(tag, time, record) end end + def exception_aggregator(error) + begin + errorStr = error.to_s + if (@mdm_exceptions_hash[errorStr].nil?) + @mdm_exceptions_hash[errorStr] = 1 + else + @mdm_exceptions_hash[errorStr] += 1 + end + #Keeping track of all exceptions to send the total in the last flush interval as a metric + @mdm_exceptions_count += 1 + rescue => error + @log.info "Error in MDM exception_aggregator method: #{error}" + ApplicationInsightsUtility.sendExceptionTelemetry(error) + end + end + + def flush_mdm_exception_telemetry + begin + #Flush out exception telemetry as a metric for the last 30 minutes + timeDifference = (DateTime.now.to_time.to_i - @mdm_exception_telemetry_time_tracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= Constants::MDM_EXCEPTIONS_METRIC_FLUSH_INTERVAL) + telemetryProperties = {} + telemetryProperties["ExceptionsHashForFlushInterval"] = @mdm_exceptions_hash.to_json + telemetryProperties["FlushInterval"] = Constants::MDM_EXCEPTIONS_METRIC_FLUSH_INTERVAL + ApplicationInsightsUtility.sendMetricTelemetry(Constants::MDM_EXCEPTION_TELEMETRY_METRIC, @mdm_exceptions_count, telemetryProperties) + # Resetting values after flushing + @mdm_exceptions_count = 0 + @mdm_exceptions_hash = {} + @mdm_exception_telemetry_time_tracker = DateTime.now.to_time.to_i + end + rescue => error + @log.info "Error in flush_mdm_exception_telemetry method: #{error}" + ApplicationInsightsUtility.sendExceptionTelemetry(error) + end + end + # This method is called every flush interval. Send the buffer chunk to MDM. 
# 'chunk' is a buffer chunk that includes multiple formatted records def write(chunk) begin + # Adding this before trying to flush out metrics, since adding after can lead to metrics never being sent + flush_mdm_exception_telemetry if (!@first_post_attempt_made || (Time.now > @last_post_attempt_time + retry_mdm_post_wait_minutes * 60)) && @can_send_data_to_mdm post_body = [] chunk.msgpack_each { |(tag, record)| @@ -243,7 +290,8 @@ def write(chunk) end end rescue Exception => e - ApplicationInsightsUtility.sendExceptionTelemetry(e) + # Adding exceptions to hash to aggregate and send telemetry for all write errors + exception_aggregator(e) @log.info "Exception when writing to MDM: #{e}" raise e end @@ -278,7 +326,6 @@ def send_to_mdm(post_body) else @log.info "Failed to Post Metrics to MDM : #{e} Response: #{response}" end - #@log.info "MDM request : #{post_body}" @log.debug_backtrace(e.backtrace) if !response.code.empty? && response.code == 403.to_s @log.info "Response Code #{response.code} Updating @last_post_attempt_time" @@ -293,15 +340,15 @@ def send_to_mdm(post_body) @log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" raise e end + # Adding exceptions to hash to aggregate and send telemetry for all 400 error codes + exception_aggregator(e) rescue Errno::ETIMEDOUT => e @log.info "Timed out when POSTing Metrics to MDM : #{e} Response: #{response}" @log.debug_backtrace(e.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(e) raise e rescue Exception => e @log.info "Exception POSTing Metrics to MDM : #{e} Response: #{response}" @log.debug_backtrace(e.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(e) raise e end end diff --git a/source/plugins/ruby/podinventory_to_mdm.rb b/source/plugins/ruby/podinventory_to_mdm.rb index 834515969..d9cb71bd4 100644 --- a/source/plugins/ruby/podinventory_to_mdm.rb +++ b/source/plugins/ruby/podinventory_to_mdm.rb @@ -80,14 +80,15 @@ class Inventory2MdmConvertor 
@@pod_phase_values = ["Running", "Pending", "Succeeded", "Failed", "Unknown"] @process_incoming_stream = false - def initialize(custom_metrics_azure_regions) + def initialize() @log_path = "/var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log" @log = Logger.new(@log_path, 1, 5000000) @pod_count_hash = {} @no_phase_dim_values_hash = {} @pod_count_by_phase = {} @pod_uids = {} - @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability(custom_metrics_azure_regions) + @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability + @metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" @log.debug { "Starting podinventory_to_mdm plugin" } end @@ -259,7 +260,7 @@ def process_record_for_terminated_job_metric(podControllerNameDimValue, podNames if !containerFinishedTime.nil? && !containerFinishedTime.empty? finishedTimeParsed = Time.parse(containerFinishedTime) # Check to see if job was completed 6 hours ago/STALE_JOB_TIME_IN_MINUTES - if ((Time.now - finishedTimeParsed) / 60) > Constants::STALE_JOB_TIME_IN_MINUTES + if ((Time.now - finishedTimeParsed) / 60) > @metric_threshold_hash[Constants::JOB_COMPLETION_TIME] MdmMetricsGenerator.generateStaleJobCountMetrics(podControllerNameDimValue, podNamespaceDimValue) end diff --git a/test/containerlog-scale-tests/400logspersec-2klogentrysize.yaml b/test/containerlog-scale-tests/400logspersec-2klogentrysize.yaml new file mode 100644 index 000000000..cc3dd5259 --- /dev/null +++ b/test/containerlog-scale-tests/400logspersec-2klogentrysize.yaml @@ -0,0 +1,60 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: logs-400persec-2kentrysize +spec: + parallelism: 1 + completions: 1 + template: + metadata: + name: logs-400persec-2kentrysize + spec: + volumes: + - name: logs-400persec-2kentrysize-scripts-volume + configMap: + name: 
logs-400persec-test-scripts + containers: + - name: logs-400persec-2kentrysize + image: ubuntu + volumeMounts: + - mountPath: /logs-400persec-test-scripts + name: logs-400persec-2kentrysize-scripts-volume + env: + - name: HOME + value: /tmp + command: + - /bin/sh + - -c + - | + echo "scripts in /logs-400persec-test-scripts" + ls -lh /logs-400persec-test-scripts + echo "copy scripts to /tmp" + cp /logs-400persec-test-scripts/*.sh /tmp + echo "apply 'chmod +x' to /tmp/*.sh" + chmod +x /tmp/*.sh + echo "script.sh in /tmp" + ls -lh /tmp + /tmp/script.sh + restartPolicy: Never +--- +apiVersion: v1 +items: +- apiVersion: v1 + data: + script.sh: | + #!/bin/bash + logentry='' + for var in {1..400..1} + do + logentry="${logentry}Test-" + done + for var in {1..200000..1} + do + echo $(date "+%Y/%m/%d %H:%M:%S.%3N") ${var}: $logentry + done + kind: ConfigMap + metadata: + creationTimestamp: null + name: logs-400persec-test-scripts +kind: List +metadata: {} diff --git a/test/containerlog-scale-tests/400logspersec-5klogentrysize.yaml b/test/containerlog-scale-tests/400logspersec-5klogentrysize.yaml new file mode 100644 index 000000000..42188631a --- /dev/null +++ b/test/containerlog-scale-tests/400logspersec-5klogentrysize.yaml @@ -0,0 +1,60 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: logs-400persec-5kentrysize +spec: + parallelism: 1 + completions: 1 + template: + metadata: + name: logs-400persec-5kentrysize + spec: + volumes: + - name: logs-400persec-5kentrysize-scripts-volume + configMap: + name: logs-400persec-5kentrysize-test-scripts + containers: + - name: logs-400persec-5kentrysize + image: ubuntu + volumeMounts: + - mountPath: /logs-400persec-5kentrysize-test-scripts + name: logs-400persec-5kentrysize-scripts-volume + env: + - name: HOME + value: /tmp + command: + - /bin/sh + - -c + - | + echo "scripts in /logs-400persec-5kentrysize-test-scripts" + ls -lh /logs-400persec-5kentrysize-test-scripts + echo "copy scripts to /tmp" + cp 
/logs-400persec-5kentrysize-test-scripts/*.sh /tmp + echo "apply 'chmod +x' to /tmp/*.sh" + chmod +x /tmp/*.sh + echo "script.sh in /tmp" + ls -lh /tmp + /tmp/script.sh + restartPolicy: Never +--- +apiVersion: v1 +items: +- apiVersion: v1 + data: + script.sh: | + #!/bin/bash + logentry='' + for var in {1..1024..1} + do + logentry="${logentry}Test-" + done + for var in {1..200000..1} + do + echo $(date "+%Y/%m/%d %H:%M:%S.%3N") ${var}: $logentry + done + kind: ConfigMap + metadata: + creationTimestamp: null + name: logs-400persec-5kentrysize-test-scripts +kind: List +metadata: {} diff --git a/test/containerlog-scale-tests/ci-log-scale-4kpersec-5klogline.yaml b/test/containerlog-scale-tests/ci-log-scale-4kpersec-5klogline.yaml new file mode 100644 index 000000000..ff619a822 --- /dev/null +++ b/test/containerlog-scale-tests/ci-log-scale-4kpersec-5klogline.yaml @@ -0,0 +1,60 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: ci-log-scale +spec: + parallelism: 1 + completions: 1 + template: + metadata: + name: ci-log-scale + spec: + volumes: + - name: ci-log-scale-scripts-volume + configMap: + name: test-scripts + containers: + - name: ci-log-scale + image: ubuntu + volumeMounts: + - mountPath: /test-scripts + name: ci-log-scale-scripts-volume + env: + - name: HOME + value: /tmp + command: + - /bin/sh + - -c + - | + echo "scripts in /test-scripts" + ls -lh /test-scripts + echo "copy scripts to /tmp" + cp /test-scripts/*.sh /tmp + echo "apply 'chmod +x' to /tmp/*.sh" + chmod +x /tmp/*.sh + echo "script.sh in /tmp" + ls -lh /tmp + /tmp/script.sh + restartPolicy: Never +--- +apiVersion: v1 +items: +- apiVersion: v1 + data: + script.sh: | + #!/bin/bash + logentry='' + for var in {1..1024..1} + do + logentry="${logentry}Test-" + done + for var in {1..200000..1} + do + echo $(date "+%Y/%m/%d %H:%M:%S.%3N") ${var}: $logentry + done + kind: ConfigMap + metadata: + creationTimestamp: null + name: test-scripts +kind: List +metadata: {} diff --git a/test/e2e/e2e-tests.yaml 
b/test/e2e/e2e-tests.yaml new file mode 100644 index 000000000..06dfa1fb0 --- /dev/null +++ b/test/e2e/e2e-tests.yaml @@ -0,0 +1,178 @@ + +--- +apiVersion: v1 +kind: Namespace +metadata: + name: sonobuoy +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + component: sonobuoy + name: sonobuoy-serviceaccount + namespace: sonobuoy +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + component: sonobuoy + namespace: sonobuoy + name: sonobuoy-serviceaccount-sonobuoy +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: sonobuoy-serviceaccount-sonobuoy +subjects: +- kind: ServiceAccount + name: sonobuoy-serviceaccount + namespace: sonobuoy +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + component: sonobuoy + namespace: sonobuoy + name: sonobuoy-serviceaccount-sonobuoy +rules: +- apiGroups: + - '*' + resources: + - '*' + verbs: + - '*' +- nonResourceURLs: + - '/metrics' + - '/logs' + - '/logs/*' + verbs: + - 'get' +--- +apiVersion: v1 +data: + config.json: | + 
{"Description":"DEFAULT","UUID":"bf5c02ed-1948-48f1-b12d-5a2d74435e46","Version":"v0.20.0","ResultsDir":"/tmp/sonobuoy","Resources":["apiservices","certificatesigningrequests","clusterrolebindings","clusterroles","componentstatuses","configmaps","controllerrevisions","cronjobs","customresourcedefinitions","daemonsets","deployments","endpoints","ingresses","jobs","leases","limitranges","mutatingwebhookconfigurations","namespaces","networkpolicies","nodes","persistentvolumeclaims","persistentvolumes","poddisruptionbudgets","pods","podlogs","podsecuritypolicies","podtemplates","priorityclasses","replicasets","replicationcontrollers","resourcequotas","rolebindings","roles","servergroups","serverversion","serviceaccounts","services","statefulsets","storageclasses","validatingwebhookconfigurations","volumeattachments"],"Filters":{"Namespaces":".*","LabelSelector":""},"Limits":{"PodLogs":{"Namespaces":"","SonobuoyNamespace":true,"FieldSelectors":[],"LabelSelector":"","Previous":false,"SinceSeconds":null,"SinceTime":null,"Timestamps":false,"TailLines":null,"LimitBytes":null,"LimitSize":"","LimitTime":""}},"QPS":30,"Burst":50,"Server":{"bindaddress":"0.0.0.0","bindport":8080,"advertiseaddress":"","timeoutseconds":10800},"Plugins":null,"PluginSearchPath":["./plugins.d","/etc/sonobuoy/plugins.d","~/sonobuoy/plugins.d"],"Namespace":"sonobuoy","WorkerImage":"sonobuoy/sonobuoy:v0.20.0","ImagePullPolicy":"IfNotPresent","ImagePullSecrets":"","ProgressUpdatesPort":"8099"} +kind: ConfigMap +metadata: + labels: + component: sonobuoy + name: sonobuoy-config-cm + namespace: sonobuoy +--- +apiVersion: v1 +data: + plugin-0.yaml: | + podSpec: + containers: [] + restartPolicy: Never + serviceAccountName: sonobuoy-serviceaccount + nodeSelector: + kubernetes.io/os: linux + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/master + operator: Exists + - key: CriticalAddonsOnly + operator: Exists + - key: kubernetes.io/e2e-evict-taint-key + operator: Exists + sonobuoy-config: + 
driver: Job + plugin-name: agenttests + result-format: junit + spec: + env: + # Update values of CLIENT_ID, CLIENT_SECRET of the service principal which has permission to query LA ad Metrics API + # Update value of TENANT_ID corresponding your Azure Service principal + - name: CLIENT_ID + value: "SP_CLIENT_ID_VALUE" + - name: CLIENT_SECRET + value: "CLIENT_SECRET_VALUE" + - name: TENANT_ID + value: "SP_TENANT_ID_VALUE" + - name: DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES + value: "10" + - name: DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES + value: "10" + - name: AGENT_POD_EXPECTED_RESTART_COUNT + value: "0" + - name: AZURE_CLOUD + value: "AZURE_PUBLIC_CLOUD" + # image tag should be updated if new tests being added after this image + image: mcr.microsoft.com/azuremonitor/containerinsights/cidev:ciagenttest02152021 + imagePullPolicy: IfNotPresent + name: plugin + resources: {} + volumeMounts: + - mountPath: /tmp/results + name: results +kind: ConfigMap +metadata: + labels: + component: sonobuoy + name: sonobuoy-plugins-cm + namespace: sonobuoy +--- +apiVersion: v1 +kind: Pod +metadata: + labels: + component: sonobuoy + run: sonobuoy-master + sonobuoy-component: aggregator + tier: analysis + name: sonobuoy + namespace: sonobuoy +spec: + containers: + - env: + - name: SONOBUOY_ADVERTISE_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: sonobuoy/sonobuoy:v0.20.0 + imagePullPolicy: IfNotPresent + name: kube-sonobuoy + volumeMounts: + - mountPath: /etc/sonobuoy + name: sonobuoy-config-volume + - mountPath: /plugins.d + name: sonobuoy-plugins-volume + - mountPath: /tmp/sonobuoy + name: output-volume + restartPolicy: Never + serviceAccountName: sonobuoy-serviceaccount + nodeSelector: + kubernetes.io/os: linux + tolerations: + - key: "kubernetes.io/e2e-evict-taint-key" + operator: "Exists" + volumes: + - configMap: + name: sonobuoy-config-cm + name: sonobuoy-config-volume + - configMap: + name: sonobuoy-plugins-cm + name: sonobuoy-plugins-volume + - emptyDir: {} + 
name: output-volume +--- +apiVersion: v1 +kind: Service +metadata: + labels: + component: sonobuoy + sonobuoy-component: aggregator + name: sonobuoy-aggregator + namespace: sonobuoy +spec: + ports: + - port: 8080 + protocol: TCP + targetPort: 8080 + selector: + sonobuoy-component: aggregator + type: ClusterIP + diff --git a/test/e2e/src/common/arm_rest_utility.py b/test/e2e/src/common/arm_rest_utility.py new file mode 100644 index 000000000..604f8b791 --- /dev/null +++ b/test/e2e/src/common/arm_rest_utility.py @@ -0,0 +1,25 @@ +import adal +import pytest + +from msrestazure.azure_active_directory import AADTokenCredentials + + +# Function to fetch aad token from spn id and password +def fetch_aad_token(client_id, client_secret, authority_uri, resource_uri): + """ + Authenticate using service principal w/ key. + """ + try: + context = adal.AuthenticationContext(authority_uri, api_version=None) + return context.acquire_token_with_client_credentials(resource_uri, client_id, client_secret) + except Exception as e: + pytest.fail("Error occured while fetching aad token: " + str(e)) + + +# Function that returns aad token credentials for a given spn +def fetch_aad_token_credentials(client_id, client_secret, authority_uri, resource_uri): + mgmt_token = fetch_aad_token(client_id, client_secret, authority_uri, resource_uri) + try: + return AADTokenCredentials(mgmt_token, client_id) + except Exception as e: + pytest.fail("Error occured while fetching credentials: " + str(e)) diff --git a/test/e2e/src/common/constants.py b/test/e2e/src/common/constants.py new file mode 100644 index 000000000..770964cb5 --- /dev/null +++ b/test/e2e/src/common/constants.py @@ -0,0 +1,119 @@ +AZURE_PUBLIC_CLOUD_ENDPOINTS = { + "activeDirectory": "https://login.microsoftonline.com/", + "activeDirectoryDataLakeResourceId": "https://datalake.azure.net/", + "activeDirectoryGraphResourceId": "https://graph.windows.net/", + "activeDirectoryResourceId": "https://management.core.windows.net/", + 
"appInsights": "https://api.applicationinsights.io", + "appInsightsTelemetryChannel": "https://dc.applicationinsights.azure.com/v2/track", + "batchResourceId": "https://batch.core.windows.net/", + "gallery": "https://gallery.azure.com/", + "logAnalytics": "https://api.loganalytics.io", + "management": "https://management.core.windows.net/", + "mediaResourceId": "https://rest.media.azure.net", + "microsoftGraphResourceId": "https://graph.microsoft.com/", + "ossrdbmsResourceId": "https://ossrdbms-aad.database.windows.net", + "resourceManager": "https://management.azure.com/", + "sqlManagement": "https://management.core.windows.net:8443/", + "vmImageAliasDoc": "https://raw.githubusercontent.com/Azure/azure-rest-api-specs/master/arm-compute/quickstart-templates/aliases.json" +} + +AZURE_DOGFOOD_ENDPOINTS = { + "activeDirectory": "https://login.windows-ppe.net/", + "activeDirectoryDataLakeResourceId": None, + "activeDirectoryGraphResourceId": "https://graph.ppe.windows.net/", + "activeDirectoryResourceId": "https://management.core.windows.net/", + "appInsights": None, + "appInsightsTelemetryChannel": None, + "batchResourceId": None, + "gallery": "https://df.gallery.azure-test.net/", + "logAnalytics": None, + "management": "https://management-preview.core.windows-int.net/", + "mediaResourceId": None, + "microsoftGraphResourceId": None, + "ossrdbmsResourceId": None, + "resourceManager": "https://api-dogfood.resources.windows-int.net/", + "sqlManagement": None, + "vmImageAliasDoc": None +} + +AZURE_CLOUD_DICT = {"AZURE_PUBLIC_CLOUD" : AZURE_PUBLIC_CLOUD_ENDPOINTS, "AZURE_DOGFOOD": AZURE_DOGFOOD_ENDPOINTS} + +TIMEOUT = 300 + +# Azure Monitor for Container Extension related +AGENT_RESOURCES_NAMESPACE = 'kube-system' +AGENT_DEPLOYMENT_NAME = 'omsagent-rs' +AGENT_DAEMONSET_NAME = 'omsagent' +AGENT_WIN_DAEMONSET_NAME = 'omsagent-win' + +AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR = 'rsName=omsagent-rs' +AGENT_DAEMON_SET_PODS_LABEL_SELECTOR = 'component=oms-agent' 
+AGENT_OMSAGENT_LOG_PATH = '/var/opt/microsoft/omsagent/log/omsagent.log' +AGENT_REPLICASET_WORKFLOWS = ["kubePodInventoryEmitStreamSuccess", "kubeNodeInventoryEmitStreamSuccess"] + +# override this through setting enviornment variable if the expected restart count is > 0 for example applying configmap +AGENT_POD_EXPECTED_RESTART_COUNT = 0 + +# replicaset workflow streams +KUBE_POD_INVENTORY_EMIT_STREAM = "kubePodInventoryEmitStreamSuccess" +KUBE_NODE_INVENTORY_EMIT_STREAM = "kubeNodeInventoryEmitStreamSuccess" +KUBE_DEPLOYMENT_INVENTORY_EMIT_STREAM = "kubestatedeploymentsInsightsMetricsEmitStreamSuccess" +KUBE_CONTAINER_PERF_EMIT_STREAM = "kubeContainerPerfEventEmitStreamSuccess" +KUBE_SERVICES_EMIT_STREAM = "kubeServicesEventEmitStreamSuccess" +KUBE_CONTAINER_NODE_INVENTORY_EMIT_STREAM = "containerNodeInventoryEmitStreamSuccess" +KUBE_EVENTS_EMIT_STREAM = "kubeEventsInventoryEmitStreamSuccess" +# daemonset workflow streams +CONTAINER_PERF_EMIT_STREAM = "cAdvisorPerfEmitStreamSuccess" +CONTAINER_INVENTORY_EMIT_STREAM = "containerInventoryEmitStreamSuccess" + +# simple log analytics queries to validate for e2e workflows +DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES = 10 +KUBE_POD_INVENTORY_QUERY = "KubePodInventory | where TimeGenerated > ago({0}) | count" +KUBE_NODE_INVENTORY_QUERY = "KubeNodeInventory | where TimeGenerated > ago({0}) | count" +KUBE_SERVICES_QUERY = "KubeServices | where TimeGenerated > ago({0}) | count" +KUBE_EVENTS_QUERY = "KubeEvents | where TimeGenerated > ago({0}) | count" +CONTAINER_NODE_INVENTORY_QUERY = "ContainerNodeInventory | where TimeGenerated > ago({0}) | count" +CONTAINER_INVENTORY_QUERY = "ContainerInventory | where TimeGenerated > ago({0}) | count" +# node perf +NODE_PERF_CPU_CAPCITY_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'cpuCapacityNanoCores' | where TimeGenerated > ago({0}) | count" +NODE_PERF_MEMORY_CAPCITY_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'memoryCapacityBytes' | 
where TimeGenerated > ago({0}) | count" +NODE_PERF_CPU_ALLOCATABLE_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'cpuAllocatableNanoCores' | where TimeGenerated > ago({0}) | count" +NODE_PERF_MEMORY_ALLOCATABLE_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'memoryAllocatableBytes' | where TimeGenerated > ago({0}) | count" +NODE_PERF_CPU_USAGE_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'cpuUsageNanoCores' | where TimeGenerated > ago({0}) | count" +NODE_PERF_MEMORY_RSS_USAGE_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'memoryRssBytes' | where TimeGenerated > ago({0}) | count" +NODE_PERF_MEMORY_WS_USAGE_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName =='memoryWorkingSetBytes' | where TimeGenerated > ago({0}) | count" +NODE_PERF_RESTART_TIME_EPOCH_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'restartTimeEpoch' | where TimeGenerated > ago({0}) | count" +# container perf +CONTAINER_PERF_CPU_LIMITS_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'cpuLimitNanoCores' | where TimeGenerated > ago({0}) | count" +CONTAINER_PERF_MEMORY_LIMITS_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'memoryLimitBytes' | where TimeGenerated > ago({0}) | count" +CONTAINER_PERF_CPU_REQUESTS_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'cpuRequestNanoCores' | where TimeGenerated > ago({0}) | count" +CONTAINER_PERF_MEMORY_REQUESTS_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'memoryRequestBytes' | where TimeGenerated > ago({0}) | count" +CONTAINER_PERF_CPU_USAGE_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'cpuUsageNanoCores' | where TimeGenerated > ago({0}) | count" +CONTAINER_PERF_MEMORY_RSS_USAGE_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'memoryRssBytes' | where TimeGenerated > ago({0}) 
| count" +CONTAINER_PERF_MEMORY_WS_USAGE_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'memoryWorkingSetBytes' | where TimeGenerated > ago({0}) | count" +CONTAINER_PERF_RESTART_TIME_EPOCH_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'restartTimeEpoch' | where TimeGenerated > ago({0}) | count" +# container log +CONTAINER_LOG_QUERY = "ContainerLog | where TimeGenerated > ago({0}) | count" +# insights metrics +INSIGHTS_METRICS_QUERY = "InsightsMetrics | where TimeGenerated > ago({0}) | count" + +# custom metrics +METRICS_API_VERSION = '2019-07-01' +DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES = 10 + +# node metrics +NODE_METRICS_NAMESPACE = 'insights.container/nodes' +NODE_METRIC_METRIC_AGGREGATION = 'average' +NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME = 'cpuUsageMilliCores' +NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME = 'cpuUsagePercentage' +NODE_MEMORY_RSS_METRIC_NAME = 'memoryRssBytes' +NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME = 'memoryRssPercentage' +NODE_MEMORY_WS_METRIC_NAME = 'memoryWorkingSetBytes' +NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME = 'memoryWorkingSetPercentage' +NODE_COUNT_METRIC_NAME = 'nodesCount' +NODE_DISK_USAGE_PERCENTAGE_METRIC_NAME = 'diskUsedPercentage(Preview)' + +# pod metrics +POD_METRICS_NAMESPACE = 'insights.container/pods' +POD_METRIC_METRIC_AGGREGATION = 'average' +POD_COUNT_METRIC_NAME = 'PodCount' diff --git a/test/e2e/src/common/helm_utility.py b/test/e2e/src/common/helm_utility.py new file mode 100644 index 000000000..6eac1e071 --- /dev/null +++ b/test/e2e/src/common/helm_utility.py @@ -0,0 +1,68 @@ +import os +import pytest +import subprocess + + +# Function to pull helm charts +def pull_helm_chart(registry_path): + os.environ['HELM_EXPERIMENTAL_OCI'] = '1' + cmd_helm_chart_pull = ["helm", "chart", "pull", registry_path] + response_helm_chart_pull = subprocess.Popen(cmd_helm_chart_pull, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output_helm_chart_pull, error_helm_chart_pull = 
response_helm_chart_pull.communicate() + if response_helm_chart_pull.returncode != 0: + pytest.fail("Unable to pull helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) + return output_helm_chart_pull.decode("ascii") + + +# Function to export helm charts +def export_helm_chart(registry_path, destination): + cmd_helm_chart_export = ["helm", "chart", "export", registry_path, "--destination", destination] + response_helm_chart_export = subprocess.Popen(cmd_helm_chart_export, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output_helm_chart_export, error_helm_chart_export = response_helm_chart_export.communicate() + if response_helm_chart_export.returncode != 0: + pytest.fail("Unable to export helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) + return output_helm_chart_export.decode("ascii") + + +# Function to add a helm repository +def add_helm_repo(repo_name, repo_url): + cmd_helm_repo = ["helm", "repo", "add", repo_name, repo_url] + response_helm_repo = subprocess.Popen(cmd_helm_repo, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output_helm_repo, error_helm_repo = response_helm_repo.communicate() + if response_helm_repo.returncode != 0: + pytest.fail("Unable to add repository {} to helm: ".format(repo_url) + error_helm_repo.decode("ascii")) + return output_helm_repo.decode("ascii") + + +# Function to install helm charts +def install_helm_chart(helm_release_name, helm_release_namespace, helm_chart_path, wait=False, **kwargs): + cmd_helm_install = ["helm", "install", helm_release_name, helm_chart_path, "--namespace", helm_release_namespace] + if wait: + cmd_helm_install.extend(["--wait"]) + for key, value in kwargs.items(): + cmd_helm_install.extend(["--set", "{}={}".format(key, value)]) + response_helm_install = subprocess.Popen(cmd_helm_install, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output_helm_install, error_helm_install = 
response_helm_install.communicate() + if response_helm_install.returncode != 0: + pytest.fail("Unable to install helm release: " + error_helm_install.decode("ascii")) + return output_helm_install.decode("ascii") + + +# Function to delete helm chart +def delete_helm_release(helm_release_name, helm_release_namespace): + cmd_helm_delete = ["helm", "delete", helm_release_name, "--namespace", helm_release_namespace] + response_helm_delete = subprocess.Popen(cmd_helm_delete, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output_helm_delete, error_helm_delete = response_helm_delete.communicate() + if response_helm_delete.returncode != 0: + pytest.fail("Error occured while deleting the helm release: " + error_helm_delete.decode("ascii")) + return output_helm_delete.decode("ascii") + + +# Function to list helm release +def list_helm_release(helm_release_namespace): + cmd_helm_list = ["helm", "list", "--namespace", helm_release_namespace] + response_helm_list = subprocess.Popen(cmd_helm_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output_helm_list, error_helm_list = response_helm_list.communicate() + if response_helm_list.returncode != 0: + pytest.fail("Error occured while fetching the helm release: " + error_helm_list.decode("ascii")) + return output_helm_list.decode("ascii") diff --git a/test/e2e/src/common/kubernetes_configmap_utility.py b/test/e2e/src/common/kubernetes_configmap_utility.py new file mode 100644 index 000000000..caee9628e --- /dev/null +++ b/test/e2e/src/common/kubernetes_configmap_utility.py @@ -0,0 +1,8 @@ +import pytest + + +def get_namespaced_configmap(api_instance, namespace, configmap_name): + try: + return api_instance.read_namespaced_config_map(configmap_name, namespace) + except Exception as e: + pytest.fail("Error occured when retrieving configmap: " + str(e)) diff --git a/test/e2e/src/common/kubernetes_crd_utility.py b/test/e2e/src/common/kubernetes_crd_utility.py new file mode 100644 index 000000000..f84092878 --- /dev/null +++ 
b/test/e2e/src/common/kubernetes_crd_utility.py @@ -0,0 +1,27 @@ +import pytest + +from kubernetes import watch + + +# Function to get the CRD instance +def get_crd_instance(api_instance, group, version, namespace, plural, crd_name): + try: + return api_instance.get_namespaced_custom_object(group, version, namespace, plural, crd_name) + except Exception as e: + pytest.fail("Error occurred when retrieving crd information: " + str(e)) + + +# Function that watches events corresponding to given CRD instance and passes the events to a callback function +def watch_crd_instance(api_instance, group, version, namespace, plural, crd_name, timeout, callback=None): + if not callback: + pytest.fail("callback should be specified") + + field_selector = "metadata.name={}".format(crd_name) if crd_name else "" + try: + w = watch.Watch() + for event in w.stream(api_instance.list_namespaced_custom_object, group, version, namespace, plural, field_selector=field_selector, timeout_seconds=timeout): + if callback(event): + return + except Exception as e: + pytest.fail("Error occurred when watching crd instance events: " + str(e)) + pytest.fail("The watch on the crd instance events has timed out.") diff --git a/test/e2e/src/common/kubernetes_daemonset_utility.py b/test/e2e/src/common/kubernetes_daemonset_utility.py new file mode 100644 index 000000000..dd76a11d9 --- /dev/null +++ b/test/e2e/src/common/kubernetes_daemonset_utility.py @@ -0,0 +1,36 @@ +import pytest +from kubernetes import watch + +# Returns a list of daemon_sets in a given namespace +def list_daemon_set(api_instance, namespace, field_selector="", label_selector=""): + try: + return api_instance.list_namespaced_daemon_set(namespace, field_selector=field_selector, label_selector=label_selector) + except Exception as e: + pytest.fail("Error occured when retrieving daemon_sets: " + str(e)) + +# Deletes a daemon_set +def delete_daemon_set(api_instance, namespace, daemon_set_name): + try: + return 
api_instance.delete_namespaced_daemon_set(daemon_set_name, namespace) + except Exception as e: + pytest.fail("Error occured when deleting daemon_set: " + str(e)) + +# Read a daemon_set +def read_daemon_set(api_instance, namespace, daemon_set_name): + try: + return api_instance.read_namespaced_daemon_set(daemon_set_name, namespace) + except Exception as e: + pytest.fail("Error occured when reading daemon_set: " + str(e)) + +# Function that watches events corresponding to daemon_sets in the given namespace and passes the events to a callback function +def watch_daemon_set_status(api_instance, namespace, timeout, callback=None): + if not callback: + return + try: + w = watch.Watch() + for event in w.stream(api_instance.list_namespaced_daemon_set, namespace, timeout_seconds=timeout): + if callback(event): + return + except Exception as e: + print("Error occurred when checking daemon_set status: " + str(e)) + print("The watch on the daemon_set status has timed out. Please see the pod logs for more info.") diff --git a/test/e2e/src/common/kubernetes_deployment_utility.py b/test/e2e/src/common/kubernetes_deployment_utility.py new file mode 100644 index 000000000..1be7a6b71 --- /dev/null +++ b/test/e2e/src/common/kubernetes_deployment_utility.py @@ -0,0 +1,38 @@ +import pytest +from kubernetes import watch + +# Returns a list of deployments in a given namespace +def list_deployment(api_instance, namespace, field_selector="", label_selector=""): + try: + return api_instance.list_namespaced_deployment(namespace, field_selector=field_selector, label_selector=label_selector) + except Exception as e: + pytest.fail("Error occured when retrieving deployments: " + str(e)) + +# Deletes a deployment +def delete_deployment(api_instance, namespace, deployment_name): + try: + return api_instance.delete_namespaced_deployment(deployment_name, namespace) + except Exception as e: + pytest.fail("Error occured when deleting deployment: " + str(e)) + + +# Read a deployment +def 
read_deployment(api_instance, namespace, deployment_name): + try: + return api_instance.read_namespaced_deployment(deployment_name, namespace) + except Exception as e: + pytest.fail("Error occured when reading deployment: " + str(e)) + +# Function that watches events corresponding to deployments in the given namespace and passes the events to a callback function +def watch_deployment_status(api_instance, namespace, timeout, callback=None): + if not callback: + return + try: + w = watch.Watch() + for event in w.stream(api_instance.list_namespaced_deployment, namespace, timeout_seconds=timeout): + if callback(event): + return + except Exception as e: + print("Error occurred when checking deployment status: " + str(e)) + print("The watch on the deployment status has timed out. Please see the pod logs for more info.") + \ No newline at end of file diff --git a/test/e2e/src/common/kubernetes_namespace_utility.py b/test/e2e/src/common/kubernetes_namespace_utility.py new file mode 100644 index 000000000..cea5788c5 --- /dev/null +++ b/test/e2e/src/common/kubernetes_namespace_utility.py @@ -0,0 +1,32 @@ +import pytest +from kubernetes import watch + + +# Function that watches events corresponding to kubernetes namespaces and passes the events to a callback function +def watch_namespace(api_instance, timeout, callback=None): + if not callback: + return + try: + w = watch.Watch() + for event in w.stream(api_instance.list_namespace, timeout_seconds=timeout): + if callback(event): + return + except Exception as e: + pytest.fail("Error occurred when checking namespace status: " + str(e)) + pytest.fail("The watch on the namespaces has timed out.") + + +# Function to list all kubernetes namespaces +def list_namespace(api_instance): + try: + return api_instance.list_namespace() + except Exception as e: + pytest.fail("Error occured when retrieving namespaces: " + str(e)) + + +# Function to delete a kubernetes namespaces +def delete_namespace(api_instance, namespace_name): + try: + 
return api_instance.delete_namespace(namespace_name) + except Exception as e: + pytest.fail("Error occured when deleting namespace: " + str(e)) diff --git a/test/e2e/src/common/kubernetes_node_utility.py b/test/e2e/src/common/kubernetes_node_utility.py new file mode 100644 index 000000000..050ce8b87 --- /dev/null +++ b/test/e2e/src/common/kubernetes_node_utility.py @@ -0,0 +1,12 @@ +import pytest + +def get_kubernetes_node_count(api_instance): + node_list = list_kubernetes_nodes(api_instance) + return len(node_list.items) + +def list_kubernetes_nodes(api_instance): + try: + return api_instance.list_node() + except Exception as e: + pytest.fail("Error occured while retrieving node information: " + str(e)) + diff --git a/test/e2e/src/common/kubernetes_pod_utility.py b/test/e2e/src/common/kubernetes_pod_utility.py new file mode 100644 index 000000000..27345fae7 --- /dev/null +++ b/test/e2e/src/common/kubernetes_pod_utility.py @@ -0,0 +1,65 @@ +import pytest +import time + +from kubernetes import watch +from kubernetes.stream import stream + +# Returns a kubernetes pod object in given namespace. Object description at: https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodList.md +def get_pod(api_instance, namespace, pod_name): + try: + return api_instance.read_namespaced_pod(pod_name, namespace) + except Exception as e: + pytest.fail("Error occured when retrieving pod information: " + str(e)) + + +# Returns a list of kubernetes pod objects in a given namespace. 
Object description at: https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodList.md +def get_pod_list(api_instance, namespace, label_selector=""): + try: + return api_instance.list_namespaced_pod(namespace, label_selector=label_selector) + except Exception as e: + pytest.fail("Error occurred when retrieving pod information: " + str(e)) + +# get the content of the log file in the container via exec +def get_log_file_content(api_instance, namespace, podName, logfilePath): + try: + exec_command = ['tar','cf', '-', logfilePath] + return stream(api_instance.connect_get_namespaced_pod_exec, podName, namespace, command=exec_command, stderr=True, stdin=False, stdout=True, tty=False) + except Exception as e: + pytest.fail("Error occurred when retrieving log file content: " + str(e)) + +# Function that watches events corresponding to pods in the given namespace and passes the events to a callback function +def watch_pod_status(api_instance, namespace, timeout, callback=None): + if not callback: + return + try: + w = watch.Watch() + for event in w.stream(api_instance.list_namespaced_pod, namespace, timeout_seconds=timeout): + if callback(event): + return + except Exception as e: + pytest.fail("Error occurred when checking pod status: " + str(e)) + pytest.fail("The watch on the pods has timed out. 
Please see the pod logs for more info.") + + +# Function that watches events corresponding to pod logs and passes them to a callback function +def watch_pod_logs(api_instance, namespace, pod_name, container_name, timeout_seconds, callback=None): + if not callback: + return + try: + w = watch.Watch() + timeout = time.time() + timeout_seconds + for event in w.stream(api_instance.read_namespaced_pod_log, pod_name, namespace, container=container_name): + if callback(event): + return + if time.time() > timeout: + pytest.fail("The watch on the pod logs has timed out.") + except Exception as e: + pytest.fail("Error occurred when checking pod logs: " + str(e)) + + +# Function that returns the pod logs of a given container. +def get_pod_logs(api_instance, pod_namespace, pod_name, container_name): + try: + return api_instance.read_namespaced_pod_log(pod_name, pod_namespace, container=container_name) + except Exception as e: + pytest.fail("Error occurred when fetching pod logs: " + str(e)) diff --git a/test/e2e/src/common/kubernetes_secret_utility.py b/test/e2e/src/common/kubernetes_secret_utility.py new file mode 100644 index 000000000..8cc07fd4d --- /dev/null +++ b/test/e2e/src/common/kubernetes_secret_utility.py @@ -0,0 +1,26 @@ +import sys + +from kubernetes import watch + + +# This function returns the kubernetes secret object present in a given namespace +def get_kubernetes_secret(api_instance, namespace, secret_name): + try: + return api_instance.read_namespaced_secret(secret_name, namespace) + except Exception as e: + sys.exit("Error occurred when retrieving secret '{}': ".format(secret_name) + str(e)) + + +# Function that watches events corresponding to kubernetes secrets and passes the events to a callback function +def watch_kubernetes_secret(api_instance, namespace, secret_name, timeout, callback=None): + if not callback: + return + field_selector = "metadata.name={}".format(secret_name) if secret_name else "" + try: + w = watch.Watch() + for event in 
w.stream(api_instance.list_namespaced_secret, namespace, field_selector=field_selector, timeout_seconds=timeout): + if callback(event): + return + except Exception as e: + sys.exit("Error occurred when watching kubernetes secret events: " + str(e)) + sys.exit("The watch on the kubernetes secret events has timed out. Please see the pod logs for more info.") diff --git a/test/e2e/src/common/kubernetes_service_utility.py b/test/e2e/src/common/kubernetes_service_utility.py new file mode 100644 index 000000000..694af885a --- /dev/null +++ b/test/e2e/src/common/kubernetes_service_utility.py @@ -0,0 +1,19 @@ +import pytest + +from kubernetes import watch + + +# Returns a list of services in a given namespace +def list_service(api_instance, namespace, field_selector="", label_selector=""): + try: + return api_instance.list_namespaced_service(namespace, field_selector=field_selector, label_selector=label_selector) + except Exception as e: + pytest.fail("Error occured when retrieving services: " + str(e)) + + +# Deletes a service +def delete_service(api_instance, namespace, service_name): + try: + return api_instance.delete_namespaced_service(service_name, namespace) + except Exception as e: + pytest.fail("Error occured when deleting service: " + str(e)) diff --git a/test/e2e/src/common/kubernetes_version_utility.py b/test/e2e/src/common/kubernetes_version_utility.py new file mode 100644 index 000000000..884d1df2f --- /dev/null +++ b/test/e2e/src/common/kubernetes_version_utility.py @@ -0,0 +1,9 @@ +import pytest + + +def get_kubernetes_server_version(api_instance): + try: + api_response = api_instance.get_code() + return api_response.git_version + except Exception as e: + pytest.fail("Error occured when retrieving kubernetes server version: " + str(e)) diff --git a/test/e2e/src/common/results_utility.py b/test/e2e/src/common/results_utility.py new file mode 100644 index 000000000..14066bf16 --- /dev/null +++ b/test/e2e/src/common/results_utility.py @@ -0,0 +1,24 @@ +import 
pytest +import shutil +import tarfile + +from pathlib import Path + + + +# Function to create the test result directory +def create_results_dir(results_dir): + print(results_dir) + try: + Path(results_dir).mkdir(parents=True, exist_ok=True) + except Exception as e: + pytest.fail("Unable to create the results directory: " + str(e)) + + +# Function to append logs from the test run into a result file +def append_result_output(message, result_file_path): + try: + with open(result_file_path, "a") as result_file: + result_file.write(message) + except Exception as e: + pytest.fail("Error while appending message '{}' to results file: ".format(message) + str(e)) diff --git a/test/e2e/src/core/Dockerfile b/test/e2e/src/core/Dockerfile new file mode 100644 index 000000000..9f85bdf4c --- /dev/null +++ b/test/e2e/src/core/Dockerfile @@ -0,0 +1,17 @@ +FROM python:3.6 + +RUN pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org pytest pytest-xdist filelock requests kubernetes adal msrestazure + +RUN curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash \ + && helm version + +COPY ./core/e2e_tests.sh / +COPY ./core/pytest.ini /e2etests/ +COPY ./core/conftest.py /e2etests/ +COPY ./core/helper.py /e2etests/ +COPY ./core/ /e2etests/ +COPY ./common/ /e2etests/ +COPY ./tests/ /e2etests/ + +RUN ["chmod", "+x", "/e2e_tests.sh"] +ENTRYPOINT ["./e2e_tests.sh"] diff --git a/test/e2e/src/core/conftest.py b/test/e2e/src/core/conftest.py new file mode 100644 index 000000000..e659d5189 --- /dev/null +++ b/test/e2e/src/core/conftest.py @@ -0,0 +1,90 @@ +import pytest +import os +import time +import pickle + +import constants + +from filelock import FileLock +from pathlib import Path +from results_utility import create_results_dir, append_result_output + +pytestmark = pytest.mark.agentests + +# Fixture to collect all the environment variables, install pre-requisites. It will be run before the tests. 
+@pytest.fixture(scope='session', autouse=True) +def env_dict(): + my_file = Path("env.pkl") # File to store the environment variables. + with FileLock(str(my_file) + ".lock"): # Locking the file since each test will be run in parallel as separate subprocesses and may try to access the file simultaneously. + env_dict = {} + if not my_file.is_file(): + # Creating the results directory + create_results_dir('/tmp/results') + + # Setting some environment variables + env_dict['SETUP_LOG_FILE'] = '/tmp/results/setup' + env_dict['TEST_AGENT_LOG_FILE'] = '/tmp/results/containerinsights' + env_dict['NUM_TESTS_COMPLETED'] = 0 + + print("Starting setup...") + append_result_output("Starting setup...\n", env_dict['SETUP_LOG_FILE']) + + # Collecting environment variables + env_dict['TENANT_ID'] = os.getenv('TENANT_ID') + env_dict['CLIENT_ID'] = os.getenv('CLIENT_ID') + env_dict['CLIENT_SECRET'] = os.getenv('CLIENT_SECRET') + + # get default query time interval for log analytics queries + queryTimeInterval = int(os.getenv('DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES')) if os.getenv('DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES') else constants.DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES + # add minute suffix since this format required for LA queries + env_dict['DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES'] = str(queryTimeInterval) + "m" + + # get default query time interval for metrics queries + env_dict['DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES'] = int(os.getenv('DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES')) if os.getenv('DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES') else constants.DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES + + + # expected agent pod restart count + env_dict['AGENT_POD_EXPECTED_RESTART_COUNT'] = int(os.getenv('AGENT_POD_EXPECTED_RESTART_COUNT')) if os.getenv('AGENT_POD_EXPECTED_RESTART_COUNT') else constants.AGENT_POD_EXPECTED_RESTART_COUNT + + # default to azure public cloud if AZURE_CLOUD not specified + env_dict['AZURE_ENDPOINTS'] = 
constants.AZURE_CLOUD_DICT.get(os.getenv('AZURE_CLOUD')) if os.getenv('AZURE_CLOUD') else constants.AZURE_PUBLIC_CLOUD_ENDPOINTS + + if not env_dict.get('TENANT_ID'): + pytest.fail('ERROR: variable TENANT_ID is required.') + + if not env_dict.get('CLIENT_ID'): + pytest.fail('ERROR: variable CLIENT_ID is required.') + + if not env_dict.get('CLIENT_SECRET'): + pytest.fail('ERROR: variable CLIENT_SECRET is required.') + + print("Setup Complete.") + append_result_output("Setup Complete.\n", env_dict['SETUP_LOG_FILE']) + + with Path.open(my_file, "wb") as f: + pickle.dump(env_dict, f, pickle.HIGHEST_PROTOCOL) + else: + with Path.open(my_file, "rb") as f: + env_dict = pickle.load(f) + + yield env_dict + + my_file = Path("env.pkl") + with FileLock(str(my_file) + ".lock"): + with Path.open(my_file, "rb") as f: + env_dict = pickle.load(f) + + env_dict['NUM_TESTS_COMPLETED'] = 1 + env_dict.get('NUM_TESTS_COMPLETED') + if env_dict['NUM_TESTS_COMPLETED'] == int(os.getenv('NUM_TESTS')): + # Checking if cleanup is required. + if os.getenv('SKIP_CLEANUP'): + return + print('Starting cleanup...') + append_result_output("Starting Cleanup...\n", env_dict['SETUP_LOG_FILE']) + + print("Cleanup Complete.") + append_result_output("Cleanup Complete.\n", env_dict['SETUP_LOG_FILE']) + return + + with Path.open(my_file, "wb") as f: + pickle.dump(env_dict, f, pickle.HIGHEST_PROTOCOL) diff --git a/test/e2e/src/core/e2e_tests.sh b/test/e2e/src/core/e2e_tests.sh new file mode 100644 index 000000000..3bfafdce9 --- /dev/null +++ b/test/e2e/src/core/e2e_tests.sh @@ -0,0 +1,26 @@ +#!/bin/sh + +results_dir="${RESULTS_DIR:-/tmp/results}" + +# saveResults prepares the results for handoff to the Sonobuoy worker. +# See: https://github.com/vmware-tanzu/sonobuoy/blob/master/docs/plugins.md +saveResults() { + cd ${results_dir} + + # Sonobuoy worker expects a tar file. + tar czf results.tar.gz * + + # Signal to the worker that we are done and where to find the results. 
+ printf ${results_dir}/results.tar.gz > ${results_dir}/done +} + +# Ensure that we tell the Sonobuoy worker we are done regardless of results. +trap saveResults EXIT + +# The variable 'TEST_LIST' should be provided if we want to run specific tests. If not provided, all tests are run + +NUM_PROCESS=$(pytest /e2etests/ --collect-only -k "$TEST_NAME_LIST" -m "$TEST_MARKER_LIST" | grep " 0): + pytest.fail("numberMisscheduled shouldnt be greater than 0 for the daemonset {}.".format( + daemonset_name)) + + except Exception as e: + pytest.fail("Error occured while checking daemonset status: " + str(e)) + +# This function checks the status of kubernetes pods +def check_kubernetes_pods_status(pod_namespace, label_selector, expectedPodRestartCount, outfile=None): + try: + api_instance = client.CoreV1Api() + pod_list = get_pod_list(api_instance, pod_namespace, label_selector) + append_result_output("podlist output {}\n".format(pod_list), outfile) + if not pod_list: + pytest.fail("pod_list shouldnt be null or empty") + pods = pod_list.items + if not pods: + pytest.fail("pod items shouldnt be null or empty") + if len(pods) <= 0: + pytest.fail("pod count should be greater than 0") + for pod in pods: + status = pod.status + podstatus = status.phase + if not podstatus: + pytest.fail("status should not be null or empty") + if podstatus != "Running": + pytest.fail("pod status should be in running state") + containerStatuses = status.container_statuses + if not containerStatuses: + pytest.fail("containerStatuses shouldnt be nil or empty") + if len(containerStatuses) <= 0: + pytest.fail("length containerStatuses should be greater than 0") + for containerStatus in containerStatuses: + containerId = containerStatus.container_id + if not containerId: + pytest.fail("containerId shouldnt be nil or empty") + image = containerStatus.image + if not image: + pytest.fail("image shouldnt be nil or empty") + imageId = containerStatus.image_id + if not imageId: + pytest.fail("imageId shouldnt be 
nil or empty") + restartCount = containerStatus.restart_count + if restartCount > expectedPodRestartCount: + pytest.fail("restartCount shouldnt be greater than expected pod restart count: {}".format(expectedPodRestartCount)) + ready = containerStatus.ready + if not ready: + pytest.fail("container status should be in ready state") + containerState = containerStatus.state + if not containerState.running: + pytest.fail("container state should be in running state") + except Exception as e: + pytest.fail("Error occured while checking pods status: " + str(e)) + + +def check_namespace_status_using_watch(outfile=None, namespace_list=None, timeout=300): + namespace_dict = {} + for namespace in namespace_list: + namespace_dict[namespace] = 0 + append_result_output( + "Namespace dict: {}\n".format(namespace_dict), outfile) + print("Generated the namespace dictionary.") + + # THe callback function to check the namespace status + def namespace_event_callback(event): + try: + append_result_output("{}\n".format(event), outfile) + namespace_name = event['raw_object'].get('metadata').get('name') + namespace_status = event['raw_object'].get('status') + if not namespace_status: + return False + if namespace_status.get('phase') == 'Active': + namespace_dict[namespace_name] = 1 + if all(ele == 1 for ele in list(namespace_dict.values())): + return True + return False + except Exception as e: + pytest.fail( + "Error occured while processing the namespace event: " + str(e)) + + # Checking the namespace status + api_instance = client.CoreV1Api() + watch_namespace(api_instance, timeout, namespace_event_callback) + +# This function checks the status of daemonset in a given namespace. The daemonset to be monitored are identified using the pod label list parameter. 
+def check_kubernetes_daemonset_status_using_watch(daemonset_namespace, outfile=None, daemonset_label_list=None, timeout=300): + daemonset_label_dict = {} + if daemonset_label_list: # This parameter is a list of label values to identify the daemonsets that we want to monitor in the given namespace + for daemonset_label in daemonset_label_list: + daemonset_label_dict[daemonset_label] = 0 + append_result_output("daemonset label dict: {}\n".format( + daemonset_label_dict), outfile) + print("Generated the daemonset dictionary.") + + # The callback function to check if the pod is in running state + def daemonset_event_callback(event): + try: + # append_result_output("{}\n".format(event), outfile) + daemonset_status = event['raw_object'].get('status') + daemonset_metadata = event['raw_object'].get('metadata') + daemonset_metadata_labels = daemonset_metadata.get('labels') + if not daemonset_metadata_labels: + return False + + # It contains the list of all label values for the pod whose event was called. 
+ daemonset_metadata_label_values = daemonset_metadata_labels.values() + # This label value will be common in pod event and label list provided and will be monitored + current_label_value = None + for label_value in daemonset_metadata_label_values: + if label_value in daemonset_label_dict: + current_label_value = label_value + if not current_label_value: + return False + + currentNumberScheduled = daemonset_status.get( + 'currentNumberScheduled') + desiredNumberScheduled = daemonset_status.get( + 'desiredNumberScheduled') + numberAvailable = daemonset_status.get('numberAvailable') + numberReady = daemonset_status.get('numberReady') + numberMisscheduled = daemonset_status.get('numberMisscheduled') + + if (currentNumberScheduled != desiredNumberScheduled): + pytest.fail("currentNumberScheduled doesnt match with currentNumberScheduled for the daemonset {}.".format( + daemonset_metadata.get('name'))) + + if (numberAvailable != numberReady): + pytest.fail("numberAvailable doesnt match with expected numberReady for the daemonset {}.".format( + daemonset_metadata.get('name'))) + + if (numberMisscheduled > 0): + pytest.fail("numberMisscheduled is greater than 0 for the daemonset {}.".format( + daemonset_metadata.get('name'))) + + return True + except Exception as e: + print("Error occured while processing the pod event: " + str(e)) + + # Checking status of all pods + if daemonset_label_dict: + api_instance = client.AppsV1Api() + watch_daemon_set_status( + api_instance, daemonset_namespace, timeout, daemonset_event_callback) + +# This function checks the status of deployment in a given namespace. The deployment to be monitored are identified using the pod label list parameter. 
def check_kubernetes_deployments_status_using_watch(deployment_namespace, outfile=None, deployment_label_list=None, timeout=300):
    """Watch deployments in a namespace and validate the replica counts of the monitored ones.

    Args:
        deployment_namespace: namespace handed to watch_deployment_status.
        outfile: destination handed to append_result_output for the label summary.
        deployment_label_list: label values identifying the deployments to monitor.
            When it is None or empty, no watch is started.
        timeout: timeout handed to watch_deployment_status.

    The watch callback calls pytest.fail when availableReplicas or readyReplicas
    differs from replicas for a monitored deployment.
    """
    # Used as a set of monitored label values; the 0 placeholders are never read in this function.
    deployment_label_dict = {}
    if deployment_label_list:
        for deployment_label in deployment_label_list:
            deployment_label_dict[deployment_label] = 0
    append_result_output("Deployment label dict: {}\n".format(
        deployment_label_dict), outfile)
    print("Generated the deployment dictionary.")

    def deployment_event_callback(event):
        """Return True when the event's deployment is monitored and fully available, else False."""
        try:
            deployment_status = event['raw_object'].get('status')
            deployment_metadata = event['raw_object'].get('metadata')
            deployment_metadata_labels = deployment_metadata.get('labels')
            if not deployment_metadata_labels:
                return False

            # Find a label value shared by this deployment and the monitored list
            # (the last match wins, as before).
            current_label_value = None
            for label_value in deployment_metadata_labels.values():
                if label_value in deployment_label_dict:
                    current_label_value = label_value
            if not current_label_value:
                return False

            availableReplicas = deployment_status.get('availableReplicas')
            readyReplicas = deployment_status.get('readyReplicas')
            replicas = deployment_status.get('replicas')

            if replicas != availableReplicas:
                pytest.fail("availableReplicas doesnt match with expected replicas for the deployment {}.".format(
                    deployment_metadata.get('name')))

            if replicas != readyReplicas:
                pytest.fail("readyReplicas doesnt match with expected replicas for the deployment {}.".format(
                    deployment_metadata.get('name')))

            return True
        except Exception as e:
            # Best effort: a malformed event is reported and skipped instead of aborting the watch.
            print("Error occured while processing the deployment event: " + str(e))
            return False

    # Only start the watch when there is at least one deployment label to monitor.
    if deployment_label_dict:
        api_instance = client.AppsV1Api()
        watch_deployment_status(
            api_instance, deployment_namespace, timeout, deployment_event_callback)

# This function checks the status of pods in a given namespace. The pods to be monitored are identified using the pod label list parameter.
def check_kubernetes_pods_status_using_watch(pod_namespace, outfile=None, pod_label_list=None, timeout=300):
    """Watch pods in a namespace until every monitored label has a running pod.

    Args:
        pod_namespace: namespace handed to watch_pod_status.
        outfile: destination handed to append_result_output for the label summary.
        pod_label_list: label values identifying the pods to monitor.
            When it is None or empty, no watch is started.
        timeout: timeout handed to watch_pod_status.

    The watch callback calls pytest.fail when a monitored pod reports a
    container restart or when an event cannot be processed.
    """
    # Maps each monitored label value to 0 (not yet running) or 1 (running).
    pod_label_dict = {}
    if pod_label_list:
        for pod_label in pod_label_list:
            pod_label_dict[pod_label] = 0
    append_result_output(
        "Pod label dict: {}\n".format(pod_label_dict), outfile)
    print("Generated the pods dictionary.")

    def pod_event_callback(event):
        """Return True once all monitored labels are marked running, else False."""
        try:
            pod_status = event['raw_object'].get('status')
            pod_metadata = event['raw_object'].get('metadata')
            pod_metadata_labels = pod_metadata.get('labels')
            if not pod_metadata_labels:
                return False

            # Find a label value shared by this pod and the monitored list
            # (the last match wins, as before).
            current_label_value = None
            for label_value in pod_metadata_labels.values():
                if label_value in pod_label_dict:
                    current_label_value = label_value
            if not current_label_value:
                return False

            container_statuses = pod_status.get('containerStatuses')
            if container_statuses:
                for container in container_statuses:
                    if container.get('restartCount') > 0:
                        # Report the pod name as well: the container name alone does not identify the pod.
                        pytest.fail("The pod {} was restarted (container: {}). Please see the pod logs for more info.".format(
                            pod_metadata.get('name'), container.get('name')))
                    if not container.get('state').get('running'):
                        pod_label_dict[current_label_value] = 0
                        return False
                    pod_label_dict[current_label_value] = 1
            return all(ele == 1 for ele in pod_label_dict.values())
        except Exception as e:
            pytest.fail(
                "Error occured while processing the pod event: " + str(e))

    # Only start the watch when there is at least one pod label to monitor.
    if pod_label_dict:
        api_instance = client.CoreV1Api()
        watch_pod_status(api_instance, pod_namespace,
                         timeout, pod_event_callback)


def check_kubernetes_crd_status_using_watch(crd_group, crd_version, crd_namespace, crd_plural, crd_name, status_dict=None, outfile=None, timeout=300):
    """Watch a CRD instance until its status carries every field listed in status_dict.

    Args:
        crd_group, crd_version, crd_namespace, crd_plural, crd_name: identify the
            CRD instance; passed through to watch_crd_instance.
        status_dict: expected status fields and values. None (the default) means
            no fields are required, so any event with a non-empty status succeeds.
        outfile: destination handed to append_result_output for each raw event.
        timeout: timeout handed to watch_crd_instance.

    The watch callback calls pytest.fail when a status field is present with a
    value different from the expected one, or when an event cannot be processed.
    """
    # A None default avoids sharing one mutable dict object between calls.
    expected_status = status_dict if status_dict is not None else {}

    def crd_event_callback(event):
        """Return True when the event's status matches expected_status, else False."""
        try:
            append_result_output("{}\n".format(event), outfile)
            crd_status = event['raw_object'].get('status')
            if not crd_status:
                return False
            for status_field in expected_status:
                # A missing or falsy field means "not updated yet": keep watching.
                if not crd_status.get(status_field):
                    return False
                if crd_status.get(status_field) != expected_status.get(status_field):
                    pytest.fail(
                        "The CRD instance status has been updated with incorrect value for '{}' field.".format(status_field))
            return True
        except Exception as e:
            pytest.fail("Error occured while processing crd event: " + str(e))

    # Checking if CRD instance has been updated with status fields
    api_instance = client.CustomObjectsApi()
    watch_crd_instance(api_instance, crd_group, crd_version, crd_namespace,
                       crd_plural, crd_name, timeout, crd_event_callback)


# Function to monitor the pod logs. It will ensure that are logs passed in the 'log_list' parameter are present in the container logs.
def check_kubernetes_pod_logs_using_watch(pod_namespace, pod_name, container_name, logs_list=None, error_logs_list=None, outfile=None, timeout=300):
    """Watch a container's log stream until every expected log text has been seen.

    Args:
        pod_namespace, pod_name, container_name: identify the container; passed
            through to watch_pod_logs.
        logs_list: texts that must each appear in some log event. None or empty
            means nothing is awaited, so the callback succeeds on the first event.
        error_logs_list: texts whose appearance in a log event fails the test.
            None or empty disables the check.
        outfile: destination handed to append_result_output for each log event.
        timeout: timeout handed to watch_pod_logs.

    The watch callback calls pytest.fail when an error text is found or when an
    event cannot be processed.
    """
    # Both list parameters default to None; normalise them so the defaults are usable.
    expected_logs = logs_list if logs_list is not None else []
    error_logs = error_logs_list if error_logs_list is not None else []

    # Maps each expected text to 0 (not seen yet) or 1 (seen).
    logs_dict = {}
    for log in expected_logs:
        logs_dict[log] = 0
    print("Generated the logs dictionary.")

    def pod_log_event_callback(event):
        """Return True once every expected text has been seen, else False."""
        try:
            append_result_output("{}\n".format(event), outfile)
            for error_log in error_logs:
                if error_log in event:
                    pytest.fail("Error log found: " + event)
            for log in logs_dict:
                if log in event:
                    logs_dict[log] = 1
            return all(ele == 1 for ele in logs_dict.values())
        except Exception as e:
            pytest.fail(
                "Error occured while processing pod log event: " + str(e))

    # Checking the pod logs
    api_instance = client.CoreV1Api()
    watch_pod_logs(api_instance, pod_namespace, pod_name,
                   container_name, timeout, pod_log_event_callback)

# Function to monitor the kubernetes secret. It will determine if the secret has been successfully created.
+def check_kubernetes_secret_using_watch(secret_namespace, secret_name, timeout=300): + # The callback function to check if the secret event received has secret data + def secret_event_callback(event): + try: + secret_data = event['raw_object'].get('data') + if not secret_data: + return False + return True + except Exception as e: + pytest.fail( + "Error occured while processing secret event: " + str(e)) + + # Checking the kubernetes secret + api_instance = client.CoreV1Api() + watch_kubernetes_secret(api_instance, secret_namespace, + secret_name, timeout, secret_event_callback) diff --git a/test/e2e/src/core/pytest.ini b/test/e2e/src/core/pytest.ini new file mode 100644 index 000000000..f4dc462f0 --- /dev/null +++ b/test/e2e/src/core/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +markers = + agentests: marks tests are a part of arc agent conformance tests (deselect with '-m "not agentests"') + \ No newline at end of file diff --git a/test/e2e/src/tests/test_ds_workflows.py b/test/e2e/src/tests/test_ds_workflows.py new file mode 100755 index 000000000..81ef08325 --- /dev/null +++ b/test/e2e/src/tests/test_ds_workflows.py @@ -0,0 +1,60 @@ +import pytest +import constants + +from kubernetes import client, config +from kubernetes_pod_utility import get_pod_list, get_log_file_content +from results_utility import append_result_output +from helper import check_kubernetes_deployment_status +from helper import check_kubernetes_daemonset_status +from helper import check_kubernetes_pods_status +from kubernetes.stream import stream + +pytestmark = pytest.mark.agentests + +# validation of ds agent workflows +def test_ds_workflows(env_dict): + print("Starting daemonset agent workflows test.") + append_result_output("test_ds_workflows start \n", + env_dict['TEST_AGENT_LOG_FILE']) + # Loading in-cluster kube-config + try: + config.load_incluster_config() + except Exception as e: + pytest.fail("Error loading the in-cluster config: " + str(e)) + + print("getting daemonset pod list") + 
api_instance = client.CoreV1Api() + pod_list = get_pod_list(api_instance, constants.AGENT_RESOURCES_NAMESPACE, + constants.AGENT_DAEMON_SET_PODS_LABEL_SELECTOR) + if not pod_list: + pytest.fail("daemonset pod_list shouldnt be null or empty") + + if len(pod_list.items) <= 0: + pytest.fail("number of items in daemonset pod list should be greater than 0") + + for podItem in pod_list.items: + podName = podItem.metadata.name + logcontent = get_log_file_content( + api_instance, constants.AGENT_RESOURCES_NAMESPACE, podName, constants.AGENT_OMSAGENT_LOG_PATH) + if not logcontent: + pytest.fail("logcontent should not be null or empty for pod: " + podName) + loglines = logcontent.split("\n") + if len(loglines) <= 0: + pytest.fail("number of log lines should be greater than 0 for pod :" + podName) + + IsContainerPerfEmitStream = False + IsContainerInventoryStream = False + for line in loglines: + if line.find(constants.CONTAINER_PERF_EMIT_STREAM) >= 0: + IsContainerPerfEmitStream = True + if line.find(constants.CONTAINER_INVENTORY_EMIT_STREAM) >= 0: + IsContainerInventoryStream = True + + if IsContainerPerfEmitStream == False: + pytest.fail("ContainerPerf stream not emitted successfully from pod:" + podName) + if IsContainerInventoryStream == False: + pytest.fail("ContainerInventory stream not emitted successfully from pod:" + podName) + + append_result_output("test_ds_workflows end \n", + env_dict['TEST_AGENT_LOG_FILE']) + print("Successfully completed daemonset workflows test.") diff --git a/test/e2e/src/tests/test_e2e_workflows.py b/test/e2e/src/tests/test_e2e_workflows.py new file mode 100755 index 000000000..11a8e18e3 --- /dev/null +++ b/test/e2e/src/tests/test_e2e_workflows.py @@ -0,0 +1,330 @@ +import pytest +import constants +import requests + +from arm_rest_utility import fetch_aad_token +from kubernetes import client, config +from kubernetes_pod_utility import get_pod_list +from results_utility import append_result_output + + +pytestmark = pytest.mark.agentests + +# 
validation of workflows e2e +def test_e2e_workflows(env_dict): + print("Starting e2e workflows test.") + append_result_output("test_e2e_workflows start \n", + env_dict['TEST_AGENT_LOG_FILE']) + # Loading in-cluster kube-config + try: + config.load_incluster_config() + except Exception as e: + pytest.fail("Error loading the in-cluster config: " + str(e)) + + # query time interval for LA queries + queryTimeInterval = env_dict['DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES'] + if not queryTimeInterval: + pytest.fail("DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES should not be null or empty") + + # get the cluster resource id from replicaset pod envvars + api_instance = client.CoreV1Api() + pod_list = get_pod_list(api_instance, constants.AGENT_RESOURCES_NAMESPACE, + constants.AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR) + + if not pod_list: + pytest.fail("pod_list shouldnt be null or empty") + + if len(pod_list.items) <= 0: + pytest.fail("number of items in pod list should be greater than 0") + + envVars = pod_list.items[0].spec.containers[0].env + if not envVars: + pytest.fail("environment variables should be defined in the replicaset pod") + + clusterResourceId = '' + for env in envVars: + if env.name == "AKS_RESOURCE_ID": + clusterResourceId = env.value + print("cluster resource id: {}".format(clusterResourceId)) + + if not clusterResourceId: + pytest.fail("failed to get clusterResourceId from replicaset pod environment variables") + + # fetch AAD token for log analytics resource for the queries + tenant_id = env_dict.get('TENANT_ID') + authority_uri = env_dict.get('AZURE_ENDPOINTS').get('activeDirectory') + tenant_id + client_id = env_dict.get('CLIENT_ID') + client_secret = env_dict.get('CLIENT_SECRET') + resource = env_dict.get('AZURE_ENDPOINTS').get('logAnalytics') + aad_token = fetch_aad_token(client_id, client_secret, authority_uri, resource) + if not aad_token: + pytest.fail("failed to fetch AAD token") + + access_token = aad_token.get('accessToken') + if not access_token: + 
pytest.fail("access_token shouldnt be null or empty") + + # validate e2e workflows by checking data in log analytics workspace through resource centric queries + queryUrl = resource + "/v1" + clusterResourceId + "/query" + Headers = { + "Authorization": str("Bearer " + access_token), + "Content-Type": "application/json" + } + # KubePodInventory + query = constants.KUBE_POD_INVENTORY_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('KUBE_POD_INVENTORY')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} and workflow: {1}".format(clusterResourceId, 'KUBE_POD_INVENTORY')) + + # KubeNodeInventory + query = constants.KUBE_NODE_INVENTORY_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('KUBE_NODE_INVENTORY')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'KUBE_NODE_INVENTORY')) + + # KubeServices + query = constants.KUBE_SERVICES_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('KUBE_SERVICES')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'KUBE_SERVICES')) + + # KubeEvents + query = 
constants.KUBE_EVENTS_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('KUBE_EVENTS')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'KUBE_EVENTS')) + + # Container Node Inventory + query = constants.CONTAINER_NODE_INVENTORY_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('CONTAINER_NODE_INVENTORY')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_NODE_INVENTORY')) + + # Node Perf + # cpu capacity + query = constants.NODE_PERF_CPU_CAPCITY_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('NODE_PERF_CPU_CAPCITY')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'NODE_PERF_CPU_CAPCITY')) + + # memory capacity + query = constants.NODE_PERF_MEMORY_CAPCITY_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('NODE_PERF_MEMORY_CAPCITY')) + + rowCount = 
result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'NODE_PERF_MEMORY_CAPCITY')) + + # cpu allocatable + query = constants.NODE_PERF_CPU_ALLOCATABLE_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('NODE_PERF_CPU_ALLOCATABLE')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'NODE_PERF_CPU_ALLOCATABLE')) + + # memory allocatable + query = constants.NODE_PERF_MEMORY_ALLOCATABLE_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('NODE_PERF_MEMORY_ALLOCATABLE')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'NODE_PERF_MEMORY_ALLOCATABLE')) + + # cpu usage + query = constants.NODE_PERF_CPU_USAGE_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('NODE_PERF_CPU_USAGE')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'NODE_PERF_CPU_USAGE')) + + # memory rss usage + query = constants.NODE_PERF_MEMORY_RSS_USAGE_QUERY.format(queryTimeInterval) + params 
= { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('NODE_PERF_MEMORY_RSS_USAGE')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'NODE_PERF_MEMORY_RSS_USAGE')) + + # memory ws usage + query = constants.NODE_PERF_MEMORY_WS_USAGE_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('NODE_PERF_MEMORY_WS_USAGE')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'NODE_PERF_MEMORY_WS_USAGE')) + + # restartime epoch + query = constants.NODE_PERF_RESTART_TIME_EPOCH_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('NODE_PERF_RESTART_TIME_EPOCH')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'NODE_PERF_RESTART_TIME_EPOCH')) + + # Container Perf + # container cpu limits + query = constants.CONTAINER_PERF_CPU_LIMITS_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('CONTAINER_PERF_CPU_LIMITS')) + + rowCount = 
result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_PERF_CPU_LIMITS')) + + # container memory limits + query = constants.CONTAINER_PERF_MEMORY_LIMITS_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('CONTAINER_PERF_MEMORY_LIMITS')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_PERF_MEMORY_LIMITS')) + + # cpu requests + query = constants.CONTAINER_PERF_CPU_REQUESTS_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('CONTAINER_PERF_CPU_REQUESTS')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_PERF_CPU_REQUESTS')) + + # memory requests + query = constants.CONTAINER_PERF_MEMORY_REQUESTS_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('CONTAINER_PERF_MEMORY_REQUESTS_QUERY')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_PERF_MEMORY_REQUESTS')) + + # cpu usage + query = 
constants.CONTAINER_PERF_CPU_USAGE_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('CONTAINER_PERF_CPU_USAGE')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_PERF_CPU_USAGE')) + + # memory rss usage + query = constants.CONTAINER_PERF_MEMORY_RSS_USAGE_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('CONTAINER_PERF_MEMORY_RSS_USAGE')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_PERF_MEMORY_RSS_USAGE')) + + # memory ws usage + query = constants.CONTAINER_PERF_MEMORY_WS_USAGE_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('CONTAINER_PERF_MEMORY_WS_USAGE')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_PERF_MEMORY_WS_USAGE')) + + # restart time epoch + query = constants.CONTAINER_PERF_RESTART_TIME_EPOCH_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null 
or empty for workflow: {0}".format('CONTAINER_PERF_RESTART_TIME_EPOCH')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_PERF_RESTART_TIME_EPOCH')) + + # Container log + query = constants.CONTAINER_LOG_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('CONTAINER_LOG')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_LOG')) + + # InsightsMetrics + query = constants.INSIGHTS_METRICS_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('INSIGHTS_METRICS')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'INSIGHTS_METRICS')) + + append_result_output("test_e2e_workflows end \n", + env_dict['TEST_AGENT_LOG_FILE']) + print("Successfully completed e2e workflows test.") diff --git a/test/e2e/src/tests/test_node_metrics_e2e_workflow.py b/test/e2e/src/tests/test_node_metrics_e2e_workflow.py new file mode 100755 index 000000000..4346f89a8 --- /dev/null +++ b/test/e2e/src/tests/test_node_metrics_e2e_workflow.py @@ -0,0 +1,420 @@ +import pytest +import constants +import requests + +from arm_rest_utility import fetch_aad_token +from kubernetes import client, config +from kubernetes_pod_utility import get_pod_list +from results_utility import 
append_result_output +from datetime import datetime, timedelta + +pytestmark = pytest.mark.agentests + +# validation of node metrics e2e workflow +def test_node_metrics_e2e_workflow(env_dict): + print("Starting node metrics e2e workflow test.") + append_result_output("test_node_metrics_e2e_workflow start \n", + env_dict['TEST_AGENT_LOG_FILE']) + # Loading in-cluster kube-config + try: + config.load_incluster_config() + except Exception as e: + pytest.fail("Error loading the in-cluster config: " + str(e)) + + # query time interval for metric queries + metricQueryIntervalInMins = env_dict['DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES'] + if not metricQueryIntervalInMins: + pytest.fail( + "DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES should not be null or empty or 0") + + # get the cluster resource id from replicaset pod envvars + api_instance = client.CoreV1Api() + pod_list = get_pod_list(api_instance, constants.AGENT_RESOURCES_NAMESPACE, + constants.AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR) + + if not pod_list: + pytest.fail("pod_list shouldnt be null or empty") + + if len(pod_list.items) <= 0: + pytest.fail("number of items in pod list should be greater than 0") + + envVars = pod_list.items[0].spec.containers[0].env + if not envVars: + pytest.fail( + "environment variables should be defined in the replicaset pod") + + clusterResourceId = '' + for env in envVars: + if env.name == "AKS_RESOURCE_ID": + clusterResourceId = env.value + print("cluster resource id: {}".format(clusterResourceId)) + + if not clusterResourceId: + pytest.fail( + "failed to get clusterResourceId from replicaset pod environment variables") + + # fetch AAD token for metric queries + tenant_id = env_dict.get('TENANT_ID') + authority_uri = env_dict.get('AZURE_ENDPOINTS').get( + 'activeDirectory') + tenant_id + client_id = env_dict.get('CLIENT_ID') + client_secret = env_dict.get('CLIENT_SECRET') + resourceManager = env_dict.get('AZURE_ENDPOINTS').get('resourceManager') + aad_token = 
fetch_aad_token( + client_id, client_secret, authority_uri, resourceManager) + if not aad_token: + pytest.fail("failed to fetch AAD token") + + access_token = aad_token.get('accessToken') + if not access_token: + pytest.fail("access_token shouldnt be null or empty") + + # validate metrics e2e workflow + now = datetime.utcnow() + endtime = now.isoformat()[:-3]+'Z' + starttime = (now - timedelta(hours=0, + minutes=constants.DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES)).isoformat()[:-3]+'Z' + Headers = { + "Authorization": str("Bearer " + access_token), + "Content-Type": "application/json", + "content-length": "0" + } + params = {} + # node metric - memoryRssBytes + custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( + resourceManager.rstrip("/"), + clusterResourceId, + starttime, + endtime, + constants.NODE_MEMORY_RSS_METRIC_NAME, + constants.NODE_METRIC_METRIC_AGGREGATION, + constants.NODE_METRICS_NAMESPACE, + constants.METRICS_API_VERSION) + + response = requests.get(custommetricsUrl, params=params, + headers=Headers, verify=False) + + if not response: + pytest.fail( + "response of the metrics query API shouldnt be null or empty") + + if response.status_code != 200: + pytest.fail("metrics query API failed with an error code: {}".format( + response.status_code)) + + responseJSON = response.json() + if not responseJSON: + pytest.fail("response JSON shouldnt be null or empty") + + namespace = responseJSON['namespace'] + if namespace != constants.NODE_METRICS_NAMESPACE: + pytest.fail("got the namespace: {0} but expected namespace:{1} in the response".format( + namespace, constants.NODE_METRICS_NAMESPACE)) + + responseValues = responseJSON['value'] + if not responseValues: + pytest.fail("response JSON shouldnt be null or empty") + + if len(responseValues) <= 0: + pytest.fail("length of value array in the response should be 
greater than 0") + + for responseVal in responseValues: + metricName = responseVal['name']['value'] + if metricName != constants.NODE_MEMORY_RSS_METRIC_NAME: + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_MEMORY_RSS_METRIC_NAME)) + timeseries = responseVal['timeseries'] + if not timeseries: + pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( + constants.NODE_MEMORY_RSS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + if len(timeseries) <= 0: + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_MEMORY_RSS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + + # node metric - memoryRssPercentage + custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( + resourceManager.rstrip("/"), + clusterResourceId, + starttime, + endtime, + constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME, + constants.NODE_METRIC_METRIC_AGGREGATION, + constants.NODE_METRICS_NAMESPACE, + constants.METRICS_API_VERSION) + + response = requests.get(custommetricsUrl, params=params, + headers=Headers, verify=False) + + if not response: + pytest.fail( + "response of the metrics query API shouldnt be null or empty") + + if response.status_code != 200: + pytest.fail("metrics query API failed with an error code: {}".format( + response.status_code)) + + responseJSON = response.json() + if not responseJSON: + pytest.fail("response JSON shouldnt be null or empty") + + namespace = responseJSON['namespace'] + if namespace != constants.NODE_METRICS_NAMESPACE: + pytest.fail("got the namespace: {0} but expected namespace:{1} in the response".format( + namespace, constants.NODE_METRICS_NAMESPACE)) + + responseValues = responseJSON['value'] + if not responseValues: + pytest.fail("response JSON 
shouldnt be null or empty") + + if len(responseValues) <= 0: + pytest.fail("length of value array in the response should be greater than 0") + + for responseVal in responseValues: + metricName = responseVal['name']['value'] + if metricName != constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME: + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME)) + timeseries = responseVal['timeseries'] + if not timeseries: + pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( + constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + if len(timeseries) <= 0: + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + + # node metric - memoryWorkingSetBytes + custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( + resourceManager.rstrip("/"), + clusterResourceId, + starttime, + endtime, + constants.NODE_MEMORY_WS_METRIC_NAME, + constants.NODE_METRIC_METRIC_AGGREGATION, + constants.NODE_METRICS_NAMESPACE, + constants.METRICS_API_VERSION) + + response = requests.get(custommetricsUrl, params=params, + headers=Headers, verify=False) + + if not response: + pytest.fail("response of the metrics query API shouldnt be null or empty") + + if response.status_code != 200: + pytest.fail("metrics query API failed with an error code: {}".format( + response.status_code)) + + responseJSON = response.json() + if not responseJSON: + pytest.fail("response JSON shouldnt be null or empty") + + namespace = responseJSON['namespace'] + if namespace != constants.NODE_METRICS_NAMESPACE: + pytest.fail("got the namespace: {0} but expected namespace:{1} in the 
response".format( + namespace, constants.NODE_METRICS_NAMESPACE)) + + responseValues = responseJSON['value'] + if not responseValues: + pytest.fail("response JSON shouldnt be null or empty") + + if len(responseValues) <= 0: + pytest.fail("length of value array in the response should be greater than 0") + + for responseVal in responseValues: + metricName = responseVal['name']['value'] + if metricName != constants.NODE_MEMORY_WS_METRIC_NAME: + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_MEMORY_WS_METRIC_NAME)) + timeseries = responseVal['timeseries'] + if not timeseries: + pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( + constants.NODE_MEMORY_WS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + if len(timeseries) <= 0: + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_MEMORYE_WS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + + # node metric - memoryWorkingSetPercentage + custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( + resourceManager.rstrip("/"), + clusterResourceId, + starttime, + endtime, + constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME, + constants.NODE_METRIC_METRIC_AGGREGATION, + constants.NODE_METRICS_NAMESPACE, + constants.METRICS_API_VERSION) + + response = requests.get(custommetricsUrl, params=params, + headers=Headers, verify=False) + + if not response: + pytest.fail("response of the metrics query API shouldnt be null or empty") + + if response.status_code != 200: + pytest.fail("metrics query API failed with an error code: {}".format( + response.status_code)) + + responseJSON = response.json() + if not responseJSON: + pytest.fail("response JSON shouldnt be null or empty") + + namespace = responseJSON['namespace'] + 
if namespace != constants.NODE_METRICS_NAMESPACE: + pytest.fail("got the namespace: {0} but expected namespace:{1} in the response".format( + namespace, constants.NODE_METRICS_NAMESPACE)) + + responseValues = responseJSON['value'] + if not responseValues: + pytest.fail("response JSON shouldnt be null or empty") + + if len(responseValues) <= 0: + pytest.fail("length of value array in the response should be greater than 0") + + for responseVal in responseValues: + metricName = responseVal['name']['value'] + if metricName != constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME: + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME)) + timeseries = responseVal['timeseries'] + if not timeseries: + pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( + constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + if len(timeseries) <= 0: + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + + # node metric - cpuUsageMilliCores + custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( + resourceManager.rstrip("/"), + clusterResourceId, + starttime, + endtime, + constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME, + constants.NODE_METRIC_METRIC_AGGREGATION, + constants.NODE_METRICS_NAMESPACE, + constants.METRICS_API_VERSION) + + response = requests.get(custommetricsUrl, params=params, + headers=Headers, verify=False) + + if not response: + pytest.fail("response of the metrics query API shouldnt be null or empty") + + if response.status_code != 200: + pytest.fail("metrics query API failed with an error code: {}".format(response.status_code)) + + 
responseJSON = response.json() + if not responseJSON: + pytest.fail("response JSON shouldnt be null or empty") + + namespace = responseJSON['namespace'] + if namespace != constants.NODE_METRICS_NAMESPACE: + pytest.fail("got the namespace: {0} but expected namespace:{1} in the response".format( + namespace, constants.NODE_METRICS_NAMESPACE)) + + responseValues = responseJSON['value'] + if not responseValues: + pytest.fail("response JSON shouldnt be null or empty") + + if len(responseValues) <= 0: + pytest.fail("length of value array in the response should be greater than 0") + + for responseVal in responseValues: + metricName = responseVal['name']['value'] + if metricName != constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME: + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME)) + timeseries = responseVal['timeseries'] + if not timeseries: + pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( + constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + if len(timeseries) <= 0: + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + + # node metric - cpuUsagePercentage + custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( + resourceManager.rstrip("/"), + clusterResourceId, + starttime, + endtime, + constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME, + constants.NODE_METRIC_METRIC_AGGREGATION, + constants.NODE_METRICS_NAMESPACE, + constants.METRICS_API_VERSION) + + response = requests.get(custommetricsUrl, params=params, + headers=Headers, verify=False) + + if not response: + pytest.fail("response of the metrics query API 
shouldnt be null or empty") + + if response.status_code != 200: + pytest.fail("metrics query API failed with an error code: {}".format(response.status_code)) + + responseJSON = response.json() + if not responseJSON: + pytest.fail("response JSON shouldnt be null or empty") + + namespace = responseJSON['namespace'] + if namespace != constants.NODE_METRICS_NAMESPACE: + pytest.fail("got the namespace: {0} but expected namespace:{1} in the response".format( + namespace, constants.NODE_METRICS_NAMESPACE)) + + responseValues = responseJSON['value'] + if not responseValues: + pytest.fail("response JSON shouldnt be null or empty") + + if len(responseValues) <= 0: + pytest.fail("length of value array in the response should be greater than 0") + + for responseVal in responseValues: + metricName = responseVal['name']['value'] + if metricName != constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME: + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME)) + timeseries = responseVal['timeseries'] + if not timeseries: + pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( + constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + if len(timeseries) <= 0: + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + + # node metric - nodesCount + custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( + resourceManager.rstrip("/"), + clusterResourceId, + starttime, + endtime, + constants.NODE_COUNT_METRIC_NAME, + constants.NODE_METRIC_METRIC_AGGREGATION, + constants.NODE_METRICS_NAMESPACE, + constants.METRICS_API_VERSION) + + response = 
requests.get(custommetricsUrl, params=params, + headers=Headers, verify=False) + + if not response: + pytest.fail("response of the metrics query API shouldnt be null or empty") + + if response.status_code != 200: + pytest.fail("metrics query API failed with an error code: {}".format(response.status_code)) + + responseJSON = response.json() + if not responseJSON: + pytest.fail("response JSON shouldnt be null or empty") + + namespace = responseJSON['namespace'] + if namespace != constants.NODE_METRICS_NAMESPACE: + pytest.fail("got the namespace: {0} but expected namespace:{1} in the response".format( + namespace, constants.NODE_METRICS_NAMESPACE)) + + responseValues = responseJSON['value'] + if not responseValues: + pytest.fail("response JSON shouldnt be null or empty") + + if len(responseValues) <= 0: + pytest.fail("length of value array in the response should be greater than 0") + + for responseVal in responseValues: + metricName = responseVal['name']['value'] + if metricName != constants.NODE_COUNT_METRIC_NAME: + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_COUNT_METRIC_NAME)) + timeseries = responseVal['timeseries'] + if not timeseries: + pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( + constants.NODE_COUNT_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + if len(timeseries) <= 0: + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_COUNT_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + + append_result_output("test_node_metrics_e2e_workflow end \n", + env_dict['TEST_AGENT_LOG_FILE']) + print("Successfully completed node metrics e2e workflow test.") diff --git a/test/e2e/src/tests/test_pod_metrics_e2e_workflow.py b/test/e2e/src/tests/test_pod_metrics_e2e_workflow.py new file mode 100755 index 000000000..cd4260f76 --- /dev/null +++ 
b/test/e2e/src/tests/test_pod_metrics_e2e_workflow.py
@@ -0,0 +1,158 @@
+import pytest
+import constants
+import requests
+
+from arm_rest_utility import fetch_aad_token
+from kubernetes import client, config
+from kubernetes_pod_utility import get_pod_list
+from results_utility import append_result_output
+from datetime import datetime, timedelta
+
+pytestmark = pytest.mark.agentests
+
+
+def _utc_iso_millis(moment):
+    """Format a datetime as ISO-8601 with milliseconds and a 'Z' suffix."""
+    # isoformat() omits the fraction when microsecond == 0, so slicing its
+    # output would cut into the seconds; build the fraction explicitly.
+    return '{0}.{1:03d}Z'.format(
+        moment.strftime('%Y-%m-%dT%H:%M:%S'), moment.microsecond // 1000)
+
+
+# validation of pod metrics e2e workflows
+def test_pod_metrics_e2e_workflow(env_dict):
+    """Check that the pod count metric is returned by the metrics query API.
+
+    Looks up the cluster resource id in the agent replicaset pod's
+    environment, fetches an AAD token, queries
+    constants.POD_COUNT_METRIC_NAME over the configured look-back window and
+    fails unless the response has the expected namespace, metric name and a
+    non-empty timeseries.
+
+    env_dict keys read: TEST_AGENT_LOG_FILE,
+    DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES, TENANT_ID, CLIENT_ID,
+    CLIENT_SECRET and AZURE_ENDPOINTS ('activeDirectory', 'resourceManager').
+    """
+    print("Starting pod metrics e2e workflows test.")
+    append_result_output("test_pod_metrics_e2e_workflow start \n",
+                         env_dict['TEST_AGENT_LOG_FILE'])
+    # Loading in-cluster kube-config
+    try:
+        config.load_incluster_config()
+    except Exception as e:
+        pytest.fail("Error loading the in-cluster config: " + str(e))
+
+    # query time interval for metrics queries; the configured value drives
+    # the look-back window below rather than the constant default
+    metricQueryIntervalInMins = env_dict['DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES']
+    if not metricQueryIntervalInMins:
+        pytest.fail(
+            "DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES should not be null or empty or 0")
+    try:
+        metricQueryIntervalInMins = int(metricQueryIntervalInMins)
+    except (TypeError, ValueError):
+        pytest.fail(
+            "DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES should be an integer number of minutes")
+
+    # get the cluster resource id from replicaset pod envvars
+    api_instance = client.CoreV1Api()
+    pod_list = get_pod_list(api_instance, constants.AGENT_RESOURCES_NAMESPACE,
+                            constants.AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR)
+
+    if not pod_list:
+        pytest.fail("pod_list shouldnt be null or empty")
+
+    if not pod_list.items:
+        pytest.fail("number of items in pod list should be greater than 0")
+
+    envVars = pod_list.items[0].spec.containers[0].env
+    if not envVars:
+        pytest.fail(
+            "environment variables should be defined in the replicaset pod")
+
+    clusterResourceId = next(
+        (env.value for env in envVars if env.name == "AKS_RESOURCE_ID"), '')
+    if not clusterResourceId:
+        pytest.fail(
+            "failed to get clusterResourceId from replicaset pod environment variables")
+    print("cluster resource id: {}".format(clusterResourceId))
+
+    # fetch AAD token for metrics queries
+    tenant_id = env_dict.get('TENANT_ID')
+    authority_uri = env_dict.get('AZURE_ENDPOINTS').get(
+        'activeDirectory') + tenant_id
+    client_id = env_dict.get('CLIENT_ID')
+    client_secret = env_dict.get('CLIENT_SECRET')
+    resourceManager = env_dict.get('AZURE_ENDPOINTS').get('resourceManager')
+    aad_token = fetch_aad_token(
+        client_id, client_secret, authority_uri, resourceManager)
+    if not aad_token:
+        pytest.fail("failed to fetch AAD token")
+
+    access_token = aad_token.get('accessToken')
+    if not access_token:
+        pytest.fail("access_token shouldnt be null or empty")
+
+    # validate metrics e2e workflow
+    now = datetime.utcnow()
+    endtime = _utc_iso_millis(now)
+    starttime = _utc_iso_millis(
+        now - timedelta(minutes=metricQueryIntervalInMins))
+    Headers = {
+        "Authorization": str("Bearer " + access_token),
+        "Content-Type": "application/json",
+        "content-length": "0"
+    }
+    params = {}
+    # pod metric - PodCount
+    custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format(
+        resourceManager.rstrip("/"),
+        clusterResourceId,
+        starttime,
+        endtime,
+        constants.POD_COUNT_METRIC_NAME,
+        constants.POD_METRIC_METRIC_AGGREGATION,
+        constants.POD_METRICS_NAMESPACE,
+        constants.METRICS_API_VERSION)
+
+    # NOTE(review): verify=False turns off TLS certificate verification for a
+    # request that carries the bearer token - confirm this is still required.
+    response = requests.get(custommetricsUrl, params=params,
+                            headers=Headers, verify=False)
+
+    # A requests.Response is falsy for any 4xx/5xx status, so compare against
+    # None here and let the status check below report the actual code.
+    if response is None:
+        pytest.fail(
+            "response of the metrics query API shouldnt be null or empty")
+
+    if response.status_code != 200:
+        pytest.fail("metrics query API failed with an error code: {}".format(
+            response.status_code))
+
+    responseJSON = response.json()
+    if not responseJSON:
+        pytest.fail("response JSON shouldnt be null or empty")
+
+    namespace = responseJSON.get('namespace')
+    if namespace != constants.POD_METRICS_NAMESPACE:
+        pytest.fail("got the namespace: {0} but expected namespace:{1} in the response".format(
+            namespace, constants.POD_METRICS_NAMESPACE))
+
+    # an empty or missing value array means no metric was returned at all
+    responseValues = responseJSON.get('value')
+    if not responseValues:
+        pytest.fail("length of value array in the response should be greater than 0")
+
+    for responseVal in responseValues:
+        metricName = responseVal['name']['value']
+        if metricName != constants.POD_COUNT_METRIC_NAME:
+            pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.POD_COUNT_METRIC_NAME))
+        timeseries = responseVal['timeseries']
+        if not timeseries:
+            pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format(
+                constants.POD_COUNT_METRIC_NAME, constants.POD_METRICS_NAMESPACE))
+
+    append_result_output("test_pod_metrics_e2e_workflow end \n",
+                         env_dict['TEST_AGENT_LOG_FILE'])
+    print("Successfully completed e2e workflows test.")
diff --git a/test/e2e/src/tests/test_resource_status.py b/test/e2e/src/tests/test_resource_status.py
new file mode 100755
index 000000000..bb63dac7c
--- /dev/null
+++ b/test/e2e/src/tests/test_resource_status.py
@@ -0,0 +1,43 @@
+import pytest
+import constants
+
+from kubernetes import client, config
+from results_utility import append_result_output
+from helper import check_kubernetes_deployment_status
+from helper import check_kubernetes_daemonset_status
+from helper import check_kubernetes_pods_status
+
+pytestmark = pytest.mark.agentests
+
+# validate all the critical resources such as ds, rs, ds pods and rs pod etc.
are up and running
+def test_resource_status(env_dict):
+    """Verify that the agent deployment, daemonset and their pods are healthy.
+
+    env_dict must provide TEST_AGENT_LOG_FILE (results log path) and
+    AGENT_POD_EXPECTED_RESTART_COUNT (handed to the pod status checks).
+    """
+    print("Starting resource status check.")
+    log_file = env_dict['TEST_AGENT_LOG_FILE']
+    append_result_output("test_resource_status start \n", log_file)
+
+    # load the in-cluster kube-config before running any of the checks
+    try:
+        config.load_incluster_config()
+    except Exception as load_error:
+        pytest.fail("Error loading the in-cluster config: " + str(load_error))
+
+    # workload level: deployment first, then daemonset
+    check_kubernetes_deployment_status(
+        constants.AGENT_RESOURCES_NAMESPACE, constants.AGENT_DEPLOYMENT_NAME, log_file)
+    check_kubernetes_daemonset_status(
+        constants.AGENT_RESOURCES_NAMESPACE, constants.AGENT_DAEMONSET_NAME, log_file)
+
+    # pod level: deployment pods first, then daemonset pods
+    restart_count = env_dict['AGENT_POD_EXPECTED_RESTART_COUNT']
+    check_kubernetes_pods_status(
+        constants.AGENT_RESOURCES_NAMESPACE, constants.AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR, restart_count, log_file)
+    check_kubernetes_pods_status(
+        constants.AGENT_RESOURCES_NAMESPACE, constants.AGENT_DAEMON_SET_PODS_LABEL_SELECTOR, restart_count, log_file)
+
+    append_result_output("test_resource_status end \n", log_file)
+    print("Successfully checked resource status check.")
diff --git a/test/e2e/src/tests/test_rs_workflows.py b/test/e2e/src/tests/test_rs_workflows.py
new file mode 100755
index 000000000..aef422171
--- /dev/null
+++ b/test/e2e/src/tests/test_rs_workflows.py
@@ -0,0 +1,93 @@
+import pytest
+import constants
+
+from kubernetes import client, config
+from kubernetes_pod_utility import get_pod_list, get_log_file_content
+from results_utility import append_result_output
+from helper import check_kubernetes_deployment_status
+from helper import check_kubernetes_daemonset_status
+from helper import check_kubernetes_pods_status
+from kubernetes.stream import stream
+
+pytestmark = pytest.mark.agentests
+
+
+def _find_missing_streams(loglines, expectedStreams):
+    """Return the failure prefixes of streams whose marker is in no log line.
+
+    expectedStreams is a sequence of (emit marker, failure message prefix)
+    pairs; the returned list keeps their order.
+    """
+    return [failurePrefix for emitMarker, failurePrefix in expectedStreams
+            if not any(emitMarker in line for line in loglines)]
+
+
+# validation of replicaset agent workflows
+def test_rs_workflows(env_dict):
+    """Check that the replicaset agent pod emitted every expected stream.
+
+    Reads the log at constants.AGENT_OMSAGENT_LOG_PATH from the first pod
+    matching constants.AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR and fails the
+    test on the first stream whose emit marker is absent from that log.
+
+    env_dict must provide TEST_AGENT_LOG_FILE, the results log path.
+    """
+    print("Starting replicaset agent workflows test.")
+    append_result_output("test_rs_workflows start \n",
+                         env_dict['TEST_AGENT_LOG_FILE'])
+    # Loading in-cluster kube-config
+    try:
+        config.load_incluster_config()
+    except Exception as e:
+        pytest.fail("Error loading the in-cluster config: " + str(e))
+
+    print("getting pod list")
+    api_instance = client.CoreV1Api()
+    pod_list = get_pod_list(api_instance, constants.AGENT_RESOURCES_NAMESPACE,
+                            constants.AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR)
+    if not pod_list:
+        pytest.fail("pod_list shouldnt be null or empty")
+
+    if not pod_list.items:
+        pytest.fail("number of items in pod list should be greater than 0")
+
+    rspodName = pod_list.items[0].metadata.name
+    if not rspodName:
+        pytest.fail("replicaset pod name should not be null or empty")
+
+    logcontent = get_log_file_content(
+        api_instance, constants.AGENT_RESOURCES_NAMESPACE, rspodName, constants.AGENT_OMSAGENT_LOG_PATH)
+    if not logcontent:
+        pytest.fail("logcontent should not be null or empty for rs pod: {}".format(rspodName))
+
+    # split() returns at least one element for non-empty content, so
+    # loglines is never empty at this point.
+    loglines = logcontent.split("\n")
+
+    # (emit marker, failure message prefix) pairs, listed in the order in
+    # which a missing stream is reported.
+    expectedStreams = [
+        (constants.KUBE_POD_INVENTORY_EMIT_STREAM,
+         "KubePodInventory stream not emitted successfully from pod:"),
+        (constants.KUBE_NODE_INVENTORY_EMIT_STREAM,
+         "KubeNodeInventory stream not emitted successfully from pod:"),
+        (constants.KUBE_DEPLOYMENT_INVENTORY_EMIT_STREAM,
+         "KubeDeploymentInventory stream not emitted successfully from pod:"),
+        (constants.KUBE_CONTAINER_PERF_EMIT_STREAM,
+         "KubeContainerPerfInventory stream not emitted successfully from pod:"),
+        (constants.KUBE_SERVICES_EMIT_STREAM,
+         "KubeServicesInventory stream not emitted successfully from pod:"),
+        (constants.KUBE_CONTAINER_NODE_INVENTORY_EMIT_STREAM,
+         "ContainerNodeInventory stream not emitted successfully from pod:"),
+        (constants.KUBE_EVENTS_EMIT_STREAM,
+         "KubeEventsInventory stream not emitted successfully from rs pod:"),
+    ]
+
+    missingStreams = _find_missing_streams(loglines, expectedStreams)
+    if missingStreams:
+        # report only the first missing stream
+        pytest.fail(missingStreams[0] + rspodName)
+
+    append_result_output("test_rs_workflows end \n",
+                         env_dict['TEST_AGENT_LOG_FILE'])
+    print("Successfully completed replicaset workflows test.")