diff --git a/.pipelines/update-place-holders-in-e2e-tests.sh b/.pipelines/update-place-holders-in-e2e-tests.sh
new file mode 100755
index 000000000..5fec73684
--- /dev/null
+++ b/.pipelines/update-place-holders-in-e2e-tests.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+echo "start: update placeholders of e2e-tests.yaml ..."
+
+for ARGUMENT in "$@"
+do
+   KEY=$(echo "$ARGUMENT" | cut -f1 -d=)
+   VALUE=$(echo "$ARGUMENT" | cut -f2 -d=)
+
+   case "$KEY" in
+      TENANT_ID) TENANT_ID=$VALUE ;;
+      *)
+   esac
+done
+
+echo "start: read appid and appsecret"
+# use the same service principal which is used for ACR
+CLIENT_ID=$(cat ~/acrappid)
+CLIENT_SECRET=$(cat ~/acrappsecret)
+echo "end: read appid and appsecret"
+
+echo "Service Principal CLIENT_ID:$CLIENT_ID"
+echo "replace CLIENT_ID value"
+sed -i "s=SP_CLIENT_ID_VALUE=$CLIENT_ID=g" e2e-tests.yaml
+
+# only uncomment for debug purposes
+# echo "Service Principal CLIENT_SECRET:$CLIENT_SECRET"
+echo "replace CLIENT_SECRET value"
+sed -i "s=SP_CLIENT_SECRET_VALUE=$CLIENT_SECRET=g" e2e-tests.yaml
+
+echo "Service Principal TENANT_ID:$TENANT_ID"
+echo "replace TENANT_ID value"
+sed -i "s=SP_TENANT_ID_VALUE=$TENANT_ID=g" e2e-tests.yaml
+
+echo "end: update placeholders of e2e-tests.yaml."
diff --git a/.pipelines/validate-e2e-tests-results.sh b/.pipelines/validate-e2e-tests-results.sh
new file mode 100644
index 000000000..c38fa0f50
--- /dev/null
+++ b/.pipelines/validate-e2e-tests-results.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+echo "start: validating results of e2e-tests ..."
+DEFAULT_SONOBUOY_VERSION="0.20.0"
+DEFAULT_TIME_OUT_IN_MINS=60
+for ARGUMENT in "$@"
+do
+   KEY=$(echo "$ARGUMENT" | cut -f1 -d=)
+   VALUE=$(echo "$ARGUMENT" | cut -f2 -d=)
+
+   case "$KEY" in
+      SONOBUOY_VERSION) SONOBUOY_VERSION=$VALUE ;;
+      *)
+   esac
+done
+
+if [ -z "$SONOBUOY_VERSION" ]; then
+  SONOBUOY_VERSION=$DEFAULT_SONOBUOY_VERSION
+fi
+
+echo "sonobuoy version: ${SONOBUOY_VERSION}"
+
+echo "start: downloading sonobuoy"
+curl -LO https://github.com/vmware-tanzu/sonobuoy/releases/download/v${SONOBUOY_VERSION}/sonobuoy_${SONOBUOY_VERSION}_linux_amd64.tar.gz
+echo "end: downloading sonobuoy"
+
+echo "start: extract sonobuoy tar file"
+mkdir -p sonobuoy-install/
+tar -zxf sonobuoy_${SONOBUOY_VERSION}_*.tar.gz -C sonobuoy-install/
+echo "end: extract sonobuoy tar file"
+
+echo "start: move sonobuoy binaries to /usr/local/bin/"
+mv -f sonobuoy-install/sonobuoy /usr/local/bin/
+echo "end: move sonobuoy binaries to /usr/local/bin/"
+
+rm -rf sonobuoy_${SONOBUOY_VERSION}_*.tar.gz sonobuoy-install/
+
+mins=0
+IsSucceeded=true
+while [ $mins -le $DEFAULT_TIME_OUT_IN_MINS ]
+do
+   # check the status
+   echo "checking test status"
+   status=$(sonobuoy status)
+   status=$(echo $status | sed 's/`//g')
+   if [[ $status == *"completed"* ]]; then
+      echo "test run completed"
+      mins=$DEFAULT_TIME_OUT_IN_MINS
+      if [[ $status == *"failed"* ]]; then
+         IsSucceeded=false
+      fi
+   else
+      echo "sleep for 1m to check the status again"
+      sleep 1m
+   fi
+   mins=$(( $mins + 1 ))
+done
+echo "succeeded:${IsSucceeded}"
+
+results=$(sonobuoy retrieve)
+sonobuoy results $results
+
+echo "end: validating results of e2e-tests."
+
+if [ "$IsSucceeded" == true ]; then
+   echo "all tests passed"
+   exit 0
+else
+   echo "tests failed. Please review the results by downloading the tar file via the sonobuoy retrieve command"
+   exit 1
+fi
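Taken together, the two scripts above form the render-run-validate loop for the e2e stage. A minimal sketch of how a pipeline step might chain them; the working directory, the relative script paths, and the TENANT_ID value are illustrative assumptions, not taken from the actual pipeline definition:

```sh
#!/bin/bash
# Hypothetical pipeline step: render the manifest, launch the tests, then gate on the outcome.
cd test/e2e                                                             # folder containing e2e-tests.yaml
../../.pipelines/update-place-holders-in-e2e-tests.sh TENANT_ID=<tenantId>
kubectl apply -f e2e-tests.yaml                                         # starts the sonobuoy-driven test job
../../.pipelines/validate-e2e-tests-results.sh SONOBUOY_VERSION=0.20.0  # exits non-zero if any test failed
```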
diff --git a/README.md b/README.md
index 3eec1f344..3564345ee 100644
--- a/README.md
+++ b/README.md
@@ -91,6 +91,7 @@ The general directory structure is:
 │   │   |  ...                  - plugins in, out and filters code in ruby
 │   ├── toml-parser/            - code for parsing of toml configuration files
 ├── test/                       - source code for tests
+│   ├── e2e/                    - e2e tests to validate agent and e2e workflow(s)
 │   ├── unit-tests/             - unit tests code
 │   ├── scenario/               - scenario tests code
 ├── !_README.md                 - this file
@@ -271,6 +272,36 @@ For DEV and PROD branches, automatically deployed latest yaml with latest agent
 
 # E2E Tests
 
+## For executing tests
+
+1. Deploy the omsagent.yaml with your agent image. In the yaml, make sure the `ISTEST` environment variable is set to `true` if it's not set already
+2. Update the Service Principal CLIENT_ID, CLIENT_SECRET and TENANT_ID placeholder values and apply e2e-tests.yaml to execute the tests
+   > Note: the Service Principal requires the reader role on the Log Analytics workspace and on the cluster resource to query LA and metrics
+   ```
+   cd ~/Docker-Provider/test/e2e # based on your repo path
+   kubectl apply -f e2e-tests.yaml # this will trigger a job to run the tests in the sonobuoy namespace
+   kubectl get po -n sonobuoy # to check the pods and jobs associated to the tests
+   ```
+3. Download [sonobuoy](https://github.com/vmware-tanzu/sonobuoy/releases) on your dev box to view the results of the tests
+   ```
+   results=$(sonobuoy retrieve) # downloads the tar file which has logs and test results
+   sonobuoy results $results # get the summary of the results
+   tar -xzvf $results # extract the downloaded tar file and look for pod logs, results and other k8s resources if there are any failures
+   ```
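Before applying e2e-tests.yaml, it can help to confirm the agent really is running in test mode; a sketch assuming the default `omsagent` daemonset name in `kube-system` and the `ISTEST` env var as defined in omsagent.yaml in this repo:

```sh
# Expect "true" if step 1 was done correctly
kubectl -n kube-system get ds omsagent \
  -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="ISTEST")].value}'
```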
+
+## For adding new tests
+
+1. Add the test python file with your test code under the `tests` directory
+2. Build the docker image, recommended to use ACR & MCR
+   ```
+   cd ~/Docker-Provider/test/e2e/src # based on your repo path
+   docker login <acr-name>.azurecr.io -u <username> -p <password> # login to acr
+   docker build -f ./core/Dockerfile -t <repo>/<image-name>:<image-tag> .
+   docker push <repo>/<image-name>:<image-tag>
+   ```
+3. Update the existing agentest image tag in e2e-tests.yaml with the newly built image tag with the MCR repo
+
+# Scenario Tests
 Clusters used in the release pipeline already have the yamls under test\scenario deployed. Make sure to validate these scenarios. If you have new interesting scenarios, please add/update them.
diff --git a/ReleaseNotes.md b/ReleaseNotes.md
index 2afe16481..80d6f188d 100644
--- a/ReleaseNotes.md
+++ b/ReleaseNotes.md
@@ -10,6 +10,20 @@ additional questions or comments.
 ## Release History
 Note : The agent version(s) below has dates (ciprod<mmddyyyy>), which indicate the agent build dates (not release dates)
 
+### 02/23/2021 -
+##### Version microsoft/oms:ciprod02232021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod02232021 (linux)
+##### Version microsoft/oms:win-ciprod02232021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod02232021 (windows)
+##### Code change log
+- ContainerLogV2 schema support for LogAnalytics & ADX (not usable externally yet)
+- Fix nodemetrics (cpuusagepercentage & memoryusagepercentage) metrics not flowing. This is fixed upstream for k8s versions >= 1.19.7 and >= 1.20.2.
+- Fix cpu & memory usage exceeded threshold container metrics not flowing when requests and/or limits were not set
+- Mute some unused exceptions from going to telemetry
+- Collect containerimage (repository, image & imagetag) from spec (instead of runtime)
+- Add support for extension MSI for k8s arc
+- Use cloud specific instrumentation keys for telemetry
+- Picked up newer version for apt
+- Add priority class to daemonset (in our chart only)
+
 ### 01/11/2021 -
 ##### Version microsoft/oms:ciprod01112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01112021 (linux)
 ##### Version microsoft/oms:win-ciprod01112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod01112021 (windows)
@@ -27,68 +41,6 @@ Note : The agent version(s) below has dates (ciprod), which indicate t
 - Enable ADX route for windows container logs
 - Remove logging to termination log in windows agent liveness probe
 
-
-### 11/09/2020 -
-##### Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020 (linux)
-##### Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod11092020 (windows)
-##### Code change log
-- Fix for duplicate windows metrics
-
-### 10/27/2020 -
-##### Version microsoft/oms:ciprod10272020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10272020 (linux)
-##### Version microsoft/oms:win-ciprod10272020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10052020 (windows)
-##### Code change log
-- Activate oneagent in few AKS regions (koreacentral,norwayeast)
-- Disable syslog
-- Fix timeout for Windows daemonset liveness probe
-- Make request == limit for Windows daemonset resources (cpu & memory)
-- Schema v2 for container log (ADX only - applicable only for select customers for piloting)
-
-### 10/05/2020 -
-##### Version microsoft/oms:ciprod10052020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10052020 (linux)
-##### Version microsoft/oms:win-ciprod10052020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10052020 (windows)
-##### Code change log
-- Health CRD to version v1 (from v1beta1) for k8s versions >= 1.19.0
-- Collection of PV usage metrics for PVs mounted by pods (kube-system pods excluded by default)(doc-link-needed)
-- Zero fill few custom metrics under a timer, also add zero filling for new PV usage metrics
-- Collection of additional Kubelet metrics ('kubelet_running_pod_count','volume_manager_total_volumes','kubelet_node_config_error','process_resident_memory_bytes','process_cpu_seconds_total','kubelet_runtime_operations_total','kubelet_runtime_operations_errors_total'). This also includes updates to 'kubelet' workbook to include these new metrics
-- Collection of Azure NPM (Network Policy Manager) metrics (basic & advanced. By default, NPM metrics collection is turned OFF)(doc-link-needed)
-- Support log collection when docker root is changed with knode. Tracked by [this](https://github.com/Azure/AKS/issues/1373) issue
-- Support for Pods in 'Terminating' state for nodelost scenarios
-- Fix for reduction in telemetry for custom metrics ingestion failures
-- Fix CPU capacity/limits metrics being 0 for Virtual nodes (VK)
-- Add new custom metric regions (eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth)
-- Enable strict SSL validation for AppInsights Ruby SDK
-- Turn off custom metrics upload for unsupported cluster types
-- Install CA certs from wire server for windows (in certain clouds)
-
-### 09/16/2020 -
-> Note: This agent release targetted ONLY for non-AKS clusters via Azure Monitor for containers HELM chart update
-##### Version microsoft/oms:ciprod09162020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod09162020 (linux)
-##### Version microsoft/oms:win-ciprod09162020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod09162020 (windows)
-##### Code change log
-- Collection of Azure Network Policy Manager Basic and Advanced metrics
-- Add support in Windows Agent for Container log collection of CRI runtimes such as ContainerD
-- Alertable metrics support Arc K8s cluster to parity with AKS
-- Support for multiple container log mount paths when docker is updated through knode
-- Bug fix related to MDM telemetry
-
-### 08/07/2020 -
-##### Version microsoft/oms:ciprod08072020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08072020 (linux)
-##### Version microsoft/oms:win-ciprod08072020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod08072020 (windows)
-##### Code change log
-- Collection of KubeState metrics for deployments and HPA
-- Add the Proxy support for Windows agent
-- Fix for ContainerState in ContainerInventory to handle Failed state and collection of environment variables for terminated and failed containers
-- Change /spec to /metrics/cadvisor endpoint to collect node capacity metrics
-- Disable Health Plugin by default and can enabled via configmap
-- Pin version of jq to 1.5+dfsg-2
-- Bug fix for showing node as 'not ready' when there is disk pressure
-- oneagent integration (disabled by default)
-- Add region check before sending alertable metrics to MDM
-- Telemetry fix for agent telemetry for sov. clouds
-
-
 ### 11/09/2020 -
 ##### Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020 (linux)
 ##### Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod11092020 (windows)
 ##### Code change log
 - Fix for duplicate windows metrics
@@ -97,7 +49,7 @@ Note : The agent version(s) below has dates (ciprod), which indicate t
 ### 10/27/2020 -
 ##### Version microsoft/oms:ciprod10272020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10272020 (linux)
-##### Version microsoft/oms:win-ciprod10272020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10052020 (windows)
+##### Version microsoft/oms:win-ciprod10272020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10272020 (windows)
 ##### Code change log
 - Activate oneagent in few AKS regions (koreacentral,norwayeast)
 - Disable syslog
diff --git a/build/common/installer/scripts/tomlparser.rb b/build/common/installer/scripts/tomlparser.rb
index fe26f639e..a0f3c2f0a 100644
--- a/build/common/installer/scripts/tomlparser.rb
+++ b/build/common/installer/scripts/tomlparser.rb
@@ -23,6 +23,7 @@
 @logExclusionRegexPattern = "(^((?!stdout|stderr).)*$)"
 @excludePath = "*.csv2" #some invalid path
 @enrichContainerLogs = false
+@containerLogSchemaVersion = ""
 @collectAllKubeEvents = false
 @containerLogsRoute = ""
 
@@ -138,6 +139,16 @@ def populateSettingValuesFromConfigMap(parsedConfig)
       ConfigParseErrorLogger.logError("Exception while reading config map settings for cluster level container log enrichment - #{errorStr}, using defaults, please check config map for errors")
     end
 
+    #Get container log schema version setting
+    begin
+      if !parsedConfig[:log_collection_settings][:schema].nil? && !parsedConfig[:log_collection_settings][:schema][:containerlog_schema_version].nil?
+        @containerLogSchemaVersion = parsedConfig[:log_collection_settings][:schema][:containerlog_schema_version]
+        puts "config::Using config map setting for container log schema version"
+      end
+    rescue => errorStr
+      ConfigParseErrorLogger.logError("Exception while reading config map settings for container log schema version - #{errorStr}, using defaults, please check config map for errors")
+    end
+
     #Get kube events enrichment setting
     begin
       if !parsedConfig[:log_collection_settings][:collect_all_kube_events].nil? && !parsedConfig[:log_collection_settings][:collect_all_kube_events][:enabled].nil?
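For context, a sketch of how a cluster would opt into this setting. The configmap name `container-azm-ms-agentconfig` and the `log-data-collection-settings` data key follow the agent's existing configmap conventions, but verify both against the shipped configmap template before relying on them:

```sh
# Hypothetical: enable the ContainerLogV2 schema; the TOML section/key mirror
# parsedConfig[:log_collection_settings][:schema][:containerlog_schema_version] read above.
kubectl create configmap container-azm-ms-agentconfig -n kube-system \
  --from-literal=log-data-collection-settings='
[log_collection_settings.schema]
  containerlog_schema_version = "v2"'
```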
@@ -200,6 +211,7 @@ def populateSettingValuesFromConfigMap(parsedConfig)
       file.write("export AZMON_CLUSTER_CONTAINER_LOG_ENRICH=#{@enrichContainerLogs}\n")
       file.write("export AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS=#{@collectAllKubeEvents}\n")
       file.write("export AZMON_CONTAINER_LOGS_ROUTE=#{@containerLogsRoute}\n")
+      file.write("export AZMON_CONTAINER_LOG_SCHEMA_VERSION=#{@containerLogSchemaVersion}\n")
       # Close file after writing all environment variables
       file.close
       puts "Both stdout & stderr log collection are turned off for namespaces: '#{@excludePath}' "
@@ -246,6 +258,8 @@ def get_command_windows(env_variable_name, env_variable_value)
     file.write(commands)
     commands = get_command_windows('AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE', @containerLogsRoute)
     file.write(commands)
+    commands = get_command_windows('AZMON_CONTAINER_LOG_SCHEMA_VERSION', @containerLogSchemaVersion)
+    file.write(commands)
     # Close file after writing all environment variables
     file.close
diff --git a/build/version b/build/version
index 711a96921..2da3efa39 100644
--- a/build/version
+++ b/build/version
@@ -2,11 +2,11 @@
 
 # Build Version Information
 
-CONTAINER_BUILDVERSION_MAJOR=12
+CONTAINER_BUILDVERSION_MAJOR=13
 CONTAINER_BUILDVERSION_MINOR=0
 CONTAINER_BUILDVERSION_PATCH=0
 CONTAINER_BUILDVERSION_BUILDNR=0
-CONTAINER_BUILDVERSION_DATE=20210111
+CONTAINER_BUILDVERSION_DATE=20210223
 CONTAINER_BUILDVERSION_STATUS=Developer_Build
 
 #-------------------------------- End of File -----------------------------------
diff --git a/charts/azuremonitor-containers/Chart.yaml b/charts/azuremonitor-containers/Chart.yaml
index a809a4e69..ce64fd1ce 100644
--- a/charts/azuremonitor-containers/Chart.yaml
+++ b/charts/azuremonitor-containers/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v1
 appVersion: 7.0.0-1
 description: Helm chart for deploying Azure Monitor container monitoring agent in Kubernetes
 name: azuremonitor-containers
-version: 2.8.0
+version: 2.8.1
 kubeVersion: "^1.10.0-0"
 keywords:
   - monitoring
diff --git a/charts/azuremonitor-containers/README.md b/charts/azuremonitor-containers/README.md
index 469fac94a..a3f17b509 100644
--- a/charts/azuremonitor-containers/README.md
+++ b/charts/azuremonitor-containers/README.md
@@ -93,6 +93,7 @@ The following table lists the configurable parameters of the MSOMS chart and th
 | `omsagent.env.clusterName` | Name of your cluster | Does not have a default value, needs to be provided |
 | `omsagent.rbac` | rbac enabled/disabled | true (i.e. enabled) |
 | `omsagent.proxy` | Proxy endpoint | Doesn't have a default value. Refer to [configure proxy](#Configuring-Proxy-Endpoint) |
+| `omsagent.priority` | DaemonSet pod priority | The [priority](https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/) used for the daemonsets so that they get scheduled onto the node ahead of "normal" pods - must be an integer, defaults to 10 |
 
 > Note: For Azure managed K8s clusters such as Azure Arc K8s and ARO v4, `omsagent.env.clusterId` with the fully qualified azure resource id of the cluster should be used instead of `omsagent.env.clusterName`
 
@@ -100,6 +101,7 @@ The following table lists the configurable parameters of the MSOMS chart and th
 
 - Parameter `omsagent.env.doNotCollectKubeSystemLogs` has been removed starting chart version 1.0.0. Refer to 'Agent data collection settings' section below to configure it using configmap.
 - Onboarding of multiple clusters with the same cluster name to the same Log Analytics workspace is not supported. If you need this configuration, use the cluster FQDN rather than the cluster DNS prefix to avoid a collision with clusterName.
+- The `omsagent.priority` parameter sets the priority of the omsagent daemonset priority class. This pod priority class gives the daemonset pods priority over pods that can be scheduled elsewhere. Without a priority class, it is possible for a node to fill up with "normal" pods before the daemonset pods get created for the node or get scheduled. Note that these are not "daemonset" pods in any special scheduling sense - they are just pods created by the daemonset controller with an affinity, set during creation, to the specific node each pod was created to run on. Set this value greater than 0 (the default is 10), and generally greater than the priority of pods that have the flexibility to run on different nodes, so that they do not block the node-specific pods.
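As a usage note, the value can be raised at install or upgrade time when other workloads on the cluster already use priority classes of their own; a sketch from a checkout of this repo (the release name and the value 100 are illustrative, and the usual workspace id/key settings are omitted for brevity):

```sh
# Schedule the omsagent daemonset pods ahead of other priority classes in use
helm upgrade --install azmon-containers charts/azuremonitor-containers \
  --namespace kube-system \
  --set omsagent.priority=100
```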
 
 ## Agent data collection settings
 
diff --git a/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml b/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml
index ebdd5ea3f..b7482b8b5 100644
--- a/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml
+++ b/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml
@@ -1,4 +1,18 @@
 {{- if or ( contains "microsoft.kubernetes/connectedclusters" (.Values.Azure.Cluster.ResourceId | lower) ) ( contains "microsoft.kubernetes/connectedclusters" (.Values.omsagent.env.clusterId | lower)) }}
+#extension model
+{{- if not (empty .Values.Azure.Extension.Name) }}
+apiVersion: clusterconfig.azure.com/v1beta1
+kind: AzureExtensionIdentity
+metadata:
+  name: {{ .Values.Azure.Extension.Name }}
+  namespace: azure-arc
+spec:
+  serviceAccounts:
+    - name: omsagent
+      namespace: kube-system
+  tokenNamespace: azure-arc
+---
+{{- end }}
 apiVersion: clusterconfig.azure.com/v1beta1
 kind: AzureClusterIdentityRequest
 metadata:
@@ -6,4 +20,7 @@ metadata:
   namespace: azure-arc
 spec:
   audience: https://monitoring.azure.com/
+  {{- if not (empty .Values.Azure.Extension.Name) }}
+  resourceId: {{ .Values.Azure.Extension.Name }}
+  {{- end }}
 {{- end }}
diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml
index 81003c704..82d210f3d 100644
--- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml
+++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml
@@ -27,10 +27,11 @@ spec:
       checksum/secret: {{ include (print $.Template.BasePath "/omsagent-secret.yaml") . | sha256sum }}
       checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }}
     spec:
-      dnsConfig:
+      priorityClassName: omsagent
+      dnsConfig:
         options:
           - name: ndots
-            value: "3"
+            value: "3"
 {{- if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion }}
       nodeSelector:
         kubernetes.io/os: windows
diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml
index 3d29ede42..615cd0485 100644
--- a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml
+++ b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml
@@ -28,10 +28,11 @@ spec:
       checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }}
      checksum/logsettings: {{ toYaml .Values.omsagent.logsettings | sha256sum }}
     spec:
-      dnsConfig:
+      priorityClassName: omsagent
+      dnsConfig:
         options:
           - name: ndots
-            value: "3"
+            value: "3"
 {{- if .Values.omsagent.rbac }}
       serviceAccountName: omsagent
 {{- end }}
@@ -70,6 +71,10 @@ spec:
             valueFrom:
               fieldRef:
                 fieldPath: status.hostIP
+        {{- if not (empty .Values.Azure.Extension.Name) }}
+        - name: ARC_K8S_EXTENSION_NAME
+          value: {{ .Values.Azure.Extension.Name | quote }}
+        {{- end }}
         - name: USER_ASSIGNED_IDENTITY_CLIENT_ID
           value: ""
 {{- if .Values.omsagent.logsettings.logflushintervalsecs }}
@@ -84,6 +89,8 @@ spec:
         - name: FBIT_TAIL_BUFFER_MAX_SIZE
           value: {{ .Values.omsagent.logsettings.tailbufmaxsizemegabytes | quote }}
 {{- end }}
+        - name: ISTEST
+          value: {{ .Values.omsagent.ISTEST | quote }}
         securityContext:
           privileged: true
         ports:
diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml
index 8609d25c9..012dd2720 100644
--- a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml
+++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml
@@ -67,8 +67,14 @@ spec:
             valueFrom:
               fieldRef:
                 fieldPath: status.hostIP
+        {{- if not (empty .Values.Azure.Extension.Name) }}
+        - name: ARC_K8S_EXTENSION_NAME
+          value: {{ .Values.Azure.Extension.Name | quote }}
+        {{- end }}
         - name: USER_ASSIGNED_IDENTITY_CLIENT_ID
-          value: ""
+          value: ""
+        - name: ISTEST
+          value: {{ .Values.omsagent.ISTEST | quote }}
         securityContext:
           privileged: true
         ports:
diff --git a/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml b/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml
new file mode 100644
index 000000000..4d9980ab3
--- /dev/null
+++ b/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml
@@ -0,0 +1,22 @@
+{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId "") (ne .Values.Azure.Cluster.ResourceId "") )}}
+# This pod priority class is used for daemonsets to allow them to have priority
+# over pods that can be scheduled elsewhere. Without a priority class, it is
+# possible for a node to fill up with pods before the daemonset pods get to be
+# created for the node or get scheduled. Note that pods are not "daemonset"
+# pods - they are just pods created by the daemonset controller but they have
+# a specific affinity set during creation to the specific node each pod was
+# created to run on (the daemonset controller takes care of that)
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: omsagent
+  # Priority classes don't have labels :-)
+  annotations:
+    chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
+    release: {{ .Release.Name }}
+    heritage: {{ .Release.Service }}
+    component: oms-agent
+value: {{ .Values.omsagent.priority }}
+globalDefault: false
+description: "This is the daemonset priority class for omsagent"
+{{- end }}
diff --git a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml
index bd4e9baf3..5db5c2dab 100644
--- a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml
+++ b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml
@@ -33,10 +33,14 @@ rules:
     verbs: ["get", "create", "patch"]
   - nonResourceURLs: ["/metrics"]
     verbs: ["get"]
+#arc k8s extension model grants access as part of the extension msi
+#remove this explicit permission once the extension is available in public preview
+{{- if (empty .Values.Azure.Extension.Name) }}
   - apiGroups: [""]
     resources: ["secrets"]
     resourceNames: ["container-insights-clusteridentityrequest-token"]
     verbs: ["get"]
+{{- end }}
 ---
 kind: ClusterRoleBinding
 apiVersion: rbac.authorization.k8s.io/v1beta1
diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml
index 52e80dff8..afe789d56 100644
--- a/charts/azuremonitor-containers/values.yaml
+++ b/charts/azuremonitor-containers/values.yaml
@@ -4,19 +4,40 @@
 ## Microsoft OMS Agent image for kubernetes cluster monitoring
 ## ref: https://github.com/microsoft/Docker-Provider/tree/ci_prod
 
-## Values of ResourceId and Region under Azure->Cluster being populated by Azure Arc K8s RP during the installation of the extension
+## Values under Azure are populated by the Azure Arc K8s RP during the installation of the extension
 Azure:
   Cluster:
     Region:
     ResourceId:
+  Extension:
+    Name: ""
+    ResourceId: ""
 omsagent:
   image:
     repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod"
-    tag: "ciprod01112021"
-    tagWindows: "win-ciprod01112021"
+    tag: "ciprod02232021"
+    tagWindows: "win-ciprod02232021"
     pullPolicy: IfNotPresent
-  dockerProviderVersion: "12.0.0-0"
+  dockerProviderVersion: "13.0.0-0"
   agentVersion: "1.10.0.1"
+
+  # The priority used by the omsagent priority class for the daemonset pods.
+  # Note that this is not execution priority - it is scheduling priority, as
+  # in getting scheduled to the node. This needs to be greater than 0 so
+  # that the daemonset pods, which cannot schedule onto different nodes as
+  # they are defined to run on specific nodes, are not accidentally frozen
+  # out of a node due to other pods showing up earlier in scheduling.
+  # (DaemonSet pods by definition are only created once the node exists for
+  # them to be created for, and thus it is possible to have "normal" pods
+  # already in line to run on the node before the DaemonSet controller got a
+  # chance to build a pod for the node and give it to the scheduler.)
+  # Should be some number greater than the default (0)
+  priority: 10
+
+  # This is used for running agent pods in test mode.
+  # If set to true, additional agent workflow logs will be emitted which are used for e2e and arc k8s conformance testing
+  ISTEST: false
+
 ## To get your workspace id and key do the following
 ## You can create a Azure Loganalytics workspace from portal.azure.com and get its ID & PRIMARY KEY from 'Advanced Settings' tab in the Ux.
 
@@ -82,6 +103,21 @@ omsagent:
             operator: NotIn
             values:
               - virtual-kubelet
+      nodeSelectorTerms:
+        - labelSelector:
+          matchExpressions:
+            - key: kubernetes.io/os
+              operator: In
+              values:
+                - linux
+            - key: type
+              operator: NotIn
+              values:
+                - virtual-kubelet
+            - key: kubernetes.io/arch
+              operator: In
+              values:
+                - amd64
       nodeSelectorTerms:
         - labelSelector:
           matchExpressions:
@@ -93,10 +129,10 @@ omsagent:
             operator: NotIn
             values:
               - virtual-kubelet
-          - key: beta.kubernetes.io/arch
+          - key: beta.kubernetes.io/arch
             operator: In
             values:
-              - amd64
+              - amd64
   deployment:
     affinity:
       nodeAffinity:
@@ -125,10 +161,10 @@ omsagent:
             operator: NotIn
             values:
              - master
-          - key: kubernetes.io/arch
+          - key: kubernetes.io/arch
             operator: In
             values:
-              - amd64
+              - amd64
       nodeSelectorTerms:
         - labelSelector:
           matchExpressions:
@@ -144,10 +180,10 @@ omsagent:
             operator: NotIn
             values:
              - master
-          - key: beta.kubernetes.io/arch
+          - key: beta.kubernetes.io/arch
             operator: In
             values:
-              - amd64
+              - amd64
 ## Configure resource requests and limits
 ## ref: http://kubernetes.io/docs/user-guide/compute-resources/
 ##
diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile
index 2e1118922..bee718a31 100644
--- a/kubernetes/linux/Dockerfile
+++ b/kubernetes/linux/Dockerfile
@@ -2,7 +2,7 @@ FROM ubuntu:18.04
 MAINTAINER OMSContainers@microsoft.com
 LABEL vendor=Microsoft\ Corp \
     com.microsoft.product="Azure Monitor for containers"
-ARG IMAGE_TAG=ciprod01112021
+ARG IMAGE_TAG=ciprod02232021
 ENV AGENT_VERSION ${IMAGE_TAG}
 ENV tmpdir /opt
 ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi
diff --git a/kubernetes/linux/dockerbuild/build-and-publish-docker-image.sh b/kubernetes/linux/dockerbuild/build-and-publish-docker-image.sh
old mode 100644
new mode 100755
diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh
index 2f88c07ac..c4067f25e 100644
--- a/kubernetes/linux/main.sh
+++ b/kubernetes/linux/main.sh
@@ -161,6 +161,39 @@ fi
 export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT
 echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >> ~/.bashrc
 
+# Check if the instrumentation key needs to be fetched from a storage account (as in airgapped clouds)
+if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then  # (check if APPLICATIONINSIGHTS_AUTH_URL has length >= 1)
+      for BACKOFF in {1..4}; do
+            KEY=$(curl -sS $APPLICATIONINSIGHTS_AUTH_URL)
+            # rather than parsing the HTTP status code out of curl, just check if the result is well formatted
+            if [[ $KEY =~ ^[A-Za-z0-9=]+$ ]]; then
+                  break
+            else
+                  sleep $((2 ** $BACKOFF / 4))  # (exponential backoff: 0, 1, 2, 4 seconds because of integer division)
+            fi
+      done
+
+      # validate that the retrieved data is an instrumentation key
+      if [[ $KEY =~ ^[A-Za-z0-9=]+$ ]]; then
+            export APPLICATIONINSIGHTS_AUTH=$KEY
+            echo "export APPLICATIONINSIGHTS_AUTH=$APPLICATIONINSIGHTS_AUTH" >> ~/.bashrc
+            echo "Using cloud-specific instrumentation key"
+      else
+            # no ikey can be retrieved. Disable telemetry and continue
+            export DISABLE_TELEMETRY=true
+            echo "export DISABLE_TELEMETRY=true" >> ~/.bashrc
+            echo "Could not get cloud-specific instrumentation key (network error?). Disabling telemetry"
+      fi
+fi
+
+
+aikey=$(echo $APPLICATIONINSIGHTS_AUTH | base64 --decode)
+export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey
+echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc
+
+source ~/.bashrc
+
+
 #Parse the configmap to set the right environment variables.
 /opt/microsoft/omsagent/ruby/bin/ruby tomlparser.rb
 
@@ -190,15 +223,6 @@ cat integration_npm_config_env_var | while read line; do
       #echo $line
       echo $line >> ~/.bashrc
 done
 source integration_npm_config_env_var
 
-#Parse the configmap to set the right environment variables for network policy manager (npm) integration.
-/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-npm-config.rb
-
-cat integration_npm_config_env_var | while read line; do
-      #echo $line
-      echo $line >> ~/.bashrc
-done
-source integration_npm_config_env_var
-
 #Replace the placeholders in td-agent-bit.conf file for fluentbit with custom/default values in daemonset
 if [ ! -e "/etc/config/kube.conf" ]; then
       /opt/microsoft/omsagent/ruby/bin/ruby td-agent-bit-conf-customizer.rb
@@ -521,6 +545,7 @@ if [ ! -e "/etc/config/kube.conf" ]; then
           echo "starting mdsd ..."
           mdsd -l -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos &
+          touch /opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2
       fi
 fi
 
@@ -589,11 +614,6 @@ echo "export HOST_ETC=/hostfs/etc" >> ~/.bashrc
 export HOST_VAR=/hostfs/var
 echo "export HOST_VAR=/hostfs/var" >> ~/.bashrc
 
-aikey=$(echo $APPLICATIONINSIGHTS_AUTH | base64 --decode)
-export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey
-echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc
-
-source ~/.bashrc
 
 #start telegraf
 /opt/telegraf --config $telegrafConfFile &
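The same fetch-then-validate pattern reappears in the Windows agent (main.ps1, further down in this diff). As a quick sanity check of the validation regex and the decode step, using the base64-encoded default from the Linux Dockerfile above:

```sh
# Validate and decode an Application Insights auth value the way main.sh does
AUTH="NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi"   # default from kubernetes/linux/Dockerfile
if [[ $AUTH =~ ^[A-Za-z0-9=]+$ ]]; then
  echo "well formed; decoded ikey: $(echo "$AUTH" | base64 --decode)"
else
  echo "malformed; telemetry would be disabled"
fi
```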
diff --git a/kubernetes/linux/mdsd.xml b/kubernetes/linux/mdsd.xml
index 76d2104fc..49d329791 100644
--- a/kubernetes/linux/mdsd.xml
+++ b/kubernetes/linux/mdsd.xml
[XML hunk bodies lost to extraction (element markup stripped); the only surviving fragments are comment text such as "... priority events to be delivered sooner than the next five-minute interval. -->" and trailing "]]>" CDATA terminators.]
diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh
index 352be06d7..fe6c0565a 100644
--- a/kubernetes/linux/setup.sh
+++ b/kubernetes/linux/setup.sh
@@ -2,8 +2,8 @@ TMPDIR="/opt"
 cd $TMPDIR
 
 #Download utf-8 encoding capability on the omsagent container.
-
-apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y locales
+#upgrade apt to latest version
+apt-get update && apt-get install -y apt && DEBIAN_FRONTEND=noninteractive apt-get install -y locales
 
 sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \
     dpkg-reconfigure --frontend=noninteractive locales && \
diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml
index 67bd9cdde..ebf0257af 100644
--- a/kubernetes/omsagent.yaml
+++ b/kubernetes/omsagent.yaml
@@ -358,7 +358,7 @@ spec:
         tier: node
       annotations:
         agentVersion: "1.10.0.1"
-        dockerProviderVersion: "12.0.0-0"
+        dockerProviderVersion: "13.0.0-0"
         schema-versions: "v1"
     spec:
       serviceAccountName: omsagent
@@ -368,7 +368,7 @@ spec:
             value: "3"
       containers:
         - name: omsagent
-          image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01112021"
+          image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod02232021"
          imagePullPolicy: IfNotPresent
           resources:
             limits:
@@ -383,6 +383,9 @@ spec:
               value: "VALUE_AKS_RESOURCE_ID_VALUE"
             - name: AKS_REGION
               value: "VALUE_AKS_RESOURCE_REGION_VALUE"
+            # this is used for e2e tests; setting it just emits some additional log statements used by the e2e tests
+            - name: ISTEST
+              value: "true"
             #Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters
             #- name: ACS_RESOURCE_NAME
             #  value: "my_acs_cluster_name"
@@ -521,13 +524,13 @@ spec:
         rsName: "omsagent-rs"
       annotations:
         agentVersion: "1.10.0.1"
-        dockerProviderVersion: "12.0.0-0"
+        dockerProviderVersion: "13.0.0-0"
         schema-versions: "v1"
     spec:
       serviceAccountName: omsagent
       containers:
         - name: omsagent
-          image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01112021"
+          image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod02232021"
           imagePullPolicy: IfNotPresent
           resources:
             limits:
@@ -541,6 +544,9 @@ spec:
               value: "VALUE_AKS_RESOURCE_ID_VALUE"
             - name: AKS_REGION
               value: "VALUE_AKS_RESOURCE_REGION_VALUE"
+            # this is used for e2e tests; setting it just emits some additional log statements used by the e2e tests
+            - name: ISTEST
+              value: "true"
             # Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters
             #- name: ACS_RESOURCE_NAME
             #  value: "my_acs_cluster_name"
@@ -675,7 +681,7 @@ spec:
         tier: node-win
       annotations:
         agentVersion: "1.10.0.1"
-        dockerProviderVersion: "12.0.0-0"
+        dockerProviderVersion: "13.0.0-0"
         schema-versions: "v1"
     spec:
       serviceAccountName: omsagent
@@ -685,7 +691,7 @@ spec:
             value: "3"
       containers:
         - name: omsagent-win
-          image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod01112021"
+          image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod02232021"
           imagePullPolicy: IfNotPresent
           resources:
             limits:
diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile
index f852bd236..d4f118449 100644
--- a/kubernetes/windows/Dockerfile
+++ b/kubernetes/windows/Dockerfile
@@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com
 LABEL vendor=Microsoft\ Corp \
     com.microsoft.product="Azure Monitor for containers"
 
-ARG IMAGE_TAG=win-ciprod01112021
+ARG IMAGE_TAG=win-ciprod02232021
 
 # Do not split this into multiple RUN!
 # Docker creates a layer for every RUN-Statement
diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1
index a297e3801..722392157 100644
--- a/kubernetes/windows/main.ps1
+++ b/kubernetes/windows/main.ps1
@@ -119,10 +119,48 @@ function Set-EnvironmentVariables {
         $env:AZMON_AGENT_CFG_SCHEMA_VERSION
     }
 
-    # Set environment variable for TELEMETRY_APPLICATIONINSIGHTS_KEY
-    $aiKey = [System.Text.Encoding]::UTF8.GetString([System.Convert]::FromBase64String($env:APPLICATIONINSIGHTS_AUTH))
-    [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKey, "Process")
-    [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKey, "Machine")
+    # Check if the instrumentation key needs to be fetched from a storage account (as in airgapped clouds)
+    $aiKeyUrl = [System.Environment]::GetEnvironmentVariable('APPLICATIONINSIGHTS_AUTH_URL')
+    if ($aiKeyUrl) {
+        $aiKeyFetched = ""
+        # retry up to 4 times
+        for ($i = 1; $i -le 4; $i++) {
+            try {
+                $response = Invoke-WebRequest -uri $aiKeyUrl -UseBasicParsing -TimeoutSec 5 -ErrorAction:Stop
+
+                if ($response.StatusCode -ne 200) {
+                    Write-Host "Expecting response code 200, was: $($response.StatusCode), retrying"
+                    Start-Sleep -Seconds ([MATH]::Pow(2, $i) / 4)
+                }
+                else {
+                    $aiKeyFetched = $response.Content
+                    break
+                }
+            }
+            catch {
+                Write-Host "Exception encountered fetching instrumentation key:"
+                Write-Host $_.Exception
+            }
+        }
+
+        # Check if the fetched IKey was properly encoded. If not, then turn off telemetry
+        if ($aiKeyFetched -match '^[A-Za-z0-9=]+$') {
+            Write-Host "Using cloud-specific instrumentation key"
+            [System.Environment]::SetEnvironmentVariable("APPLICATIONINSIGHTS_AUTH", $aiKeyFetched, "Process")
+            [System.Environment]::SetEnvironmentVariable("APPLICATIONINSIGHTS_AUTH", $aiKeyFetched, "Machine")
+        }
+        else {
+            # Couldn't fetch the IKey, turn telemetry off
+            Write-Host "Could not get cloud-specific instrumentation key (network error?). Disabling telemetry"
+            [System.Environment]::SetEnvironmentVariable("DISABLE_TELEMETRY", "True", "Process")
+            [System.Environment]::SetEnvironmentVariable("DISABLE_TELEMETRY", "True", "Machine")
+        }
+    }
+
+    $aiKeyDecoded = [System.Text.Encoding]::UTF8.GetString([System.Convert]::FromBase64String($env:APPLICATIONINSIGHTS_AUTH))
+    [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKeyDecoded, "Process")
+    [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKeyDecoded, "Machine")
 
     # run config parser
     ruby /opt/omsagentwindows/scripts/ruby/tomlparser.rb
@@ -324,7 +362,3 @@ Get-WmiObject Win32_process | Where-Object { $_.Name -match 'powershell' } | For
 
 #check if fluentd service is running
 Get-Service fluentdwinaks
-
-
-
-
diff --git a/scripts/onboarding/aks/onboarding-using-azure-policy/azure-policy.json b/scripts/onboarding/aks/onboarding-using-azure-policy/azure-policy.json
new file mode 100644
index 000000000..c68bfed17
--- /dev/null
+++ b/scripts/onboarding/aks/onboarding-using-azure-policy/azure-policy.json
@@ -0,0 +1,113 @@
+{
+  "mode": "Indexed",
+  "policyRule": {
+    "if": {
+      "field": "type",
+      "equals": "Microsoft.ContainerService/managedClusters"
+    },
+    "then": {
+      "effect": "deployIfNotExists",
+      "details": {
+        "type": "Microsoft.ContainerService/managedClusters",
+        "name": "[field('name')]",
+        "roleDefinitionIds": [
+          "/providers/Microsoft.Authorization/roleDefinitions/ed7f3fbd-7b88-4dd4-9017-9adb7ce333f8",
+          "/providers/Microsoft.Authorization/roleDefinitions/92aaf0da-9dab-42b6-94a3-d43ce8d16293"
+        ],
+        "existenceCondition": {
+          "field": "Microsoft.ContainerService/managedClusters/addonProfiles.omsagent.enabled",
+          "equals": "true"
+        },
+        "deployment": {
+          "properties": {
+            "mode": "incremental",
+            "template": {
+              "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
+              "contentVersion": "1.0.0.0",
+              "parameters": {
+                "clusterName": { "type": "string" },
+                "clusterResourceGroupName": { "type": "string" },
+                "clusterLocation": { "type": "string" },
+                "clusterTags": { "type": "object" },
+                "workspaceResourceId": { "type": "string" }
+              },
+              "resources": [
+                {
+                  "type": "Microsoft.Resources/deployments",
+                  "name": "[Concat('aks-monitoring-policy', '-', uniqueString(parameters('clusterName')))]",
+                  "apiVersion": "2019-05-01",
+                  "properties": {
+                    "mode": "Incremental",
+                    "template": {
+                      "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
+                      "contentVersion": "1.0.0.0",
+                      "parameters": {},
+                      "variables": {},
+                      "resources": [
+                        {
+                          "name": "[parameters('clusterName')]",
+                          "type": "Microsoft.ContainerService/managedClusters",
+                          "location": "[parameters('clusterLocation')]",
+                          "tags": "[parameters('clusterTags')]",
+                          "apiVersion": "2018-03-31",
+                          "properties": {
+                            "mode": "Incremental",
+                            "id": "[resourceId(parameters('clusterResourceGroupName'), 'Microsoft.ContainerService/managedClusters', parameters('clusterName'))]",
+                            "addonProfiles": {
+                              "omsagent": {
+                                "enabled": true,
+                                "config": {
+                                  "logAnalyticsWorkspaceResourceID": "[parameters('workspaceResourceId')]"
+                                }
+                              }
+                            }
+                          }
+                        }
+                      ]
+                    }
+                  }
+                }
+              ]
+            },
+            "parameters": {
+              "clusterName": {
+                "value": "[field('name')]"
+              },
+              "clusterResourceGroupName": {
+                "value": "[resourceGroup().name]"
+              },
+              "clusterLocation": {
+                "value": "[field('location')]"
+              },
+              "clusterTags": {
+                "value": "[field('tags')]"
+              },
+              "workspaceResourceId": {
+                "value": "[parameters('workspaceResourceId')]"
+              }
+            }
+          }
+        }
+      }
+    }
+  },
+  "parameters": {
+    "workspaceResourceId": {
+      "type": "String",
+      "metadata": {
+        "displayName": "Resource Id of the existing Azure Log Analytics Workspace",
+        "description": "Azure Monitor Log Analytics Resource ID"
+      }
+    }
+  }
+}
diff --git a/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.parameters.json b/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.parameters.json
new file mode 100644
index 000000000..6281cdade
--- /dev/null
+++ b/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.parameters.json
@@ -0,0 +1,9 @@
+{
+  "workspaceResourceId": {
+    "type": "string",
+    "metadata": {
+      "displayName": "Resource Id of the existing Azure Log Analytics Workspace",
+      "description": "Azure Monitor Log Analytics Resource ID"
+    }
+  }
+}
diff --git a/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.rules.json b/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.rules.json
new file mode 100644
index 000000000..a113441ce
--- /dev/null
+++ b/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.rules.json
@@ -0,0 +1,101 @@
+{
+  "if": {
+    "field": "type",
+    "equals": "Microsoft.ContainerService/managedClusters"
+  },
+  "then": {
+    "effect": "deployIfNotExists",
+    "details": {
+      "type": "Microsoft.ContainerService/managedClusters",
+      "name": "[field('name')]",
+      "roleDefinitionIds": [
+        "/providers/Microsoft.Authorization/roleDefinitions/ed7f3fbd-7b88-4dd4-9017-9adb7ce333f8",
+        "/providers/Microsoft.Authorization/roleDefinitions/92aaf0da-9dab-42b6-94a3-d43ce8d16293"
+      ],
+      "existenceCondition": {
+        "field": "Microsoft.ContainerService/managedClusters/addonProfiles.omsagent.enabled",
+        "equals": "true"
+      },
+      "deployment": {
+        "properties": {
+          "mode": "incremental",
+          "template": {
+            "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
+            "contentVersion": "1.0.0.0",
+            "parameters": {
+              "clusterName": { "type": "string" },
+              "clusterResourceGroupName": { "type": "string" },
+              "clusterLocation": { "type": "string" },
+              "clusterTags": { "type": "object" },
+              "workspaceResourceId": { "type": "string" }
+            },
+            "resources": [
+              {
+                "type": "Microsoft.Resources/deployments",
+                "name": "[Concat('aks-monitoring-policy', '-', uniqueString(parameters('clusterName')))]",
+                "apiVersion": "2019-05-01",
+                "properties": {
+                  "mode": "Incremental",
+                  "template": {
+                    "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
+                    "contentVersion": "1.0.0.0",
+                    "parameters": {},
+                    "variables": {},
+                    "resources": [
+                      {
+                        "name": "[parameters('clusterName')]",
+                        "type": "Microsoft.ContainerService/managedClusters",
+                        "location": "[parameters('clusterLocation')]",
+                        "tags": "[parameters('clusterTags')]",
+                        "apiVersion": "2018-03-31",
+                        "properties": {
+                          "mode": "Incremental",
+                          "id": "[resourceId(parameters('clusterResourceGroupName'), 'Microsoft.ContainerService/managedClusters', parameters('clusterName'))]",
+                          "addonProfiles": {
+                            "omsagent": {
+                              "enabled": true,
+                              "config": {
+                                "logAnalyticsWorkspaceResourceID": "[parameters('workspaceResourceId')]"
+                              }
+                            }
+                          }
+                        }
+                      }
+                    ]
+                  }
+                }
+              }
+            ]
+          },
+          "parameters": {
+            "clusterName": {
+              "value": "[field('name')]"
+            },
+            "clusterResourceGroupName": {
+              "value": "[resourceGroup().name]"
+            },
+            "clusterLocation": {
+              "value": "[field('location')]"
+            },
+            "clusterTags": {
+              "value": "[field('tags')]"
+            },
+            "workspaceResourceId": {
+              "value": "[parameters('workspaceResourceId')]"
+            }
+          }
+        }
+      }
+    }
+  }
+}
"value": "[parameters('workspaceResourceId')]" + } + } + } + } + } + } +} diff --git a/scripts/onboarding/clusteruser/cluster-user-role-binding.yaml b/scripts/onboarding/clusteruser/cluster-user-role-binding.yaml new file mode 100644 index 000000000..fce2fc582 --- /dev/null +++ b/scripts/onboarding/clusteruser/cluster-user-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: containerHealth-read-logs-global +roleRef: + kind: ClusterRole + name: containerHealth-log-reader + apiGroup: rbac.authorization.k8s.io +subjects: + - kind: User + name: clusterUser + apiGroup: rbac.authorization.k8s.io diff --git a/scripts/onboarding/clusteruser/cluster-user-role.yaml b/scripts/onboarding/clusteruser/cluster-user-role.yaml new file mode 100644 index 000000000..b3519fdd3 --- /dev/null +++ b/scripts/onboarding/clusteruser/cluster-user-role.yaml @@ -0,0 +1,14 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: containerHealth-log-reader +rules: + - apiGroups: ["", "metrics.k8s.io", "extensions", "apps"] + resources: + - "pods/log" + - "events" + - "nodes" + - "pods" + - "deployments" + - "replicasets" + verbs: ["get", "list"] diff --git a/scripts/onboarding/enable-monitoring-using-policy.md b/scripts/onboarding/enable-monitoring-using-policy.md new file mode 100644 index 000000000..e1e395ecc --- /dev/null +++ b/scripts/onboarding/enable-monitoring-using-policy.md @@ -0,0 +1,64 @@ +# How to enable AKS Monitoring Addon via Azure Policy +This doc describes how to enable AKS Monitoring Addon using Azure Custom Policy.Monitoring Addon Custom Policy can be assigned +either at subscription or resource group scope. If Azure Log Analytics workspace and AKS cluster are in different subscriptions then Managed Identity used by Policy assignnment has to have required role permissions on both the subscriptions or least on the resource of the Azure Log Aalytics workspace. Similarly, If the policy scoped to Resource Group, then Managed Identity should have required role permissions on the Log Analytics workspace if the workspace not in the selected Resource Group scope. + +Monitoring Addon require following roles on the Managed Identity used by Azure Policy + - [azure-kubernetes-service-contributor-role](https://docs.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#azure-kubernetes-service-contributor-role) + - [log-analytics-contributor](https://docs.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#log-analytics-contributor) + +## Create and Assign Policy definition using Azure Portal + +### Create Policy Definition + +1. Download the Azure Custom Policy definition to enable AKS Monitoring Addon +``` sh + curl -o azurepolicy.json -L https://aka.ms/aks-enable-monitoring-custom-policy +``` +2. 
diff --git a/scripts/onboarding/enable-monitoring-using-policy.md b/scripts/onboarding/enable-monitoring-using-policy.md
new file mode 100644
index 000000000..e1e395ecc
--- /dev/null
+++ b/scripts/onboarding/enable-monitoring-using-policy.md
@@ -0,0 +1,64 @@
+# How to enable AKS Monitoring Addon via Azure Policy
+
+This doc describes how to enable the AKS Monitoring Addon using an Azure custom policy. The Monitoring Addon custom policy can be assigned either at subscription or resource group scope. If the Azure Log Analytics workspace and the AKS cluster are in different subscriptions, then the managed identity used by the policy assignment must have the required role permissions on both subscriptions, or at least on the resource of the Azure Log Analytics workspace. Similarly, if the policy is scoped to a resource group, then the managed identity must have the required role permissions on the Log Analytics workspace if the workspace is not in the selected resource group scope.
+
+The Monitoring Addon requires the following roles on the managed identity used by Azure Policy:
+  - [azure-kubernetes-service-contributor-role](https://docs.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#azure-kubernetes-service-contributor-role)
+  - [log-analytics-contributor](https://docs.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#log-analytics-contributor)
+
+## Create and Assign Policy definition using Azure Portal
+
+### Create Policy Definition
+
+1. Download the Azure custom policy definition to enable the AKS Monitoring Addon
+   ``` sh
+   curl -o azurepolicy.json -L https://aka.ms/aks-enable-monitoring-custom-policy
+   ```
+2. Navigate to https://portal.azure.com/#blade/Microsoft_Azure_Policy/PolicyMenuBlade/Definitions and create a policy definition with the following details in the policy definition create dialog
+   - Pick any Azure subscription where you want to store the policy definition
+   - Name - '(Preview)AKS-Monitoring-Addon'
+   - Description - 'Azure custom policy to enable the Monitoring Addon onto Azure Kubernetes cluster(s) in the specified scope'
+   - Category - choose "use existing" and pick 'Kubernetes' from the drop down
+   - Remove the existing sample rules and copy the contents of the azurepolicy.json downloaded in step #1 above
+
+### Assign Policy Definition to Specified Scope
+
+> Note: the managed identity will be created automatically and assigned the roles specified in the policy definition.
+
+3. Navigate to https://portal.azure.com/#blade/Microsoft_Azure_Policy/PolicyMenuBlade/Definitions and select the policy definition 'AKS Monitoring Addon'
+4. Click Assignments and select the Scope and Exclusions (if any)
+5. Provide the resource id of the Azure Log Analytics workspace. The resource id should be in this format: `/subscriptions/<subscriptionId>/resourceGroups/<resourceGroupName>/providers/Microsoft.OperationalInsights/workspaces/<workspaceName>`
+6. Create a remediation task if you want to apply the policy to existing AKS clusters in the selected scope
+7. Click the Review & Create option to create the policy assignment
+
+## Create and Assign Policy definition using Azure CLI
+
+### Create Policy Definition
+
+1. Download the Azure custom policy definition rules and parameters files
+   ``` sh
+   curl -o azurepolicy.rules.json -L https://aka.ms/aks-enable-monitoring-custom-policy-rules
+   curl -o azurepolicy.parameters.json -L https://aka.ms/aks-enable-monitoring-custom-policy-parameters
+   ```
+2. Create the policy definition using the below command
+   ``` sh
+   az cloud set -n <AzureCloud> # set the Azure cloud
+   az login # login to the cloud environment
+   az account set -s <subscriptionId>
+   az policy definition create --name "(Preview)AKS-Monitoring-Addon" --display-name "(Preview)AKS-Monitoring-Addon" --mode Indexed --metadata version=1.0.0 category=Kubernetes --rules azurepolicy.rules.json --params azurepolicy.parameters.json
+   ```
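Before assigning, it can be worth confirming the definition registered as expected; a sketch (the name must match the one used in the create step):

```sh
# Inspect the custom policy definition created above
az policy definition show --name "(Preview)AKS-Monitoring-Addon" --query "{name: name, mode: mode}"
```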
+### Assign Policy Definition to Specified Scope
+
+3. Create the policy assignment
+
+``` sh
+az policy assignment create --name aks-monitoring-addon --policy "(Preview)AKS-Monitoring-Addon" --assign-identity --identity-scope /subscriptions/<subscriptionId> --role Contributor --scope /subscriptions/<subscriptionId> --location <location> -p "{ \"workspaceResourceId\": { \"value\": \"/subscriptions/<subscriptionId>/resourcegroups/<resourceGroupName>/providers/microsoft.operationalinsights/workspaces/<workspaceName>\" } }"
+```
+
+## References
+- https://docs.microsoft.com/en-us/azure/governance/policy/
+- https://docs.microsoft.com/en-us/azure/governance/policy/how-to/remediate-resources#how-remediation-security-works
+- https://docs.microsoft.com/en-us/cli/azure/install-azure-cli
+- https://docs.microsoft.com/en-us/azure/azure-monitor/insights/container-insights-overview
\ No newline at end of file
diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1
index 5e9cb8a25..d8ab7b345 100644
--- a/scripts/onboarding/managed/enable-monitoring.ps1
+++ b/scripts/onboarding/managed/enable-monitoring.ps1
@@ -50,8 +50,6 @@ param(
     [Parameter(mandatory = $false)]
     [string]$tenantId,
     [Parameter(mandatory = $false)]
-    [string]$kubeContext,
-    [Parameter(mandatory = $false)]
     [string]$azureCloudName
 )
 
@@ -66,7 +64,7 @@ $isUsingServicePrincipal = $false
 
 # released chart version in mcr
 $mcr = "mcr.microsoft.com"
-$mcrChartVersion = "2.8.0"
+$mcrChartVersion = "2.8.1"
 $mcrChartRepoPath = "azuremonitor/containerinsights/preview/azuremonitor-containers"
 $helmLocalRepoName = "."
 $omsAgentDomainName="opinsights.azure.com"
diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh
index 2dc0a465f..9d0c0aca5 100644
--- a/scripts/onboarding/managed/enable-monitoring.sh
+++ b/scripts/onboarding/managed/enable-monitoring.sh
@@ -44,7 +44,7 @@ defaultAzureCloud="AzureCloud"
 omsAgentDomainName="opinsights.azure.com"
 
 # released chart version in mcr
-mcrChartVersion="2.8.0"
+mcrChartVersion="2.8.1"
 mcr="mcr.microsoft.com"
 mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers"
 helmLocalRepoName="."
diff --git a/scripts/onboarding/managed/upgrade-monitoring.sh b/scripts/onboarding/managed/upgrade-monitoring.sh
index 8826b6df6..6d14dfa5f 100644
--- a/scripts/onboarding/managed/upgrade-monitoring.sh
+++ b/scripts/onboarding/managed/upgrade-monitoring.sh
@@ -20,7 +20,7 @@ set -e
 set -o pipefail
 
 # released chart version for Azure Arc enabled Kubernetes public preview
-mcrChartVersion="2.8.0"
+mcrChartVersion="2.8.1"
 mcr="mcr.microsoft.com"
 mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers"
diff --git a/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json b/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json
new file mode 100644
index 000000000..8ebef232a
--- /dev/null
+++ b/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json
@@ -0,0 +1,135 @@
+{
+  "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
+  "contentVersion": "1.0.0.0",
+  "parameters": {
+    "clusterResourceId": {
+      "type": "string",
+      "metadata": {
+        "description": "Resource Id of the Azure Arc Connected Cluster"
+      }
+    },
+    "clusterRegion": {
+      "type": "string",
+      "metadata": {
+        "description": "Location of the Azure Arc Connected Cluster Resource e.g. \"eastus\""
+      }
+    },
+    "proxyEndpointUrl": {
+      "type": "string",
+      "defaultValue": "",
+      "metadata": {
+        "description": "If the cluster is behind a forward proxy, then specify the proxy endpoint URL in this format: http(s)://<user>:<pwd>@<proxyhost>:<port>"
+      }
+    },
+    "workspaceResourceId": {
+      "type": "string",
+      "metadata": {
+        "description": "Azure Monitor Log Analytics Resource ID"
+      }
+    },
+    "workspaceRegion": {
+      "type": "string",
+      "metadata": {
+        "description": "Azure Monitor Log Analytics Workspace region e.g. \"eastus\""
+      }
+    },
+    "workspaceDomain": {
+      "type": "string",
+      "allowedValues": [
+        "opinsights.azure.com",
+        "opinsights.azure.cn",
+        "opinsights.azure.us",
+        "opinsights.azure.eaglex.ic.gov",
+        "opinsights.azure.microsoft.scloud"
+      ],
+      "defaultValue": "opinsights.azure.com",
+      "metadata": {
+        "description": "Azure Monitor Log Analytics Workspace Domain e.g. opinsights.azure.com"
+      }
+    }
+  },
+  "resources": [
+    {
+      "type": "Microsoft.Resources/deployments",
+      "name": "[Concat('ContainerInsights', '-', uniqueString(parameters('workspaceResourceId')))]",
+      "apiVersion": "2017-05-10",
+      "subscriptionId": "[split(parameters('workspaceResourceId'),'/')[2]]",
+      "resourceGroup": "[split(parameters('workspaceResourceId'),'/')[4]]",
+      "properties": {
+        "mode": "Incremental",
+        "template": {
+          "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
+          "contentVersion": "1.0.0.0",
+          "parameters": {},
+          "variables": {},
+          "resources": [
+            {
+              "apiVersion": "2015-11-01-preview",
+              "type": "Microsoft.OperationsManagement/solutions",
+              "location": "[parameters('workspaceRegion')]",
+              "name": "[Concat('ContainerInsights', '(', split(parameters('workspaceResourceId'),'/')[8], ')')]",
+              "properties": {
+                "workspaceResourceId": "[parameters('workspaceResourceId')]"
+              },
+              "plan": {
+                "name": "[Concat('ContainerInsights', '(', split(parameters('workspaceResourceId'),'/')[8], ')')]",
+                "product": "[Concat('OMSGallery/', 'ContainerInsights')]",
+                "promotionCode": "",
+                "publisher": "Microsoft"
+              }
+            }
+          ]
+        },
+        "parameters": {}
+      }
+    },
+    {
+      "type": "Microsoft.Resources/deployments",
+      "name": "[Concat('arc-k8s-ci-extension', '-', uniqueString(parameters('clusterResourceId')))]",
+      "apiVersion": "2019-05-01",
+      "subscriptionId": "[split(parameters('clusterResourceId'),'/')[2]]",
+      "resourceGroup": "[split(parameters('clusterResourceId'),'/')[4]]",
+      "dependsOn": [
+        "[Concat('ContainerInsights', '-', uniqueString(parameters('workspaceResourceId')))]"
+      ],
+      "properties": {
+        "mode": "Incremental",
+        "template": {
+          "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
+          "contentVersion": "1.0.0.0",
+          "parameters": {},
+          "variables": {},
+          "resources": [
+            {
+              "type": "Microsoft.KubernetesConfiguration/extensions",
+              "apiVersion": "2020-07-01-preview",
+              "name": "azuremonitor-containers",
+              "location": "[parameters('clusterRegion')]",
+              "identity": { "type": "systemassigned" },
+              "properties": {
+                "extensionType": "Microsoft.AzureMonitor.Containers",
+                "configurationSettings": {
+                  "logAnalyticsWorkspaceResourceID": "[parameters('workspaceResourceId')]",
+                  "omsagent.domain": "[parameters('workspaceDomain')]"
+                },
+                "configurationProtectedSettings": {
+                  "omsagent.secret.wsid": "[reference(parameters('workspaceResourceId'), '2015-03-20').customerId]",
+                  "omsagent.secret.key": "[listKeys(parameters('workspaceResourceId'), '2015-03-20').primarySharedKey]",
+                  "omsagent.proxy": "[if(equals(parameters('proxyEndpointUrl'), ''), '', parameters('proxyEndpointUrl'))]"
+                },
+                "autoUpgradeMinorVersion": true,
+                "releaseTrain": "Stable",
+                "scope": {
+                  "Cluster": {
+                    "releaseNamespace": "azuremonitor-containers"
+                  }
+                }
+              },
+              "scope": "[concat('Microsoft.Kubernetes/connectedClusters/', split(parameters('clusterResourceId'),'/')[8])]"
+            }
+          ]
+        }
+      }
+    }
+  ]
+}
parameters('proxyEndpointUrl'))]"
+                },
+                "autoUpgradeMinorVersion": true,
+                "releaseTrain": "Stable",
+                "scope": {
+                  "Cluster": {
+                    "releaseNamespace": "azuremonitor-containers"
+                  }
+                }
+              },
+              "scope": "[concat('Microsoft.Kubernetes/connectedClusters/', split(parameters('clusterResourceId'),'/')[8])]"
+            }
+          ]
+        }
+      }
+    }
+  ]
+}
diff --git a/scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json b/scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json
new file mode 100644
index 000000000..b74b5ac95
--- /dev/null
+++ b/scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json
@@ -0,0 +1,24 @@
+{
+  "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#",
+  "contentVersion": "1.0.0.0",
+  "parameters": {
+    "clusterResourceId": {
+      "value": "/subscriptions/<subscriptionId>/resourceGroups/<resourceGroupName>/providers/Microsoft.Kubernetes/connectedClusters/<clusterName>"
+    },
+    "clusterRegion": {
+      "value": ""
+    },
+    "proxyEndpointUrl": {
+      "value": ""
+    },
+    "workspaceResourceId": {
+      "value": "/subscriptions/<subscriptionId>/resourcegroups/<resourceGroupName>/providers/microsoft.operationalinsights/workspaces/<workspaceName>"
+    },
+    "workspaceRegion": {
+      "value": ""
+    },
+    "workspaceDomain": {
+      "value": ""
+    }
+  }
+}
diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go
index 5a678781c..bda757eb8 100644
--- a/source/plugins/go/src/oms.go
+++ b/source/plugins/go/src/oms.go
@@ -32,13 +32,16 @@ import (
 // DataType for Container Log
 const ContainerLogDataType = "CONTAINER_LOG_BLOB"
 
+//DataType for Container Log v2
+const ContainerLogV2DataType = "CONTAINERINSIGHTS_CONTAINERLOGV2"
+
 // DataType for Insights metric
 const InsightsMetricsDataType = "INSIGHTS_METRICS_BLOB"
 
 // DataType for KubeMonAgentEvent
 const KubeMonAgentEventDataType = "KUBE_MON_AGENT_EVENTS_BLOB"
 
-//env varibale which has ResourceId for LA
+//env variable which has ResourceId for LA
 const ResourceIdEnv = "AKS_RESOURCE_ID"
 
 //env variable which has ResourceName for NON-AKS
@@ -78,20 +81,26 @@ const DaemonSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimpr
 const ReplicaSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf"
 const WindowsContainerLogPluginConfFilePath = "/etc/omsagentwindows/out_oms.conf"
 
-// IPName for Container Log
-const IPName = "Containers"
+// IPName
+const IPName = "ContainerInsights"
+
+
 const defaultContainerInventoryRefreshInterval = 60
 
 const kubeMonAgentConfigEventFlushInterval = 60
 
 //Eventsource name in mdsd
-const MdsdSourceName = "ContainerLogSource"
+const MdsdContainerLogSourceName = "ContainerLogSource"
+const MdsdContainerLogV2SourceName = "ContainerLogV2Source"
 
-//container logs route - v2 (v2=flush to oneagent, adx= flush to adx ingestion, anything else flush to ODS[default])
+//container logs route (v2=flush to oneagent, adx= flush to adx ingestion, anything else flush to ODS[default])
 const ContainerLogsV2Route = "v2"
 
 const ContainerLogsADXRoute = "adx"
 
+//container logs schema (v2=ContainerLogsV2 table in LA, anything else ContainerLogs table in LA. 
This is applicable only if Container logs route is NOT ADX) +const ContainerLogV2SchemaVersion = "v2" + var ( // PluginConfiguration the plugins configuration PluginConfiguration map[string]string @@ -125,6 +134,8 @@ var ( ContainerLogsRouteV2 bool // container log route for routing thru ADX ContainerLogsRouteADX bool + // container log schema (applicable only for non-ADX route) + ContainerLogSchemaV2 bool //ADX Cluster URI AdxClusterUri string // ADX clientID @@ -180,8 +191,8 @@ var ( userAgent = "" ) -// DataItem represents the object corresponding to the json that is sent by fluentbit tail plugin -type DataItem struct { +// DataItemLAv1 == ContainerLog table in LA +type DataItemLAv1 struct { LogEntry string `json:"LogEntry"` LogEntrySource string `json:"LogEntrySource"` LogEntryTimeStamp string `json:"LogEntryTimeStamp"` @@ -193,10 +204,25 @@ type DataItem struct { Computer string `json:"Computer"` } +// DataItemLAv2 == ContainerLogV2 table in LA +// Please keep the names same as destination column names, to avoid transforming one to another in the pipeline +type DataItemLAv2 struct { + TimeGenerated string `json:"TimeGenerated"` + Computer string `json:"Computer"` + ContainerId string `json:"ContainerId"` + ContainerName string `json:"ContainerName"` + PodName string `json:"PodName"` + PodNamespace string `json:"PodNamespace"` + LogMessage string `json:"LogMessage"` + LogSource string `json:"LogSource"` + //PodLabels string `json:"PodLabels"` +} + +// DataItemADX == ContainerLogV2 table in ADX type DataItemADX struct { TimeGenerated string `json:"TimeGenerated"` Computer string `json:"Computer"` - ContainerID string `json:"ContainerID"` + ContainerId string `json:"ContainerId"` ContainerName string `json:"ContainerName"` PodName string `json:"PodName"` PodNamespace string `json:"PodNamespace"` @@ -227,10 +253,17 @@ type InsightsMetricsBlob struct { } // ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point -type ContainerLogBlob struct { +type ContainerLogBlobLAv1 struct { + DataType string `json:"DataType"` + IPName string `json:"IPName"` + DataItems []DataItemLAv1 `json:"DataItems"` +} + +// ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point +type ContainerLogBlobLAv2 struct { DataType string `json:"DataType"` IPName string `json:"IPName"` - DataItems []DataItem `json:"DataItems"` + DataItems []DataItemLAv2 `json:"DataItems"` } // MsgPackEntry represents the object corresponding to a single messagepack event in the messagepack stream @@ -792,7 +825,8 @@ func UpdateNumTelegrafMetricsSentTelemetry(numMetricsSent int, numSendErrors int // PostDataHelper sends data to the ODS endpoint or oneagent or ADX func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { start := time.Now() - var dataItems []DataItem + var dataItemsLAv1 []DataItemLAv1 + var dataItemsLAv2 []DataItemLAv2 var dataItemsADX []DataItemADX var msgPackEntries []MsgPackEntry @@ -830,26 +864,42 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { } stringMap = make(map[string]string) + //below id & name are used by latency telemetry in both v1 & v2 LA schemas + id := "" + name := "" logEntry := ToString(record["log"]) logEntryTimeStamp := ToString(record["time"]) - stringMap["LogEntry"] = logEntry - stringMap["LogEntrySource"] = logEntrySource - stringMap["LogEntryTimeStamp"] = logEntryTimeStamp - stringMap["SourceSystem"] = "Containers" - stringMap["Id"] = containerID - - if val, ok := 
imageIDMap[containerID]; ok { - stringMap["Image"] = val - } + //ADX Schema & LAv2 schema are almost the same (except resourceId) + if (ContainerLogSchemaV2 == true || ContainerLogsRouteADX == true) { + stringMap["Computer"] = Computer + stringMap["ContainerId"] = containerID + stringMap["ContainerName"] = containerName + stringMap["PodName"] = k8sPodName + stringMap["PodNamespace"] = k8sNamespace + stringMap["LogMessage"] = logEntry + stringMap["LogSource"] = logEntrySource + stringMap["TimeGenerated"] = logEntryTimeStamp + } else { + stringMap["LogEntry"] = logEntry + stringMap["LogEntrySource"] = logEntrySource + stringMap["LogEntryTimeStamp"] = logEntryTimeStamp + stringMap["SourceSystem"] = "Containers" + stringMap["Id"] = containerID + + if val, ok := imageIDMap[containerID]; ok { + stringMap["Image"] = val + } - if val, ok := nameIDMap[containerID]; ok { - stringMap["Name"] = val - } + if val, ok := nameIDMap[containerID]; ok { + stringMap["Name"] = val + } - stringMap["TimeOfCommand"] = start.Format(time.RFC3339) - stringMap["Computer"] = Computer - var dataItem DataItem + stringMap["TimeOfCommand"] = start.Format(time.RFC3339) + stringMap["Computer"] = Computer + } + var dataItemLAv1 DataItemLAv1 + var dataItemLAv2 DataItemLAv2 var dataItemADX DataItemADX var msgPackEntry MsgPackEntry @@ -866,50 +916,71 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { } else if ContainerLogsRouteADX == true { if ResourceCentric == true { stringMap["AzureResourceId"] = ResourceID + } else { + stringMap["AzureResourceId"] = "" } stringMap["PodName"] = k8sPodName stringMap["PodNamespace"] = k8sNamespace stringMap["ContainerName"] = containerName dataItemADX = DataItemADX{ - TimeGenerated: stringMap["LogEntryTimeStamp"], + TimeGenerated: stringMap["TimeGenerated"], Computer: stringMap["Computer"], - ContainerID: stringMap["Id"], + ContainerId: stringMap["ContainerId"], ContainerName: stringMap["ContainerName"], PodName: stringMap["PodName"], PodNamespace: stringMap["PodNamespace"], - LogMessage: stringMap["LogEntry"], - LogSource: stringMap["LogEntrySource"], + LogMessage: stringMap["LogMessage"], + LogSource: stringMap["LogSource"], AzureResourceId: stringMap["AzureResourceId"], } //ADX dataItemsADX = append(dataItemsADX, dataItemADX) } else { - dataItem = DataItem{ - ID: stringMap["Id"], - LogEntry: stringMap["LogEntry"], - LogEntrySource: stringMap["LogEntrySource"], - LogEntryTimeStamp: stringMap["LogEntryTimeStamp"], - LogEntryTimeOfCommand: stringMap["TimeOfCommand"], - SourceSystem: stringMap["SourceSystem"], - Computer: stringMap["Computer"], - Image: stringMap["Image"], - Name: stringMap["Name"], + if (ContainerLogSchemaV2 == true) { + dataItemLAv2 = DataItemLAv2{ + TimeGenerated: stringMap["TimeGenerated"], + Computer: stringMap["Computer"], + ContainerId: stringMap["ContainerId"], + ContainerName: stringMap["ContainerName"], + PodName: stringMap["PodName"], + PodNamespace: stringMap["PodNamespace"], + LogMessage: stringMap["LogMessage"], + LogSource: stringMap["LogSource"], + } + //ODS-v2 schema + dataItemsLAv2 = append(dataItemsLAv2, dataItemLAv2) + name = stringMap["ContainerName"] + id = stringMap["ContainerId"] + } else { + dataItemLAv1 = DataItemLAv1{ + ID: stringMap["Id"], + LogEntry: stringMap["LogEntry"], + LogEntrySource: stringMap["LogEntrySource"], + LogEntryTimeStamp: stringMap["LogEntryTimeStamp"], + LogEntryTimeOfCommand: stringMap["TimeOfCommand"], + SourceSystem: stringMap["SourceSystem"], + Computer: stringMap["Computer"], + Image: 
stringMap["Image"], + Name: stringMap["Name"], + } + //ODS-v1 schema + dataItemsLAv1 = append(dataItemsLAv1, dataItemLAv1) + name = stringMap["Name"] + id = stringMap["Id"] } - //ODS - dataItems = append(dataItems, dataItem) } - if stringMap["LogEntryTimeStamp"] != "" { - loggedTime, e := time.Parse(time.RFC3339, stringMap["LogEntryTimeStamp"]) + if logEntryTimeStamp != "" { + loggedTime, e := time.Parse(time.RFC3339, logEntryTimeStamp) if e != nil { - message := fmt.Sprintf("Error while converting LogEntryTimeStamp for telemetry purposes: %s", e.Error()) + message := fmt.Sprintf("Error while converting logEntryTimeStamp for telemetry purposes: %s", e.Error()) Log(message) SendException(message) } else { ltncy := float64(start.Sub(loggedTime) / time.Millisecond) if ltncy >= maxLatency { maxLatency = ltncy - maxLatencyContainer = dataItem.Name + "=" + dataItem.ID + maxLatencyContainer = name + "=" + id } } } @@ -919,8 +990,12 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { if len(msgPackEntries) > 0 && ContainerLogsRouteV2 == true { //flush to mdsd + mdsdSourceName := MdsdContainerLogSourceName + if (ContainerLogSchemaV2 == true) { + mdsdSourceName = MdsdContainerLogV2SourceName + } fluentForward := MsgPackForward{ - Tag: MdsdSourceName, + Tag: mdsdSourceName, Entries: msgPackEntries, } @@ -967,7 +1042,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { elapsed = time.Since(start) if er != nil { - Log("Error::mdsd::Failed to write to mdsd %d records after %s. Will retry ... error : %s", len(dataItems), elapsed, er.Error()) + Log("Error::mdsd::Failed to write to mdsd %d records after %s. Will retry ... error : %s", len(msgPackEntries), elapsed, er.Error()) if MdsdMsgpUnixSocketClient != nil { MdsdMsgpUnixSocketClient.Close() MdsdMsgpUnixSocketClient = nil @@ -1013,14 +1088,14 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { } } - // Setup a maximum time for completion to be 15 Seconds. + // Setup a maximum time for completion to be 30 Seconds. ctx, cancel := context.WithTimeout(ParentContext, 30*time.Second) defer cancel() //ADXFlushMutex.Lock() //defer ADXFlushMutex.Unlock() //MultiJSON support is not there yet - if ingestionErr := ADXIngestor.FromReader(ctx, r, ingest.IngestionMappingRef("ContainerLogv2Mapping", ingest.JSON), ingest.FileFormat(ingest.JSON)); ingestionErr != nil { + if ingestionErr := ADXIngestor.FromReader(ctx, r, ingest.IngestionMappingRef("ContainerLogV2Mapping", ingest.JSON), ingest.FileFormat(ingest.JSON)); ingestionErr != nil { Log("Error when streaming to ADX Ingestion: %s", ingestionErr.Error()) //ADXIngestor = nil //not required as per ADX team. 
Will keep it to indicate that we tried this approach @@ -1035,58 +1110,75 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { numContainerLogRecords = len(dataItemsADX) Log("Success::ADX::Successfully wrote %d container log records to ADX in %s", numContainerLogRecords, elapsed) - } else { - //flush to ODS - if len(dataItems) > 0 { - logEntry := ContainerLogBlob{ - DataType: ContainerLogDataType, + } else { //ODS + var logEntry interface{} + recordType := "" + loglinesCount := 0 + //schema v2 + if (len(dataItemsLAv2) > 0 && ContainerLogSchemaV2 == true) { + logEntry = ContainerLogBlobLAv2{ + DataType: ContainerLogV2DataType, IPName: IPName, - DataItems: dataItems} - - marshalled, err := json.Marshal(logEntry) - if err != nil { - message := fmt.Sprintf("Error while Marshalling log Entry: %s", err.Error()) - Log(message) - SendException(message) - return output.FLB_OK + DataItems: dataItemsLAv2} + loglinesCount = len(dataItemsLAv2) + recordType = "ContainerLogV2" + } else { + //schema v1 + if len(dataItemsLAv1) > 0 { + logEntry = ContainerLogBlobLAv1{ + DataType: ContainerLogDataType, + IPName: IPName, + DataItems: dataItemsLAv1} + loglinesCount = len(dataItemsLAv1) + recordType = "ContainerLog" } + } - req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(marshalled)) - req.Header.Set("Content-Type", "application/json") - req.Header.Set("User-Agent", userAgent) - reqId := uuid.New().String() - req.Header.Set("X-Request-ID", reqId) - //expensive to do string len for every request, so use a flag - if ResourceCentric == true { - req.Header.Set("x-ms-AzureResourceId", ResourceID) - } + marshalled, err := json.Marshal(logEntry) + //Log("LogEntry::e %s", marshalled) + if err != nil { + message := fmt.Sprintf("Error while Marshalling log Entry: %s", err.Error()) + Log(message) + SendException(message) + return output.FLB_OK + } - resp, err := HTTPClient.Do(req) - elapsed = time.Since(start) + req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(marshalled)) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("User-Agent", userAgent) + reqId := uuid.New().String() + req.Header.Set("X-Request-ID", reqId) + //expensive to do string len for every request, so use a flag + if ResourceCentric == true { + req.Header.Set("x-ms-AzureResourceId", ResourceID) + } + + resp, err := HTTPClient.Do(req) + elapsed = time.Since(start) - if err != nil { - message := fmt.Sprintf("Error when sending request %s \n", err.Error()) - Log(message) - // Commenting this out for now. TODO - Add better telemetry for ods errors using aggregation - //SendException(message) - Log("Failed to flush %d records after %s", len(dataItems), elapsed) + if err != nil { + message := fmt.Sprintf("Error when sending request %s \n", err.Error()) + Log(message) + // Commenting this out for now. 
TODO - Add better telemetry for ods errors using aggregation + //SendException(message) + + Log("Failed to flush %d records after %s", loglinesCount, elapsed) - return output.FLB_RETRY - } + return output.FLB_RETRY + } - if resp == nil || resp.StatusCode != 200 { - if resp != nil { - Log("RequestId %s Status %s Status Code %d", reqId, resp.Status, resp.StatusCode) - } - return output.FLB_RETRY + if resp == nil || resp.StatusCode != 200 { + if resp != nil { + Log("RequestId %s Status %s Status Code %d", reqId, resp.Status, resp.StatusCode) } + return output.FLB_RETRY + } - defer resp.Body.Close() - numContainerLogRecords = len(dataItems) - Log("PostDataHelper::Info::Successfully flushed %d container log records to ODS in %s", numContainerLogRecords, elapsed) + defer resp.Body.Close() + numContainerLogRecords = loglinesCount + Log("PostDataHelper::Info::Successfully flushed %d %s records to ODS in %s", numContainerLogRecords, recordType, elapsed) } - } ContainerLogTelemetryMutex.Lock() defer ContainerLogTelemetryMutex.Unlock() @@ -1374,10 +1466,22 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { CreateADXClient() } + ContainerLogSchemaVersion := strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOG_SCHEMA_VERSION"))) + Log("AZMON_CONTAINER_LOG_SCHEMA_VERSION:%s", ContainerLogSchemaVersion) + + ContainerLogSchemaV2 = false //default is v1 schema + + if strings.Compare(ContainerLogSchemaVersion, ContainerLogV2SchemaVersion) == 0 && ContainerLogsRouteADX != true { + ContainerLogSchemaV2 = true + Log("Container logs schema=%s", ContainerLogV2SchemaVersion) + fmt.Fprintf(os.Stdout, "Container logs schema=%s... \n", ContainerLogV2SchemaVersion) + } + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { populateExcludedStdoutNamespaces() populateExcludedStderrNamespaces() - if enrichContainerLogs == true && ContainerLogsRouteADX != true { + //enrichment not applicable for ADX and v2 schema + if enrichContainerLogs == true && ContainerLogsRouteADX != true && ContainerLogSchemaV2 != true { Log("ContainerLogEnrichment=true; starting goroutine to update containerimagenamemaps \n") go updateContainerImageNameMaps() } else { diff --git a/source/plugins/go/src/utils.go b/source/plugins/go/src/utils.go index 91791ae1a..61d047e52 100644 --- a/source/plugins/go/src/utils.go +++ b/source/plugins/go/src/utils.go @@ -145,7 +145,7 @@ func CreateADXClient() { //log.Fatalf("Unable to create ADX connection %s", err.Error()) } else { Log("Successfully created ADX Client. 
Creating Ingestor...") - ingestor, ingestorErr := ingest.New(client, "containerinsights", "ContainerLogv2") + ingestor, ingestorErr := ingest.New(client, "containerinsights", "ContainerLogV2") if ingestorErr != nil { Log("Error::mdsd::Unable to create ADX ingestor %s", ingestorErr.Error()) } else { diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 67bd61667..d3a96d37d 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -25,6 +25,7 @@ class CAdvisorMetricsAPIClient @clusterLogTailPath = ENV["AZMON_LOG_TAIL_PATH"] @clusterAgentSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] @clusterContainerLogEnrich = ENV["AZMON_CLUSTER_CONTAINER_LOG_ENRICH"] + @clusterContainerLogSchemaVersion = ENV["AZMON_CONTAINER_LOG_SCHEMA_VERSION"] @dsPromInterval = ENV["TELEMETRY_DS_PROM_INTERVAL"] @dsPromFieldPassCount = ENV["TELEMETRY_DS_PROM_FIELDPASS_LENGTH"] @@ -65,6 +66,7 @@ class CAdvisorMetricsAPIClient #cadvisor ports @@CADVISOR_SECURE_PORT = "10250" @@CADVISOR_NON_SECURE_PORT = "10255" + def initialize end @@ -85,40 +87,40 @@ def getPodsFromCAdvisor(winNode: nil) end def getBaseCAdvisorUri(winNode) - cAdvisorSecurePort = isCAdvisorOnSecurePort() + cAdvisorSecurePort = isCAdvisorOnSecurePort() + + if !!cAdvisorSecurePort == true + defaultHost = "https://localhost:#{@@CADVISOR_SECURE_PORT}" + else + defaultHost = "http://localhost:#{@@CADVISOR_NON_SECURE_PORT}" + end + + if !winNode.nil? + nodeIP = winNode["InternalIP"] + else + nodeIP = ENV["NODE_IP"] + end + if !nodeIP.nil? + @Log.info("Using #{nodeIP} for CAdvisor Host") if !!cAdvisorSecurePort == true - defaultHost = "https://localhost:#{@@CADVISOR_SECURE_PORT}" + return "https://#{nodeIP}:#{@@CADVISOR_SECURE_PORT}" else - defaultHost = "http://localhost:#{@@CADVISOR_NON_SECURE_PORT}" + return "http://#{nodeIP}:#{@@CADVISOR_NON_SECURE_PORT}" end - + else + @Log.warn ("NODE_IP environment variable not set. Using default as : #{defaultHost}") if !winNode.nil? - nodeIP = winNode["InternalIP"] + return nil else - nodeIP = ENV["NODE_IP"] - end - - if !nodeIP.nil? - @Log.info("Using #{nodeIP} for CAdvisor Host") - if !!cAdvisorSecurePort == true - return "https://#{nodeIP}:#{@@CADVISOR_SECURE_PORT}" - else - return "http://#{nodeIP}:#{@@CADVISOR_NON_SECURE_PORT}" - end - else - @Log.warn ("NODE_IP environment variable not set. Using default as : #{defaultHost}") - if !winNode.nil? - return nil - else - return defaultHost - end + return defaultHost end + end end def getCAdvisorUri(winNode, relativeUri) - baseUri = getBaseCAdvisorUri(winNode) - return baseUri + relativeUri + baseUri = getBaseCAdvisorUri(winNode) + return baseUri + relativeUri end def getMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) @@ -247,22 +249,26 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met telemetryProps["dsPromFDC"] = @dsPromFieldDropCount telemetryProps["dsPromUrl"] = @dsPromUrlCount end - #telemetry about containerlogs Routing for daemonset + #telemetry about containerlog Routing for daemonset if File.exist?(Constants::AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2_FILENAME) telemetryProps["containerLogsRoute"] = "v2" elsif (!@containerLogsRoute.nil? && !@containerLogsRoute.empty?) telemetryProps["containerLogsRoute"] = @containerLogsRoute end - #telemetry about health model - if (!@hmEnabled.nil? && !@hmEnabled.empty?) + #telemetry about health model + if (!@hmEnabled.nil? && !@hmEnabled.empty?) 
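+        # descriptive note: @hmEnabled is the health-model feature flag (presumably read from its environment setting when the class loads, like the other @-vars above); it is forwarded verbatim as a telemetry property on the next line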
telemetryProps["hmEnabled"] = @hmEnabled - end - #telemetry for npm integration - if (!@npmIntegrationAdvanced.nil? && !@npmIntegrationAdvanced.empty?) - telemetryProps["int-npm-a"] = "1" - elsif (!@npmIntegrationBasic.nil? && !@npmIntegrationBasic.empty?) - telemetryProps["int-npm-b"] = "1" - end + end + #telemetry for npm integration + if (!@npmIntegrationAdvanced.nil? && !@npmIntegrationAdvanced.empty?) + telemetryProps["int-npm-a"] = "1" + elsif (!@npmIntegrationBasic.nil? && !@npmIntegrationBasic.empty?) + telemetryProps["int-npm-b"] = "1" + end + #telemetry for Container log schema version clusterContainerLogSchemaVersion + if (!@clusterContainerLogSchemaVersion.nil? && !@clusterContainerLogSchemaVersion.empty?) + telemetryProps["containerLogVer"] = @clusterContainerLogSchemaVersion + end ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) end end @@ -303,8 +309,8 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) end if !metricInfo.nil? metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryTotal", "containerGpumemoryTotalBytes", metricTime)) - metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed","containerGpumemoryUsedBytes", metricTime)) - metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle","containerGpuDutyCycle", metricTime)) + metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed", "containerGpumemoryUsedBytes", metricTime)) + metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle", "containerGpuDutyCycle", metricTime)) metricDataItems.concat(getPersistentVolumeMetrics(metricInfo, hostName, "usedBytes", Constants::PV_USED_BYTES, metricTime)) else @@ -327,7 +333,6 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric begin metricInfo = metricJSON metricInfo["pods"].each do |pod| - podNamespace = pod["podRef"]["namespace"] excludeNamespace = false if (podNamespace.downcase == "kube-system") && @pvKubeSystemCollectionMetricsEnabled == "false" @@ -351,11 +356,11 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric metricItem["Computer"] = hostName metricItem["Name"] = metricNameToReturn metricItem["Value"] = volume[metricNameToCollect] - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE - + metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_UID] = podUid metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = podName @@ -365,7 +370,7 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] metricItem["Tags"] = metricTags - + metricItems.push(metricItem) end end @@ -390,7 +395,6 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric return metricItems end - def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCollect, metricNametoReturn, metricPollTime) metricItems = [] clusterId = KubernetesApiClient.getClusterId @@ -410,18 
+414,17 @@ def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCo if (!accelerator[metricNameToCollect].nil?) #empty check is invalid for non-strings containerName = container["name"] metricValue = accelerator[metricNameToCollect] - metricItem = {} metricItem["CollectionTime"] = metricPollTime metricItem["Computer"] = hostName metricItem["Name"] = metricNametoReturn metricItem["Value"] = metricValue - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE - + metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNameSpace @@ -437,9 +440,9 @@ def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCo if (!accelerator["id"].nil? && !accelerator["id"].empty?) metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_ID] = accelerator["id"] end - + metricItem["Tags"] = metricTags - + metricItems.push(metricItem) end end @@ -916,13 +919,13 @@ def getResponse(winNode, relativeUri) uri = URI.parse(cAdvisorUri) if isCAdvisorOnSecurePort() Net::HTTP.start(uri.host, uri.port, - :use_ssl => true, :open_timeout => 20, :read_timeout => 40, - :ca_file => "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt", - :verify_mode => OpenSSL::SSL::VERIFY_NONE) do |http| - cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) - cAdvisorApiRequest["Authorization"] = "Bearer #{bearerToken}" - response = http.request(cAdvisorApiRequest) - @Log.info "Got response code #{response.code} from #{uri.request_uri}" + :use_ssl => true, :open_timeout => 20, :read_timeout => 40, + :ca_file => "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt", + :verify_mode => OpenSSL::SSL::VERIFY_NONE) do |http| + cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) + cAdvisorApiRequest["Authorization"] = "Bearer #{bearerToken}" + response = http.request(cAdvisorApiRequest) + @Log.info "Got response code #{response.code} from #{uri.request_uri}" end else Net::HTTP.start(uri.host, uri.port, :use_ssl => false, :open_timeout => 20, :read_timeout => 40) do |http| @@ -935,19 +938,24 @@ def getResponse(winNode, relativeUri) rescue => error @Log.warn("CAdvisor api request for #{cAdvisorUri} failed: #{error}") telemetryProps = {} - telemetryProps["Computer"] = winNode["Hostname"] + if !winNode.nil? + hostName = winNode["Hostname"] + else + hostName = (OMS::Common.get_hostname) + end + telemetryProps["Computer"] = hostName ApplicationInsightsUtility.sendExceptionTelemetry(error, telemetryProps) end return response end def isCAdvisorOnSecurePort - cAdvisorSecurePort = false - # Check to see whether omsagent needs to use 10255(insecure) port or 10250(secure) port - if !@cAdvisorMetricsSecurePort.nil? && @cAdvisorMetricsSecurePort == "true" - cAdvisorSecurePort = true - end - return cAdvisorSecurePort + cAdvisorSecurePort = false + # Check to see whether omsagent needs to use 10255(insecure) port or 10250(secure) port + if !@cAdvisorMetricsSecurePort.nil? 
&& @cAdvisorMetricsSecurePort == "true" + cAdvisorSecurePort = true + end + return cAdvisorSecurePort end end end diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index aca2142a0..c5a363741 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -791,7 +791,7 @@ def getKubeAPIServerUrl def getKubeServicesInventoryRecords(serviceList, batchTime = Time.utc.iso8601) kubeServiceRecords = [] begin - if (!serviceList.nil? && !serviceList.empty?) + if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].nil? && !serviceList["items"].empty? ) servicesCount = serviceList["items"].length @Log.info("KubernetesApiClient::getKubeServicesInventoryRecords : number of services in serviceList #{servicesCount} @ #{Time.now.utc.iso8601}") serviceList["items"].each do |item| diff --git a/source/plugins/ruby/arc_k8s_cluster_identity.rb b/source/plugins/ruby/arc_k8s_cluster_identity.rb index ef55c3257..7824f3d4e 100644 --- a/source/plugins/ruby/arc_k8s_cluster_identity.rb +++ b/source/plugins/ruby/arc_k8s_cluster_identity.rb @@ -18,7 +18,7 @@ class ArcK8sClusterIdentity @@crd_resource_uri_template = "%{kube_api_server_url}/apis/%{cluster_config_crd_api_version}/namespaces/%{cluster_identity_resource_namespace}/azureclusteridentityrequests/%{cluster_identity_resource_name}" @@secret_resource_uri_template = "%{kube_api_server_url}/api/v1/namespaces/%{cluster_identity_token_secret_namespace}/secrets/%{token_secret_name}" @@azure_monitor_custom_metrics_audience = "https://monitoring.azure.com/" - @@cluster_identity_request_kind = "AzureClusterIdentityRequest" + @@cluster_identity_request_kind = "AzureClusterIdentityRequest" def initialize @LogPath = "/var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log" @@ -33,7 +33,9 @@ def initialize @log.warn "got api server url nil from KubernetesApiClient.getKubeAPIServerUrl @ #{Time.now.utc.iso8601}" end @http_client = get_http_client - @service_account_token = get_service_account_token + @service_account_token = get_service_account_token + @extensionName = ENV["ARC_K8S_EXTENSION_NAME"] + @log.info "extension name:#{@extensionName} @ #{Time.now.utc.iso8601}" @log.info "initialize complete @ #{Time.now.utc.iso8601}" end @@ -148,7 +150,7 @@ def renew_near_expiry_token() update_response = @http_client.request(update_request) @log.info "Got response of #{update_response.code} for PATCH #{crd_request_uri} @ #{Time.now.utc.iso8601}" if update_response.code.to_i == 404 - @log.info "since crd resource doesnt exist since creating crd resource : #{@@cluster_identity_resource_name} @ #{Time.now.utc.iso8601}" + @log.info "since crd resource doesnt exist hence creating crd resource : #{@@cluster_identity_resource_name} @ #{Time.now.utc.iso8601}" create_request = Net::HTTP::Post.new(crd_request_uri) create_request["Content-Type"] = "application/json" create_request["Authorization"] = "Bearer #{@service_account_token}" @@ -211,6 +213,9 @@ def get_crd_request_body body["metadata"]["namespace"] = @@cluster_identity_resource_namespace body["spec"] = {} body["spec"]["audience"] = @@azure_monitor_custom_metrics_audience + if !@extensionName.nil? && !@extensionName.empty? 
+ body["spec"]["resourceId"] = @extensionName + end return body end end diff --git a/source/plugins/ruby/in_kube_events.rb b/source/plugins/ruby/in_kube_events.rb index 4f6017cc5..f50019a01 100644 --- a/source/plugins/ruby/in_kube_events.rb +++ b/source/plugins/ruby/in_kube_events.rb @@ -129,6 +129,7 @@ def enumerate def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = currentTime.to_f + @@istestvar = ENV["ISTEST"] begin eventStream = MultiEventStream.new events["items"].each do |items| @@ -171,6 +172,9 @@ def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTim @eventsCount += 1 end router.emit_stream(@tag, eventStream) if eventStream + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeEventsInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end rescue => errorStr $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index e7c5060a5..c803c0fa2 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -188,6 +188,9 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream containerNodeInventoryEventStream = MultiEventStream.new + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("containerNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end # node metrics records @@ -217,6 +220,9 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = MultiEventStream.new + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeNodePerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end # node GPU metrics record @@ -249,6 +255,9 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream insightsMetricsEventStream = MultiEventStream.new + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end # Adding telemetry to send node telemetry every 10 minutes timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs @@ -300,23 +309,35 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) router.emit_stream(@tag, eventStream) if eventStream $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream + if (!@@istestvar.nil? 
&& !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end eventStream = nil end if containerNodeInventoryEventStream.count > 0 $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{containerNodeInventoryEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream containerNodeInventoryEventStream = nil + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("containerNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end if kubePerfEventStream.count > 0 $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = nil + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeNodePerfInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end if insightsMetricsEventStream.count > 0 $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream insightsMetricsEventStream = nil + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end rescue => errorStr $log.warn "Failed to retrieve node inventory: #{errorStr}" @@ -447,7 +468,7 @@ def getNodeTelemetryProps(item) properties["Computer"] = item["metadata"]["name"] nodeInfo = item["status"]["nodeInfo"] properties["KubeletVersion"] = nodeInfo["kubeletVersion"] - properties["OperatingSystem"] = nodeInfo["osImage"] + properties["OperatingSystem"] = nodeInfo["operatingSystem"] properties["KernelVersion"] = nodeInfo["kernelVersion"] properties["OSImage"] = nodeInfo["osImage"] containerRuntimeVersion = nodeInfo["containerRuntimeVersion"] diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 0cff2eefe..5256eb159 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -265,6 +265,9 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end kubePerfEventStream = MultiEventStream.new end @@ -306,6 +309,9 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = nil + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end if insightsMetricsEventStream.count > 0 @@ -345,6 +351,9 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc $log.info("in_kube_podinventory::parse_and_emit_records: number of service records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream kubeServicesEventStream = MultiEventStream.new + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeServicesEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end end end @@ -352,6 +361,9 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if kubeServicesEventStream.count > 0 $log.info("in_kube_podinventory::parse_and_emit_records : number of service records emitted #{kubeServicesEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeServicesEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end kubeServicesEventStream = nil end diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 861b3a8e1..4efe86f61 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -106,7 +106,7 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = currentTime.to_f eventStream = MultiEventStream.new - + @@istestvar = ENV["ISTEST"] begin records = [] pvInventory["items"].each do |item| @@ -156,6 +156,9 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) end router.emit_stream(@tag, eventStream) if eventStream + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubePVInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end rescue => errorStr $log.warn "Failed in parse_and_emit_record for in_kube_pvinventory: #{errorStr}" diff --git a/source/plugins/ruby/kubelet_utils.rb b/source/plugins/ruby/kubelet_utils.rb index 599640d8f..e2c731b79 100644 --- a/source/plugins/ruby/kubelet_utils.rb +++ b/source/plugins/ruby/kubelet_utils.rb @@ -21,9 +21,11 @@ def get_node_capacity response = CAdvisorMetricsAPIClient.getAllMetricsCAdvisor(winNode: nil) if !response.nil? && !response.body.nil? 
all_metrics = response.body.split("\n") - cpu_capacity = all_metrics.select{|m| m.start_with?('machine_cpu_cores') && m.split.first.strip == 'machine_cpu_cores' }.first.split.last.to_f * 1000 + #cadvisor machine metrics can exist with (>=1.19) or without dimensions (<1.19) + #so just checking startswith of metric name would be good enough to pick the metric value from exposition format + cpu_capacity = all_metrics.select { |m| m.start_with?("machine_cpu_cores") }.first.split.last.to_f * 1000 @log.info "CPU Capacity #{cpu_capacity}" - memory_capacity_e = all_metrics.select{|m| m.start_with?('machine_memory_bytes') && m.split.first.strip == 'machine_memory_bytes' }.first.split.last + memory_capacity_e = all_metrics.select { |m| m.start_with?("machine_memory_bytes") }.first.split.last memory_capacity = BigDecimal(memory_capacity_e).to_f @log.info "Memory Capacity #{memory_capacity}" return [cpu_capacity, memory_capacity] @@ -87,9 +89,9 @@ def get_all_container_limits @log.info "cpuLimit: #{cpuLimit}" @log.info "memoryLimit: #{memoryLimit}" # Get cpu limit in nanocores - containerCpuLimitHash[key] = !cpuLimit.nil? ? KubernetesApiClient.getMetricNumericValue("cpu", cpuLimit) : 0 + containerCpuLimitHash[key] = !cpuLimit.nil? ? KubernetesApiClient.getMetricNumericValue("cpu", cpuLimit) : nil # Get memory limit in bytes - containerMemoryLimitHash[key] = !memoryLimit.nil? ? KubernetesApiClient.getMetricNumericValue("memory", memoryLimit) : 0 + containerMemoryLimitHash[key] = !memoryLimit.nil? ? KubernetesApiClient.getMetricNumericValue("memory", memoryLimit) : nil end end end diff --git a/source/plugins/ruby/kubernetes_container_inventory.rb b/source/plugins/ruby/kubernetes_container_inventory.rb index 69beca493..82e36c8cc 100644 --- a/source/plugins/ruby/kubernetes_container_inventory.rb +++ b/source/plugins/ruby/kubernetes_container_inventory.rb @@ -50,30 +50,7 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa if !atLocation.nil? containerInventoryRecord["ImageId"] = imageIdValue[(atLocation + 1)..-1] end - end - # image is of the format - repository/image:imagetag - imageValue = containerStatus["image"] - if !imageValue.nil? && !imageValue.empty? - # Find delimiters in the string of format repository/image:imagetag - slashLocation = imageValue.index("/") - colonLocation = imageValue.index(":") - if !colonLocation.nil? - if slashLocation.nil? - # image:imagetag - containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] - else - # repository/image:imagetag - containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] - containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..(colonLocation - 1)] - end - containerInventoryRecord["ImageTag"] = imageValue[(colonLocation + 1)..-1] - end - elsif !imageIdValue.nil? && !imageIdValue.empty? - # Getting repo information from imageIdValue when no tag in ImageId - if !atLocation.nil? - containerInventoryRecord["Repository"] = imageIdValue[0..(atLocation - 1)] - end - end + end containerInventoryRecord["ExitCode"] = 0 isContainerTerminated = false isContainerWaiting = false @@ -107,6 +84,51 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa end containerInfoMap = containersInfoMap[containerName] + # image can be in any one of below format in spec + # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image + imageValue = containerInfoMap["image"] + if !imageValue.nil? && !imageValue.empty? 
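+          # worked example with hypothetical values: a spec image of
+          # "mcr.microsoft.com/azuremonitor/oms:ciprod@sha256:abc123" parses below to
+          # Repository="mcr.microsoft.com", Image="azuremonitor/oms", ImageTag="ciprod",
+          # with ImageId falling back to "sha256:abc123" when status did not populate it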
+          # Find delimiters in image format
+          atLocation = imageValue.index("@")
+          isDigestSpecified = false
+          if !atLocation.nil?
+            # repository/image@digest or repository/image:imagetag@digest, image@digest
+            # use the digest from the spec's image in case the status doesn't get populated, i.e. container in pending or ImagePullBackOff etc.
+            if containerInventoryRecord["ImageId"].nil? || containerInventoryRecord["ImageId"].empty?
+              containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1]
+            end
+            imageValue = imageValue[0..(atLocation - 1)]
+            isDigestSpecified = true
+          end
+          slashLocation = imageValue.index("/")
+          colonLocation = imageValue.index(":")
+          if !colonLocation.nil?
+            if slashLocation.nil?
+              # image:imagetag
+              containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)]
+            else
+              # repository/image:imagetag
+              containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)]
+              containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..(colonLocation - 1)]
+            end
+            containerInventoryRecord["ImageTag"] = imageValue[(colonLocation + 1)..-1]
+          else
+            if slashLocation.nil?
+              # image
+              containerInventoryRecord["Image"] = imageValue
+            else
+              # repo/image
+              containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)]
+              containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..-1]
+            end
+            # if no tag is specified, k8s assumes "latest" as the image tag; this matches the behavior of the docker API and of status.
+            # Ref - https://kubernetes.io/docs/concepts/containers/images/#image-names
+            if isDigestSpecified == false
+              containerInventoryRecord["ImageTag"] = "latest"
+            end
+          end
+        end
+
       podName = containerInfoMap["PodName"]
       namespace = containerInfoMap["Namespace"]
       # containername in the format what docker sees
@@ -165,6 +187,7 @@ def getContainersInfoMap(podItem, isWindows)
       podContainers.each do |container|
         containerInfoMap = {}
         containerName = container["name"]
+        containerInfoMap["image"] = container["image"]
         containerInfoMap["ElementName"] = containerName
         containerInfoMap["Computer"] = nodeName
         containerInfoMap["PodName"] = podName
diff --git a/test/e2e/e2e-tests.yaml b/test/e2e/e2e-tests.yaml
new file mode 100644
index 000000000..06dfa1fb0
--- /dev/null
+++ b/test/e2e/e2e-tests.yaml
@@ -0,0 +1,178 @@
+
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: sonobuoy
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  labels:
+    component: sonobuoy
+  name: sonobuoy-serviceaccount
+  namespace: sonobuoy
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  labels:
+    component: sonobuoy
+  namespace: sonobuoy
+  name: sonobuoy-serviceaccount-sonobuoy
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: sonobuoy-serviceaccount-sonobuoy
+subjects:
+- kind: ServiceAccount
+  name: sonobuoy-serviceaccount
+  namespace: sonobuoy
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  labels:
+    component: sonobuoy
+  namespace: sonobuoy
+  name: sonobuoy-serviceaccount-sonobuoy
+rules:
+- apiGroups:
+  - '*'
+  resources:
+  - '*'
+  verbs:
+  - '*'
+- nonResourceURLs:
+  - '/metrics'
+  - '/logs'
+  - '/logs/*'
+  verbs:
+  - 'get'
+---
+apiVersion: v1
+data:
+  config.json: | 
{"Description":"DEFAULT","UUID":"bf5c02ed-1948-48f1-b12d-5a2d74435e46","Version":"v0.20.0","ResultsDir":"/tmp/sonobuoy","Resources":["apiservices","certificatesigningrequests","clusterrolebindings","clusterroles","componentstatuses","configmaps","controllerrevisions","cronjobs","customresourcedefinitions","daemonsets","deployments","endpoints","ingresses","jobs","leases","limitranges","mutatingwebhookconfigurations","namespaces","networkpolicies","nodes","persistentvolumeclaims","persistentvolumes","poddisruptionbudgets","pods","podlogs","podsecuritypolicies","podtemplates","priorityclasses","replicasets","replicationcontrollers","resourcequotas","rolebindings","roles","servergroups","serverversion","serviceaccounts","services","statefulsets","storageclasses","validatingwebhookconfigurations","volumeattachments"],"Filters":{"Namespaces":".*","LabelSelector":""},"Limits":{"PodLogs":{"Namespaces":"","SonobuoyNamespace":true,"FieldSelectors":[],"LabelSelector":"","Previous":false,"SinceSeconds":null,"SinceTime":null,"Timestamps":false,"TailLines":null,"LimitBytes":null,"LimitSize":"","LimitTime":""}},"QPS":30,"Burst":50,"Server":{"bindaddress":"0.0.0.0","bindport":8080,"advertiseaddress":"","timeoutseconds":10800},"Plugins":null,"PluginSearchPath":["./plugins.d","/etc/sonobuoy/plugins.d","~/sonobuoy/plugins.d"],"Namespace":"sonobuoy","WorkerImage":"sonobuoy/sonobuoy:v0.20.0","ImagePullPolicy":"IfNotPresent","ImagePullSecrets":"","ProgressUpdatesPort":"8099"} +kind: ConfigMap +metadata: + labels: + component: sonobuoy + name: sonobuoy-config-cm + namespace: sonobuoy +--- +apiVersion: v1 +data: + plugin-0.yaml: | + podSpec: + containers: [] + restartPolicy: Never + serviceAccountName: sonobuoy-serviceaccount + nodeSelector: + kubernetes.io/os: linux + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/master + operator: Exists + - key: CriticalAddonsOnly + operator: Exists + - key: kubernetes.io/e2e-evict-taint-key + operator: Exists + sonobuoy-config: + driver: Job + plugin-name: agenttests + result-format: junit + spec: + env: + # Update values of CLIENT_ID, CLIENT_SECRET of the service principal which has permission to query LA ad Metrics API + # Update value of TENANT_ID corresponding your Azure Service principal + - name: CLIENT_ID + value: "SP_CLIENT_ID_VALUE" + - name: CLIENT_SECRET + value: "CLIENT_SECRET_VALUE" + - name: TENANT_ID + value: "SP_TENANT_ID_VALUE" + - name: DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES + value: "10" + - name: DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES + value: "10" + - name: AGENT_POD_EXPECTED_RESTART_COUNT + value: "0" + - name: AZURE_CLOUD + value: "AZURE_PUBLIC_CLOUD" + # image tag should be updated if new tests being added after this image + image: mcr.microsoft.com/azuremonitor/containerinsights/cidev:ciagenttest02152021 + imagePullPolicy: IfNotPresent + name: plugin + resources: {} + volumeMounts: + - mountPath: /tmp/results + name: results +kind: ConfigMap +metadata: + labels: + component: sonobuoy + name: sonobuoy-plugins-cm + namespace: sonobuoy +--- +apiVersion: v1 +kind: Pod +metadata: + labels: + component: sonobuoy + run: sonobuoy-master + sonobuoy-component: aggregator + tier: analysis + name: sonobuoy + namespace: sonobuoy +spec: + containers: + - env: + - name: SONOBUOY_ADVERTISE_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: sonobuoy/sonobuoy:v0.20.0 + imagePullPolicy: IfNotPresent + name: kube-sonobuoy + volumeMounts: + - mountPath: /etc/sonobuoy + name: sonobuoy-config-volume + - mountPath: /plugins.d + name: 
sonobuoy-plugins-volume
+    - mountPath: /tmp/sonobuoy
+      name: output-volume
+  restartPolicy: Never
+  serviceAccountName: sonobuoy-serviceaccount
+  nodeSelector:
+    kubernetes.io/os: linux
+  tolerations:
+  - key: "kubernetes.io/e2e-evict-taint-key"
+    operator: "Exists"
+  volumes:
+  - configMap:
+      name: sonobuoy-config-cm
+    name: sonobuoy-config-volume
+  - configMap:
+      name: sonobuoy-plugins-cm
+    name: sonobuoy-plugins-volume
+  - emptyDir: {}
+    name: output-volume
+---
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    component: sonobuoy
+    sonobuoy-component: aggregator
+  name: sonobuoy-aggregator
+  namespace: sonobuoy
+spec:
+  ports:
+  - port: 8080
+    protocol: TCP
+    targetPort: 8080
+  selector:
+    sonobuoy-component: aggregator
+  type: ClusterIP
+
diff --git a/test/e2e/src/common/arm_rest_utility.py b/test/e2e/src/common/arm_rest_utility.py
new file mode 100644
index 000000000..604f8b791
--- /dev/null
+++ b/test/e2e/src/common/arm_rest_utility.py
@@ -0,0 +1,25 @@
+import adal
+import pytest
+
+from msrestazure.azure_active_directory import AADTokenCredentials
+
+
+# Function to fetch aad token from spn id and password
+def fetch_aad_token(client_id, client_secret, authority_uri, resource_uri):
+    """
+    Authenticate using service principal w/ key.
+    """
+    try:
+        context = adal.AuthenticationContext(authority_uri, api_version=None)
+        return context.acquire_token_with_client_credentials(resource_uri, client_id, client_secret)
+    except Exception as e:
+        pytest.fail("Error occurred while fetching aad token: " + str(e))
+
+
+# Function that returns aad token credentials for a given spn
+def fetch_aad_token_credentials(client_id, client_secret, authority_uri, resource_uri):
+    mgmt_token = fetch_aad_token(client_id, client_secret, authority_uri, resource_uri)
+    try:
+        return AADTokenCredentials(mgmt_token, client_id)
+    except Exception as e:
+        pytest.fail("Error occurred while fetching credentials: " + str(e))
diff --git a/test/e2e/src/common/constants.py b/test/e2e/src/common/constants.py
new file mode 100644
index 000000000..770964cb5
--- /dev/null
+++ b/test/e2e/src/common/constants.py
@@ -0,0 +1,119 @@
+AZURE_PUBLIC_CLOUD_ENDPOINTS = {
+    "activeDirectory": "https://login.microsoftonline.com/",
+    "activeDirectoryDataLakeResourceId": "https://datalake.azure.net/",
+    "activeDirectoryGraphResourceId": "https://graph.windows.net/",
+    "activeDirectoryResourceId": "https://management.core.windows.net/",
+    "appInsights": "https://api.applicationinsights.io",
+    "appInsightsTelemetryChannel": "https://dc.applicationinsights.azure.com/v2/track",
+    "batchResourceId": "https://batch.core.windows.net/",
+    "gallery": "https://gallery.azure.com/",
+    "logAnalytics": "https://api.loganalytics.io",
+    "management": "https://management.core.windows.net/",
+    "mediaResourceId": "https://rest.media.azure.net",
+    "microsoftGraphResourceId": "https://graph.microsoft.com/",
+    "ossrdbmsResourceId": "https://ossrdbms-aad.database.windows.net",
+    "resourceManager": "https://management.azure.com/",
+    "sqlManagement": "https://management.core.windows.net:8443/",
+    "vmImageAliasDoc": "https://raw.githubusercontent.com/Azure/azure-rest-api-specs/master/arm-compute/quickstart-templates/aliases.json"
+}
+
+AZURE_DOGFOOD_ENDPOINTS = {
+    "activeDirectory": "https://login.windows-ppe.net/",
+    "activeDirectoryDataLakeResourceId": None,
+    "activeDirectoryGraphResourceId": "https://graph.ppe.windows.net/",
+    "activeDirectoryResourceId": "https://management.core.windows.net/",
+    "appInsights": None,
+    "appInsightsTelemetryChannel": None, 
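+    # entries left as None have no dogfood equivalent wired up here; tests that
+    # need those endpoints are presumably expected to target the
+    # AZURE_PUBLIC_CLOUD mapping instead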
+ "batchResourceId": None, + "gallery": "https://df.gallery.azure-test.net/", + "logAnalytics": None, + "management": "https://management-preview.core.windows-int.net/", + "mediaResourceId": None, + "microsoftGraphResourceId": None, + "ossrdbmsResourceId": None, + "resourceManager": "https://api-dogfood.resources.windows-int.net/", + "sqlManagement": None, + "vmImageAliasDoc": None +} + +AZURE_CLOUD_DICT = {"AZURE_PUBLIC_CLOUD" : AZURE_PUBLIC_CLOUD_ENDPOINTS, "AZURE_DOGFOOD": AZURE_DOGFOOD_ENDPOINTS} + +TIMEOUT = 300 + +# Azure Monitor for Container Extension related +AGENT_RESOURCES_NAMESPACE = 'kube-system' +AGENT_DEPLOYMENT_NAME = 'omsagent-rs' +AGENT_DAEMONSET_NAME = 'omsagent' +AGENT_WIN_DAEMONSET_NAME = 'omsagent-win' + +AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR = 'rsName=omsagent-rs' +AGENT_DAEMON_SET_PODS_LABEL_SELECTOR = 'component=oms-agent' +AGENT_OMSAGENT_LOG_PATH = '/var/opt/microsoft/omsagent/log/omsagent.log' +AGENT_REPLICASET_WORKFLOWS = ["kubePodInventoryEmitStreamSuccess", "kubeNodeInventoryEmitStreamSuccess"] + +# override this through setting enviornment variable if the expected restart count is > 0 for example applying configmap +AGENT_POD_EXPECTED_RESTART_COUNT = 0 + +# replicaset workflow streams +KUBE_POD_INVENTORY_EMIT_STREAM = "kubePodInventoryEmitStreamSuccess" +KUBE_NODE_INVENTORY_EMIT_STREAM = "kubeNodeInventoryEmitStreamSuccess" +KUBE_DEPLOYMENT_INVENTORY_EMIT_STREAM = "kubestatedeploymentsInsightsMetricsEmitStreamSuccess" +KUBE_CONTAINER_PERF_EMIT_STREAM = "kubeContainerPerfEventEmitStreamSuccess" +KUBE_SERVICES_EMIT_STREAM = "kubeServicesEventEmitStreamSuccess" +KUBE_CONTAINER_NODE_INVENTORY_EMIT_STREAM = "containerNodeInventoryEmitStreamSuccess" +KUBE_EVENTS_EMIT_STREAM = "kubeEventsInventoryEmitStreamSuccess" +# daemonset workflow streams +CONTAINER_PERF_EMIT_STREAM = "cAdvisorPerfEmitStreamSuccess" +CONTAINER_INVENTORY_EMIT_STREAM = "containerInventoryEmitStreamSuccess" + +# simple log analytics queries to validate for e2e workflows +DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES = 10 +KUBE_POD_INVENTORY_QUERY = "KubePodInventory | where TimeGenerated > ago({0}) | count" +KUBE_NODE_INVENTORY_QUERY = "KubeNodeInventory | where TimeGenerated > ago({0}) | count" +KUBE_SERVICES_QUERY = "KubeServices | where TimeGenerated > ago({0}) | count" +KUBE_EVENTS_QUERY = "KubeEvents | where TimeGenerated > ago({0}) | count" +CONTAINER_NODE_INVENTORY_QUERY = "ContainerNodeInventory | where TimeGenerated > ago({0}) | count" +CONTAINER_INVENTORY_QUERY = "ContainerInventory | where TimeGenerated > ago({0}) | count" +# node perf +NODE_PERF_CPU_CAPCITY_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'cpuCapacityNanoCores' | where TimeGenerated > ago({0}) | count" +NODE_PERF_MEMORY_CAPCITY_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'memoryCapacityBytes' | where TimeGenerated > ago({0}) | count" +NODE_PERF_CPU_ALLOCATABLE_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'cpuAllocatableNanoCores' | where TimeGenerated > ago({0}) | count" +NODE_PERF_MEMORY_ALLOCATABLE_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'memoryAllocatableBytes' | where TimeGenerated > ago({0}) | count" +NODE_PERF_CPU_USAGE_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'cpuUsageNanoCores' | where TimeGenerated > ago({0}) | count" +NODE_PERF_MEMORY_RSS_USAGE_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'memoryRssBytes' | where TimeGenerated > ago({0}) | count" 
+NODE_PERF_MEMORY_WS_USAGE_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'memoryWorkingSetBytes' | where TimeGenerated > ago({0}) | count"
+NODE_PERF_RESTART_TIME_EPOCH_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'restartTimeEpoch' | where TimeGenerated > ago({0}) | count"
+# container perf
+CONTAINER_PERF_CPU_LIMITS_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'cpuLimitNanoCores' | where TimeGenerated > ago({0}) | count"
+CONTAINER_PERF_MEMORY_LIMITS_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'memoryLimitBytes' | where TimeGenerated > ago({0}) | count"
+CONTAINER_PERF_CPU_REQUESTS_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'cpuRequestNanoCores' | where TimeGenerated > ago({0}) | count"
+CONTAINER_PERF_MEMORY_REQUESTS_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'memoryRequestBytes' | where TimeGenerated > ago({0}) | count"
+CONTAINER_PERF_CPU_USAGE_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'cpuUsageNanoCores' | where TimeGenerated > ago({0}) | count"
+CONTAINER_PERF_MEMORY_RSS_USAGE_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'memoryRssBytes' | where TimeGenerated > ago({0}) | count"
+CONTAINER_PERF_MEMORY_WS_USAGE_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'memoryWorkingSetBytes' | where TimeGenerated > ago({0}) | count"
+CONTAINER_PERF_RESTART_TIME_EPOCH_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'restartTimeEpoch' | where TimeGenerated > ago({0}) | count"
+# container log
+CONTAINER_LOG_QUERY = "ContainerLog | where TimeGenerated > ago({0}) | count"
+# insights metrics
+INSIGHTS_METRICS_QUERY = "InsightsMetrics | where TimeGenerated > ago({0}) | count"
+
+# custom metrics
+METRICS_API_VERSION = '2019-07-01'
+DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES = 10
+
+# node metrics
+NODE_METRICS_NAMESPACE = 'insights.container/nodes'
+NODE_METRIC_METRIC_AGGREGATION = 'average'
+NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME = 'cpuUsageMilliCores'
+NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME = 'cpuUsagePercentage'
+NODE_MEMORY_RSS_METRIC_NAME = 'memoryRssBytes'
+NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME = 'memoryRssPercentage'
+NODE_MEMORY_WS_METRIC_NAME = 'memoryWorkingSetBytes'
+NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME = 'memoryWorkingSetPercentage'
+NODE_COUNT_METRIC_NAME = 'nodesCount'
+NODE_DISK_USAGE_PERCENTAGE_METRIC_NAME = 'diskUsedPercentage(Preview)'
+
+# pod metrics
+POD_METRICS_NAMESPACE = 'insights.container/pods'
+POD_METRIC_METRIC_AGGREGATION = 'average'
+POD_COUNT_METRIC_NAME = 'PodCount'
diff --git a/test/e2e/src/common/helm_utility.py b/test/e2e/src/common/helm_utility.py
new file mode 100644
index 000000000..6eac1e071
--- /dev/null
+++ b/test/e2e/src/common/helm_utility.py
@@ -0,0 +1,68 @@
+import os
+import pytest
+import subprocess
+
+
+# Function to pull helm charts
+def pull_helm_chart(registry_path):
+    os.environ['HELM_EXPERIMENTAL_OCI'] = '1'
+    cmd_helm_chart_pull = ["helm", "chart", "pull", registry_path]
+    response_helm_chart_pull = subprocess.Popen(cmd_helm_chart_pull, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output_helm_chart_pull, error_helm_chart_pull = response_helm_chart_pull.communicate()
+    if response_helm_chart_pull.returncode != 0:
+        pytest.fail("Unable to pull helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii"))
+    return output_helm_chart_pull.decode("ascii")
+
+
+# Function to export helm charts
+def export_helm_chart(registry_path, destination):
+    cmd_helm_chart_export = ["helm", "chart", "export", registry_path, "--destination", destination]
+    response_helm_chart_export = subprocess.Popen(cmd_helm_chart_export, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output_helm_chart_export, error_helm_chart_export = response_helm_chart_export.communicate()
+    if response_helm_chart_export.returncode != 0:
+        pytest.fail("Unable to export helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii"))
+    return output_helm_chart_export.decode("ascii")
+
+
+# Function to add a helm repository
+def add_helm_repo(repo_name, repo_url):
+    cmd_helm_repo = ["helm", "repo", "add", repo_name, repo_url]
+    response_helm_repo = subprocess.Popen(cmd_helm_repo, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output_helm_repo, error_helm_repo = response_helm_repo.communicate()
+    if response_helm_repo.returncode != 0:
+        pytest.fail("Unable to add repository {} to helm: ".format(repo_url) + error_helm_repo.decode("ascii"))
+    return output_helm_repo.decode("ascii")
+
+
+# Function to install helm charts
+def install_helm_chart(helm_release_name, helm_release_namespace, helm_chart_path, wait=False, **kwargs):
+    cmd_helm_install = ["helm", "install", helm_release_name, helm_chart_path, "--namespace", helm_release_namespace]
+    if wait:
+        cmd_helm_install.extend(["--wait"])
+    for key, value in kwargs.items():
+        cmd_helm_install.extend(["--set", "{}={}".format(key, value)])
+    response_helm_install = subprocess.Popen(cmd_helm_install, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output_helm_install, error_helm_install = response_helm_install.communicate()
+    if response_helm_install.returncode != 0:
+        pytest.fail("Unable to install helm release: " + error_helm_install.decode("ascii"))
+    return output_helm_install.decode("ascii")
+
+
+# Function to delete a helm release
+def delete_helm_release(helm_release_name, helm_release_namespace):
+    cmd_helm_delete = ["helm", "delete", helm_release_name, "--namespace", helm_release_namespace]
+    response_helm_delete = subprocess.Popen(cmd_helm_delete, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output_helm_delete, error_helm_delete = response_helm_delete.communicate()
+    if response_helm_delete.returncode != 0:
+        pytest.fail("Error occurred while deleting the helm release: " + error_helm_delete.decode("ascii"))
+    return output_helm_delete.decode("ascii")
+
+
+# Function to list helm releases
+def list_helm_release(helm_release_namespace):
+    cmd_helm_list = ["helm", "list", "--namespace", helm_release_namespace]
+    response_helm_list = subprocess.Popen(cmd_helm_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output_helm_list, error_helm_list = response_helm_list.communicate()
+    if response_helm_list.returncode != 0:
+        pytest.fail("Error occurred while fetching the helm releases: " + error_helm_list.decode("ascii"))
+    return output_helm_list.decode("ascii")
diff --git a/test/e2e/src/common/kubernetes_configmap_utility.py b/test/e2e/src/common/kubernetes_configmap_utility.py
new file mode 100644
index 000000000..caee9628e
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_configmap_utility.py
@@ -0,0 +1,8 @@
+import pytest
+
+
+def get_namespaced_configmap(api_instance, namespace, configmap_name):
+    try:
+        return api_instance.read_namespaced_config_map(configmap_name, namespace)
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving configmap: " + str(e))
diff --git a/test/e2e/src/common/kubernetes_crd_utility.py b/test/e2e/src/common/kubernetes_crd_utility.py
new file mode 100644
index 000000000..f84092878
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_crd_utility.py
@@ -0,0 +1,27 @@
+import pytest
+
+from kubernetes import watch
+
+
+# Function to get the CRD instance
+def get_crd_instance(api_instance, group, version, namespace, plural, crd_name):
+    try:
+        return api_instance.get_namespaced_custom_object(group, version, namespace, plural, crd_name)
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving crd information: " + str(e))
+
+
+# Function that watches events corresponding to a given CRD instance and passes the events to a callback function
+def watch_crd_instance(api_instance, group, version, namespace, plural, crd_name, timeout, callback=None):
+    if not callback:
+        pytest.fail("callback should be specified")
+
+    field_selector = "metadata.name={}".format(crd_name) if crd_name else ""
+    try:
+        w = watch.Watch()
+        for event in w.stream(api_instance.list_namespaced_custom_object, group, version, namespace, plural, field_selector=field_selector, timeout_seconds=timeout):
+            if callback(event):
+                return
+    except Exception as e:
+        pytest.fail("Error occurred when watching crd instance events: " + str(e))
+    pytest.fail("The watch on the crd instance events has timed out.")
diff --git a/test/e2e/src/common/kubernetes_daemonset_utility.py b/test/e2e/src/common/kubernetes_daemonset_utility.py
new file mode 100644
index 000000000..dd76a11d9
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_daemonset_utility.py
@@ -0,0 +1,36 @@
+import pytest
+from kubernetes import watch
+
+# Returns a list of daemon_sets in a given namespace
+def list_daemon_set(api_instance, namespace, field_selector="", label_selector=""):
+    try:
+        return api_instance.list_namespaced_daemon_set(namespace, field_selector=field_selector, label_selector=label_selector)
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving daemon_sets: " + str(e))
+
+# Deletes a daemon_set
+def delete_daemon_set(api_instance, namespace, daemon_set_name):
+    try:
+        return api_instance.delete_namespaced_daemon_set(daemon_set_name, namespace)
+    except Exception as e:
+        pytest.fail("Error occurred when deleting daemon_set: " + str(e))
+
+# Reads a daemon_set
+def read_daemon_set(api_instance, namespace, daemon_set_name):
+    try:
+        return api_instance.read_namespaced_daemon_set(daemon_set_name, namespace)
+    except Exception as e:
+        pytest.fail("Error occurred when reading daemon_set: " + str(e))
+
+# Function that watches events corresponding to daemon_sets in the given namespace and passes the events to a callback function
+def watch_daemon_set_status(api_instance, namespace, timeout, callback=None):
+    if not callback:
+        return
+    try:
+        w = watch.Watch()
+        for event in w.stream(api_instance.list_namespaced_daemon_set, namespace, timeout_seconds=timeout):
+            if callback(event):
+                return
+    except Exception as e:
+        print("Error occurred when checking daemon_set status: " + str(e))
+    print("The watch on the daemon_set status has timed out. Please see the pod logs for more info.")
diff --git a/test/e2e/src/common/kubernetes_deployment_utility.py b/test/e2e/src/common/kubernetes_deployment_utility.py
new file mode 100644
index 000000000..1be7a6b71
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_deployment_utility.py
@@ -0,0 +1,38 @@
+import pytest
+from kubernetes import watch
+
+# Returns a list of deployments in a given namespace
+def list_deployment(api_instance, namespace, field_selector="", label_selector=""):
+    try:
+        return api_instance.list_namespaced_deployment(namespace, field_selector=field_selector, label_selector=label_selector)
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving deployments: " + str(e))
+
+# Deletes a deployment
+def delete_deployment(api_instance, namespace, deployment_name):
+    try:
+        return api_instance.delete_namespaced_deployment(deployment_name, namespace)
+    except Exception as e:
+        pytest.fail("Error occurred when deleting deployment: " + str(e))
+
+
+# Reads a deployment
+def read_deployment(api_instance, namespace, deployment_name):
+    try:
+        return api_instance.read_namespaced_deployment(deployment_name, namespace)
+    except Exception as e:
+        pytest.fail("Error occurred when reading deployment: " + str(e))
+
+# Function that watches events corresponding to deployments in the given namespace and passes the events to a callback function
+def watch_deployment_status(api_instance, namespace, timeout, callback=None):
+    if not callback:
+        return
+    try:
+        w = watch.Watch()
+        for event in w.stream(api_instance.list_namespaced_deployment, namespace, timeout_seconds=timeout):
+            if callback(event):
+                return
+    except Exception as e:
+        print("Error occurred when checking deployment status: " + str(e))
+    print("The watch on the deployment status has timed out. Please see the pod logs for more info.")
+
\ No newline at end of file
diff --git a/test/e2e/src/common/kubernetes_namespace_utility.py b/test/e2e/src/common/kubernetes_namespace_utility.py
new file mode 100644
index 000000000..cea5788c5
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_namespace_utility.py
@@ -0,0 +1,32 @@
+import pytest
+from kubernetes import watch
+
+
+# Function that watches events corresponding to kubernetes namespaces and passes the events to a callback function
+def watch_namespace(api_instance, timeout, callback=None):
+    if not callback:
+        return
+    try:
+        w = watch.Watch()
+        for event in w.stream(api_instance.list_namespace, timeout_seconds=timeout):
+            if callback(event):
+                return
+    except Exception as e:
+        pytest.fail("Error occurred when checking namespace status: " + str(e))
+    pytest.fail("The watch on the namespaces has timed out.")
+
+
+# Function to list all kubernetes namespaces
+def list_namespace(api_instance):
+    try:
+        return api_instance.list_namespace()
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving namespaces: " + str(e))
+
+
+# Function to delete a kubernetes namespace
+def delete_namespace(api_instance, namespace_name):
+    try:
+        return api_instance.delete_namespace(namespace_name)
+    except Exception as e:
+        pytest.fail("Error occurred when deleting namespace: " + str(e))
diff --git a/test/e2e/src/common/kubernetes_node_utility.py b/test/e2e/src/common/kubernetes_node_utility.py
new file mode 100644
index 000000000..050ce8b87
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_node_utility.py
@@ -0,0 +1,12 @@
+import pytest
+
+def get_kubernetes_node_count(api_instance):
+    node_list = list_kubernetes_nodes(api_instance)
+    return len(node_list.items)
+
+def list_kubernetes_nodes(api_instance):
+    try:
+        return api_instance.list_node()
+    except Exception as e:
+        pytest.fail("Error occurred while retrieving node information: " + str(e))
+
diff --git a/test/e2e/src/common/kubernetes_pod_utility.py b/test/e2e/src/common/kubernetes_pod_utility.py
new file mode 100644
index 000000000..27345fae7
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_pod_utility.py
@@ -0,0 +1,65 @@
+import pytest
+import time
+
+from kubernetes import watch
+from kubernetes.stream import stream
+
+# Returns a kubernetes pod object in the given namespace. Object description at: https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodList.md
+def get_pod(api_instance, namespace, pod_name):
+    try:
+        return api_instance.read_namespaced_pod(pod_name, namespace)
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving pod information: " + str(e))
+
+
+# Returns a list of kubernetes pod objects in a given namespace. Object description at: https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodList.md
+def get_pod_list(api_instance, namespace, label_selector=""):
+    try:
+        return api_instance.list_namespaced_pod(namespace, label_selector=label_selector)
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving pod information: " + str(e))
+
+# Gets the content of a log file in the container via exec (the file is streamed back as a tar archive)
+def get_log_file_content(api_instance, namespace, podName, logfilePath):
+    try:
+        exec_command = ['tar', 'cf', '-', logfilePath]
+        return stream(api_instance.connect_get_namespaced_pod_exec, podName, namespace, command=exec_command, stderr=True, stdin=False, stdout=True, tty=False)
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving log file content: " + str(e))
+
+# Function that watches events corresponding to pods in the given namespace and passes the events to a callback function
+def watch_pod_status(api_instance, namespace, timeout, callback=None):
+    if not callback:
+        return
+    try:
+        w = watch.Watch()
+        for event in w.stream(api_instance.list_namespaced_pod, namespace, timeout_seconds=timeout):
+            if callback(event):
+                return
+    except Exception as e:
+        pytest.fail("Error occurred when checking pod status: " + str(e))
+    pytest.fail("The watch on the pods has timed out. Please see the pod logs for more info.")
+
+
+# Function that watches events corresponding to pod logs and passes them to a callback function
+def watch_pod_logs(api_instance, namespace, pod_name, container_name, timeout_seconds, callback=None):
+    if not callback:
+        return
+    try:
+        w = watch.Watch()
+        timeout = time.time() + timeout_seconds
+        for event in w.stream(api_instance.read_namespaced_pod_log, pod_name, namespace, container=container_name):
+            if callback(event):
+                return
+            if time.time() > timeout:
+                pytest.fail("The watch on the pod logs has timed out.")
+    except Exception as e:
+        pytest.fail("Error occurred when checking pod logs: " + str(e))
+
+
+# Function that returns the pod logs of a given container.
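+# (equivalent to `kubectl logs -n <namespace> <pod> -c <container>`)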
+def get_pod_logs(api_instance, pod_namespace, pod_name, container_name):
+    try:
+        return api_instance.read_namespaced_pod_log(pod_name, pod_namespace, container=container_name)
+    except Exception as e:
+        pytest.fail("Error occurred when fetching pod logs: " + str(e))
diff --git a/test/e2e/src/common/kubernetes_secret_utility.py b/test/e2e/src/common/kubernetes_secret_utility.py
new file mode 100644
index 000000000..8cc07fd4d
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_secret_utility.py
@@ -0,0 +1,26 @@
+import sys
+
+from kubernetes import watch
+
+
+# This function returns the kubernetes secret object present in a given namespace
+def get_kubernetes_secret(api_instance, namespace, secret_name):
+    try:
+        return api_instance.read_namespaced_secret(secret_name, namespace)
+    except Exception as e:
+        sys.exit("Error occurred when retrieving secret '{}': ".format(secret_name) + str(e))
+
+
+# Function that watches events corresponding to kubernetes secrets and passes the events to a callback function
+def watch_kubernetes_secret(api_instance, namespace, secret_name, timeout, callback=None):
+    if not callback:
+        return
+    field_selector = "metadata.name={}".format(secret_name) if secret_name else ""
+    try:
+        w = watch.Watch()
+        for event in w.stream(api_instance.list_namespaced_secret, namespace, field_selector=field_selector, timeout_seconds=timeout):
+            if callback(event):
+                return
+    except Exception as e:
+        sys.exit("Error occurred when watching kubernetes secret events: " + str(e))
+    sys.exit("The watch on the kubernetes secret events has timed out. Please see the pod logs for more info.")
diff --git a/test/e2e/src/common/kubernetes_service_utility.py b/test/e2e/src/common/kubernetes_service_utility.py
new file mode 100644
index 000000000..694af885a
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_service_utility.py
@@ -0,0 +1,19 @@
+import pytest
+
+from kubernetes import watch
+
+
+# Returns a list of services in a given namespace
+def list_service(api_instance, namespace, field_selector="", label_selector=""):
+    try:
+        return api_instance.list_namespaced_service(namespace, field_selector=field_selector, label_selector=label_selector)
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving services: " + str(e))
+
+
+# Deletes a service
+def delete_service(api_instance, namespace, service_name):
+    try:
+        return api_instance.delete_namespaced_service(service_name, namespace)
+    except Exception as e:
+        pytest.fail("Error occurred when deleting service: " + str(e))
diff --git a/test/e2e/src/common/kubernetes_version_utility.py b/test/e2e/src/common/kubernetes_version_utility.py
new file mode 100644
index 000000000..884d1df2f
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_version_utility.py
@@ -0,0 +1,9 @@
+import pytest
+
+
+def get_kubernetes_server_version(api_instance):
+    try:
+        api_response = api_instance.get_code()
+        return api_response.git_version
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving the kubernetes server version: " + str(e))
diff --git a/test/e2e/src/common/results_utility.py b/test/e2e/src/common/results_utility.py
new file mode 100644
index 000000000..14066bf16
--- /dev/null
+++ b/test/e2e/src/common/results_utility.py
@@ -0,0 +1,24 @@
+import pytest
+import shutil
+import tarfile
+
+from pathlib import Path
+
+
+# Function to create the test result directory
+def create_results_dir(results_dir):
+    print(results_dir)
+    try:
+        Path(results_dir).mkdir(parents=True, exist_ok=True)
+    except Exception as e:
+        pytest.fail("Unable to create the results directory: " + str(e))
+
+
+# Function to append logs from the test run into a result file
+def append_result_output(message, result_file_path):
+    try:
+        with open(result_file_path, "a") as result_file:
+            result_file.write(message)
+    except Exception as e:
+        pytest.fail("Error while appending message '{}' to the results file: ".format(message) + str(e))
diff --git a/test/e2e/src/core/Dockerfile b/test/e2e/src/core/Dockerfile
new file mode 100644
index 000000000..9f85bdf4c
--- /dev/null
+++ b/test/e2e/src/core/Dockerfile
@@ -0,0 +1,17 @@
+FROM python:3.6
+
+RUN pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org pytest pytest-xdist filelock requests kubernetes adal msrestazure
+
+RUN curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash \
+    && helm version
+
+COPY ./core/e2e_tests.sh /
+COPY ./core/pytest.ini /e2etests/
+COPY ./core/conftest.py /e2etests/
+COPY ./core/helper.py /e2etests/
+COPY ./core/ /e2etests/
+COPY ./common/ /e2etests/
+COPY ./tests/ /e2etests/
+
+RUN ["chmod", "+x", "/e2e_tests.sh"]
+ENTRYPOINT ["./e2e_tests.sh"]
diff --git a/test/e2e/src/core/conftest.py b/test/e2e/src/core/conftest.py
new file mode 100644
index 000000000..e659d5189
--- /dev/null
+++ b/test/e2e/src/core/conftest.py
@@ -0,0 +1,90 @@
+import pytest
+import os
+import time
+import pickle
+
+import constants
+
+from filelock import FileLock
+from pathlib import Path
+from results_utility import create_results_dir, append_result_output
+
+pytestmark = pytest.mark.agentests
+
+# Fixture to collect all the environment variables and install prerequisites. It will be run before the tests.
+@pytest.fixture(scope='session', autouse=True)
+def env_dict():
+    my_file = Path("env.pkl")  # File to store the environment variables.
+    with FileLock(str(my_file) + ".lock"):  # Locking the file, since each test runs in parallel as a separate subprocess and may try to access the file simultaneously.
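+        # The first worker to acquire the lock performs the setup below and pickles env_dict;
+        # workers that arrive later simply unpickle it (see the else branch further down).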
+        env_dict = {}
+        if not my_file.is_file():
+            # Creating the results directory
+            create_results_dir('/tmp/results')
+
+            # Setting some environment variables
+            env_dict['SETUP_LOG_FILE'] = '/tmp/results/setup'
+            env_dict['TEST_AGENT_LOG_FILE'] = '/tmp/results/containerinsights'
+            env_dict['NUM_TESTS_COMPLETED'] = 0
+
+            print("Starting setup...")
+            append_result_output("Starting setup...\n", env_dict['SETUP_LOG_FILE'])
+
+            # Collecting environment variables
+            env_dict['TENANT_ID'] = os.getenv('TENANT_ID')
+            env_dict['CLIENT_ID'] = os.getenv('CLIENT_ID')
+            env_dict['CLIENT_SECRET'] = os.getenv('CLIENT_SECRET')
+
+            # get the default query time interval for log analytics queries
+            queryTimeInterval = int(os.getenv('DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES')) if os.getenv('DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES') else constants.DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES
+            # add the minute suffix, since this format is required for LA queries
+            env_dict['DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES'] = str(queryTimeInterval) + "m"
+
+            # get the default query time interval for metrics queries
+            env_dict['DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES'] = int(os.getenv('DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES')) if os.getenv('DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES') else constants.DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES
+
+            # expected agent pod restart count
+            env_dict['AGENT_POD_EXPECTED_RESTART_COUNT'] = int(os.getenv('AGENT_POD_EXPECTED_RESTART_COUNT')) if os.getenv('AGENT_POD_EXPECTED_RESTART_COUNT') else constants.AGENT_POD_EXPECTED_RESTART_COUNT
+
+            # default to the azure public cloud if AZURE_CLOUD is not specified
+            env_dict['AZURE_ENDPOINTS'] = constants.AZURE_CLOUD_DICT.get(os.getenv('AZURE_CLOUD')) if os.getenv('AZURE_CLOUD') else constants.AZURE_PUBLIC_CLOUD_ENDPOINTS
+
+            if not env_dict.get('TENANT_ID'):
+                pytest.fail('ERROR: variable TENANT_ID is required.')
+
+            if not env_dict.get('CLIENT_ID'):
+                pytest.fail('ERROR: variable CLIENT_ID is required.')
+
+            if not env_dict.get('CLIENT_SECRET'):
+                pytest.fail('ERROR: variable CLIENT_SECRET is required.')
+
+            print("Setup Complete.")
+            append_result_output("Setup Complete.\n", env_dict['SETUP_LOG_FILE'])
+
+            with Path.open(my_file, "wb") as f:
+                pickle.dump(env_dict, f, pickle.HIGHEST_PROTOCOL)
+        else:
+            with Path.open(my_file, "rb") as f:
+                env_dict = pickle.load(f)
+
+    yield env_dict
+
+    my_file = Path("env.pkl")
+    with FileLock(str(my_file) + ".lock"):
+        with Path.open(my_file, "rb") as f:
+            env_dict = pickle.load(f)
+
+        env_dict['NUM_TESTS_COMPLETED'] = 1 + env_dict.get('NUM_TESTS_COMPLETED')
+        if env_dict['NUM_TESTS_COMPLETED'] == int(os.getenv('NUM_TESTS')):
+            # Checking if cleanup is required.
+            if os.getenv('SKIP_CLEANUP'):
+                return
+            print('Starting cleanup...')
+            append_result_output("Starting Cleanup...\n", env_dict['SETUP_LOG_FILE'])
+
+            print("Cleanup Complete.")
+            append_result_output("Cleanup Complete.\n", env_dict['SETUP_LOG_FILE'])
+            return
+
+        with Path.open(my_file, "wb") as f:
+            pickle.dump(env_dict, f, pickle.HIGHEST_PROTOCOL)
diff --git a/test/e2e/src/core/e2e_tests.sh b/test/e2e/src/core/e2e_tests.sh
new file mode 100644
index 000000000..3bfafdce9
--- /dev/null
+++ b/test/e2e/src/core/e2e_tests.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+
+results_dir="${RESULTS_DIR:-/tmp/results}"
+
+# saveResults prepares the results for handoff to the Sonobuoy worker.
+# See: https://github.com/vmware-tanzu/sonobuoy/blob/master/docs/plugins.md
+saveResults() {
+    cd ${results_dir}
+
+    # Sonobuoy worker expects a tar file.
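+    # (bundles everything under ${results_dir}, including the log files the test fixtures write there)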
+    tar czf results.tar.gz *
+
+    # Signal to the worker that we are done and where to find the results.
+    printf ${results_dir}/results.tar.gz > ${results_dir}/done
+}
+
+# Ensure that we tell the Sonobuoy worker we are done regardless of results.
+trap saveResults EXIT
+
+# The variable 'TEST_LIST' should be provided if we want to run specific tests. If not provided, all tests are run.
+
+NUM_PROCESS=$(pytest /e2etests/ --collect-only -k "$TEST_NAME_LIST" -m "$TEST_MARKER_LIST" | grep " 0):
+            pytest.fail("numberMisscheduled shouldn't be greater than 0 for the daemonset {}.".format(
+                daemonset_name))
+
+    except Exception as e:
+        pytest.fail("Error occurred while checking daemonset status: " + str(e))
+
+# This function checks the status of kubernetes pods
+def check_kubernetes_pods_status(pod_namespace, label_selector, expectedPodRestartCount, outfile=None):
+    try:
+        api_instance = client.CoreV1Api()
+        pod_list = get_pod_list(api_instance, pod_namespace, label_selector)
+        append_result_output("podlist output {}\n".format(pod_list), outfile)
+        if not pod_list:
+            pytest.fail("pod_list shouldn't be null or empty")
+        pods = pod_list.items
+        if not pods:
+            pytest.fail("pod items shouldn't be null or empty")
+        if len(pods) <= 0:
+            pytest.fail("pod count should be greater than 0")
+        for pod in pods:
+            status = pod.status
+            podstatus = status.phase
+            if not podstatus:
+                pytest.fail("status should not be null or empty")
+            if podstatus != "Running":
+                pytest.fail("pod status should be in the Running state")
+            containerStatuses = status.container_statuses
+            if not containerStatuses:
+                pytest.fail("containerStatuses shouldn't be nil or empty")
+            if len(containerStatuses) <= 0:
+                pytest.fail("length of containerStatuses should be greater than 0")
+            for containerStatus in containerStatuses:
+                containerId = containerStatus.container_id
+                if not containerId:
+                    pytest.fail("containerId shouldn't be nil or empty")
+                image = containerStatus.image
+                if not image:
+                    pytest.fail("image shouldn't be nil or empty")
+                imageId = containerStatus.image_id
+                if not imageId:
+                    pytest.fail("imageId shouldn't be nil or empty")
+                restartCount = containerStatus.restart_count
+                if restartCount > expectedPodRestartCount:
+                    pytest.fail("restartCount shouldn't be greater than the expected pod restart count: {}".format(expectedPodRestartCount))
+                ready = containerStatus.ready
+                if not ready:
+                    pytest.fail("container status should be in the ready state")
+                containerState = containerStatus.state
+                if not containerState.running:
+                    pytest.fail("container state should be in the running state")
+    except Exception as e:
+        pytest.fail("Error occurred while checking pods status: " + str(e))
+
+
+def check_namespace_status_using_watch(outfile=None, namespace_list=None, timeout=300):
+    namespace_dict = {}
+    for namespace in namespace_list:
+        namespace_dict[namespace] = 0
+    append_result_output("Namespace dict: {}\n".format(namespace_dict), outfile)
+    print("Generated the namespace dictionary.")
+
+    # The callback function to check the namespace status
+    def namespace_event_callback(event):
+        try:
+            append_result_output("{}\n".format(event), outfile)
+            namespace_name = event['raw_object'].get('metadata').get('name')
+            namespace_status = event['raw_object'].get('status')
+            if not namespace_status:
+                return False
+            if namespace_status.get('phase') == 'Active':
+                namespace_dict[namespace_name] = 1
+            if all(ele == 1 for ele in list(namespace_dict.values())):
+                return True
+            return False
+        except Exception as e:
+            pytest.fail("Error occurred while processing the namespace event: " + str(e))
+
+    # Checking the namespace status
+    api_instance = client.CoreV1Api()
+    watch_namespace(api_instance, timeout, namespace_event_callback)
+
+# This function checks the status of daemonsets in a given namespace. The daemonsets to be monitored are identified using the label list parameter.
+def check_kubernetes_daemonset_status_using_watch(daemonset_namespace, outfile=None, daemonset_label_list=None, timeout=300):
+    daemonset_label_dict = {}
+    if daemonset_label_list:  # This parameter is a list of label values to identify the daemonsets that we want to monitor in the given namespace
+        for daemonset_label in daemonset_label_list:
+            daemonset_label_dict[daemonset_label] = 0
+        append_result_output("daemonset label dict: {}\n".format(daemonset_label_dict), outfile)
+    print("Generated the daemonset dictionary.")
+
+    # The callback function to check if the daemonset is fully scheduled and ready
+    def daemonset_event_callback(event):
+        try:
+            # append_result_output("{}\n".format(event), outfile)
+            daemonset_status = event['raw_object'].get('status')
+            daemonset_metadata = event['raw_object'].get('metadata')
+            daemonset_metadata_labels = daemonset_metadata.get('labels')
+            if not daemonset_metadata_labels:
+                return False
+
+            # It contains the list of all label values for the daemonset whose event was called.
+            daemonset_metadata_label_values = daemonset_metadata_labels.values()
+            # This label value will be common to the daemonset event and the label list provided, and will be monitored
+            current_label_value = None
+            for label_value in daemonset_metadata_label_values:
+                if label_value in daemonset_label_dict:
+                    current_label_value = label_value
+            if not current_label_value:
+                return False
+
+            currentNumberScheduled = daemonset_status.get('currentNumberScheduled')
+            desiredNumberScheduled = daemonset_status.get('desiredNumberScheduled')
+            numberAvailable = daemonset_status.get('numberAvailable')
+            numberReady = daemonset_status.get('numberReady')
+            numberMisscheduled = daemonset_status.get('numberMisscheduled')
+
+            if (currentNumberScheduled != desiredNumberScheduled):
+                pytest.fail("currentNumberScheduled doesn't match desiredNumberScheduled for the daemonset {}.".format(daemonset_metadata.get('name')))
+
+            if (numberAvailable != numberReady):
+                pytest.fail("numberAvailable doesn't match the expected numberReady for the daemonset {}.".format(daemonset_metadata.get('name')))
+
+            if (numberMisscheduled > 0):
+                pytest.fail("numberMisscheduled is greater than 0 for the daemonset {}.".format(daemonset_metadata.get('name')))
+
+            return True
+        except Exception as e:
+            print("Error occurred while processing the daemonset event: " + str(e))
+
+    # Checking the status of all daemonsets
+    if daemonset_label_dict:
+        api_instance = client.AppsV1Api()
+        watch_daemon_set_status(api_instance, daemonset_namespace, timeout, daemonset_event_callback)
+
+# This function checks the status of deployments in a given namespace. The deployments to be monitored are identified using the label list parameter.
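+# Illustrative call (the label value is an assumption based on the agent constants above):
+#   check_kubernetes_deployments_status_using_watch('kube-system', outfile, ['omsagent-rs'], timeout=300)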
+def check_kubernetes_deployments_status_using_watch(deployment_namespace, outfile=None, deployment_label_list=None, timeout=300):
+    deployment_label_dict = {}
+    if deployment_label_list:  # This parameter is a list of label values to identify the deployments that we want to monitor in the given namespace
+        for deployment_label in deployment_label_list:
+            deployment_label_dict[deployment_label] = 0
+        append_result_output("Deployment label dict: {}\n".format(deployment_label_dict), outfile)
+    print("Generated the deployment dictionary.")
+
+    # The callback function to check if the deployment is fully available and ready
+    def deployment_event_callback(event):
+        try:
+            # append_result_output("{}\n".format(event), outfile)
+            deployment_status = event['raw_object'].get('status')
+            deployment_metadata = event['raw_object'].get('metadata')
+            deployment_metadata_labels = deployment_metadata.get('labels')
+            if not deployment_metadata_labels:
+                return False
+
+            # It contains the list of all label values for the deployment whose event was called.
+            deployment_metadata_label_values = deployment_metadata_labels.values()
+            # This label value will be common to the deployment event and the label list provided, and will be monitored
+            current_label_value = None
+            for label_value in deployment_metadata_label_values:
+                if label_value in deployment_label_dict:
+                    current_label_value = label_value
+            if not current_label_value:
+                return False
+
+            availableReplicas = deployment_status.get('availableReplicas')
+            readyReplicas = deployment_status.get('readyReplicas')
+            replicas = deployment_status.get('replicas')
+
+            if (replicas != availableReplicas):
+                pytest.fail("availableReplicas doesn't match the expected replicas for the deployment {}.".format(deployment_metadata.get('name')))
+
+            if (replicas != readyReplicas):
+                pytest.fail("readyReplicas doesn't match the expected replicas for the deployment {}.".format(deployment_metadata.get('name')))
+
+            return True
+        except Exception as e:
+            print("Error occurred while processing the deployment event: " + str(e))
+
+    # Checking the status of all deployments
+    if deployment_label_dict:
+        api_instance = client.AppsV1Api()
+        watch_deployment_status(api_instance, deployment_namespace, timeout, deployment_event_callback)
+
+# This function checks the status of pods in a given namespace. The pods to be monitored are identified using the pod label list parameter.
+def check_kubernetes_pods_status_using_watch(pod_namespace, outfile=None, pod_label_list=None, timeout=300):
+    pod_label_dict = {}
+    if pod_label_list:  # This parameter is a list of label values to identify the pods that we want to monitor in the given namespace
+        for pod_label in pod_label_list:
+            pod_label_dict[pod_label] = 0
+        append_result_output("Pod label dict: {}\n".format(pod_label_dict), outfile)
+    print("Generated the pods dictionary.")
+
+    # The callback function to check if the pod is in the running state
+    def pod_event_callback(event):
+        try:
+            # append_result_output("{}\n".format(event), outfile)
+            pod_status = event['raw_object'].get('status')
+            pod_metadata = event['raw_object'].get('metadata')
+            pod_metadata_labels = pod_metadata.get('labels')
+            if not pod_metadata_labels:
+                return False
+
+            # It contains the list of all label values for the pod whose event was called.
+            pod_metadata_label_values = pod_metadata_labels.values()
+            # This label value will be common to the pod event and the label list provided, and will be monitored
+            current_label_value = None
+            for label_value in pod_metadata_label_values:
+                if label_value in pod_label_dict:
+                    current_label_value = label_value
+            if not current_label_value:
+                return False
+
+            if pod_status.get('containerStatuses'):
+                for container in pod_status.get('containerStatuses'):
+                    if container.get('restartCount') > 0:
+                        pytest.fail("The pod {} was restarted. Please see the pod logs for more info.".format(container.get('name')))
+                    if not container.get('state').get('running'):
+                        pod_label_dict[current_label_value] = 0
+                        return False
+                    else:
+                        pod_label_dict[current_label_value] = 1
+            if all(ele == 1 for ele in list(pod_label_dict.values())):
+                return True
+            return False
+        except Exception as e:
+            pytest.fail("Error occurred while processing the pod event: " + str(e))
+
+    # Checking the status of all pods
+    if pod_label_dict:
+        api_instance = client.CoreV1Api()
+        watch_pod_status(api_instance, pod_namespace, timeout, pod_event_callback)
+
+
+# Function to check if the crd instance status has been updated with the status fields mentioned in the 'status_dict' parameter
+def check_kubernetes_crd_status_using_watch(crd_group, crd_version, crd_namespace, crd_plural, crd_name, status_dict={}, outfile=None, timeout=300):
+    # The callback function to check if the crd event received has been updated with the status fields
+    def crd_event_callback(event):
+        try:
+            append_result_output("{}\n".format(event), outfile)
+            crd_status = event['raw_object'].get('status')
+            if not crd_status:
+                return False
+            for status_field in status_dict:
+                if not crd_status.get(status_field):
+                    return False
+                if crd_status.get(status_field) != status_dict.get(status_field):
+                    pytest.fail("The CRD instance status has been updated with an incorrect value for the '{}' field.".format(status_field))
+            return True
+        except Exception as e:
+            pytest.fail("Error occurred while processing crd event: " + str(e))
+
+    # Checking if the CRD instance has been updated with the status fields
+    api_instance = client.CustomObjectsApi()
+    watch_crd_instance(api_instance, crd_group, crd_version, crd_namespace, crd_plural, crd_name, timeout, crd_event_callback)
+
+
+# Function to monitor the pod logs. It ensures that all logs passed in the 'logs_list' parameter are present in the container logs.
+def check_kubernetes_pod_logs_using_watch(pod_namespace, pod_name, container_name, logs_list=None, error_logs_list=None, outfile=None, timeout=300):
+    logs_dict = {}
+    for log in logs_list:
+        logs_dict[log] = 0
+    print("Generated the logs dictionary.")
+
+    # The callback function to examine the pod log
+    def pod_log_event_callback(event):
+        try:
+            append_result_output("{}\n".format(event), outfile)
+            for error_log in error_logs_list:
+                if error_log in event:
+                    pytest.fail("Error log found: " + event)
+            for log in logs_dict:
+                if log in event:
+                    logs_dict[log] = 1
+            if all(ele == 1 for ele in list(logs_dict.values())):
+                return True
+            return False
+        except Exception as e:
+            pytest.fail("Error occurred while processing pod log event: " + str(e))
+
+    # Checking the pod logs
+    api_instance = client.CoreV1Api()
+    watch_pod_logs(api_instance, pod_namespace, pod_name, container_name, timeout, pod_log_event_callback)
+
+# Function to monitor the kubernetes secret. It will determine if the secret has been successfully created.
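+# (the watch returns as soon as an event for the secret carries a non-empty 'data' field)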
+def check_kubernetes_secret_using_watch(secret_namespace, secret_name, timeout=300):
+    # The callback function to check if the secret event received has secret data
+    def secret_event_callback(event):
+        try:
+            secret_data = event['raw_object'].get('data')
+            if not secret_data:
+                return False
+            return True
+        except Exception as e:
+            pytest.fail("Error occurred while processing secret event: " + str(e))
+
+    # Checking the kubernetes secret
+    api_instance = client.CoreV1Api()
+    watch_kubernetes_secret(api_instance, secret_namespace, secret_name, timeout, secret_event_callback)
diff --git a/test/e2e/src/core/pytest.ini b/test/e2e/src/core/pytest.ini
new file mode 100644
index 000000000..f4dc462f0
--- /dev/null
+++ b/test/e2e/src/core/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+markers =
+    agentests: marks tests as a part of arc agent conformance tests (deselect with '-m "not agentests"')
+
\ No newline at end of file
diff --git a/test/e2e/src/tests/test_ds_workflows.py b/test/e2e/src/tests/test_ds_workflows.py
new file mode 100755
index 000000000..81ef08325
--- /dev/null
+++ b/test/e2e/src/tests/test_ds_workflows.py
@@ -0,0 +1,60 @@
+import pytest
+import constants
+
+from kubernetes import client, config
+from kubernetes_pod_utility import get_pod_list, get_log_file_content
+from results_utility import append_result_output
+from helper import check_kubernetes_deployment_status
+from helper import check_kubernetes_daemonset_status
+from helper import check_kubernetes_pods_status
+from kubernetes.stream import stream
+
+pytestmark = pytest.mark.agentests
+
+# validation of the daemonset agent workflows
+def test_ds_workflows(env_dict):
+    print("Starting daemonset agent workflows test.")
+    append_result_output("test_ds_workflows start \n", env_dict['TEST_AGENT_LOG_FILE'])
+    # Loading in-cluster kube-config
+    try:
+        config.load_incluster_config()
+    except Exception as e:
+        pytest.fail("Error loading the in-cluster config: " + str(e))
+
+    print("getting daemonset pod list")
+    api_instance = client.CoreV1Api()
+    pod_list = get_pod_list(api_instance, constants.AGENT_RESOURCES_NAMESPACE, constants.AGENT_DAEMON_SET_PODS_LABEL_SELECTOR)
+    if not pod_list:
+        pytest.fail("daemonset pod_list shouldn't be null or empty")
+
+    if len(pod_list.items) <= 0:
+        pytest.fail("number of items in the daemonset pod list should be greater than 0")
+
+    for podItem in pod_list.items:
+        podName = podItem.metadata.name
+        logcontent = get_log_file_content(api_instance, constants.AGENT_RESOURCES_NAMESPACE, podName, constants.AGENT_OMSAGENT_LOG_PATH)
+        if not logcontent:
+            pytest.fail("logcontent should not be null or empty for pod: " + podName)
+        loglines = logcontent.split("\n")
+        if len(loglines) <= 0:
+            pytest.fail("number of log lines should be greater than 0 for pod: " + podName)
+
+        IsContainerPerfEmitStream = False
+        IsContainerInventoryStream = False
+        for line in loglines:
+            if line.find(constants.CONTAINER_PERF_EMIT_STREAM) >= 0:
+                IsContainerPerfEmitStream = True
+            if line.find(constants.CONTAINER_INVENTORY_EMIT_STREAM) >= 0:
+                IsContainerInventoryStream = True
+
+        if not IsContainerPerfEmitStream:
+            pytest.fail("ContainerPerf stream not emitted successfully from pod: " + podName)
+        if not IsContainerInventoryStream:
+            pytest.fail("ContainerInventory stream not emitted successfully from pod: " + podName)
+
+    append_result_output("test_ds_workflows end \n", env_dict['TEST_AGENT_LOG_FILE'])
+    print("Successfully completed daemonset workflows test.")
diff --git a/test/e2e/src/tests/test_e2e_workflows.py b/test/e2e/src/tests/test_e2e_workflows.py
b/test/e2e/src/tests/test_e2e_workflows.py new file mode 100755 index 000000000..11a8e18e3 --- /dev/null +++ b/test/e2e/src/tests/test_e2e_workflows.py @@ -0,0 +1,330 @@ +import pytest +import constants +import requests + +from arm_rest_utility import fetch_aad_token +from kubernetes import client, config +from kubernetes_pod_utility import get_pod_list +from results_utility import append_result_output + + +pytestmark = pytest.mark.agentests + +# validation of workflows e2e +def test_e2e_workflows(env_dict): + print("Starting e2e workflows test.") + append_result_output("test_e2e_workflows start \n", + env_dict['TEST_AGENT_LOG_FILE']) + # Loading in-cluster kube-config + try: + config.load_incluster_config() + except Exception as e: + pytest.fail("Error loading the in-cluster config: " + str(e)) + + # query time interval for LA queries + queryTimeInterval = env_dict['DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES'] + if not queryTimeInterval: + pytest.fail("DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES should not be null or empty") + + # get the cluster resource id from replicaset pod envvars + api_instance = client.CoreV1Api() + pod_list = get_pod_list(api_instance, constants.AGENT_RESOURCES_NAMESPACE, + constants.AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR) + + if not pod_list: + pytest.fail("pod_list shouldnt be null or empty") + + if len(pod_list.items) <= 0: + pytest.fail("number of items in pod list should be greater than 0") + + envVars = pod_list.items[0].spec.containers[0].env + if not envVars: + pytest.fail("environment variables should be defined in the replicaset pod") + + clusterResourceId = '' + for env in envVars: + if env.name == "AKS_RESOURCE_ID": + clusterResourceId = env.value + print("cluster resource id: {}".format(clusterResourceId)) + + if not clusterResourceId: + pytest.fail("failed to get clusterResourceId from replicaset pod environment variables") + + # fetch AAD token for log analytics resource for the queries + tenant_id = env_dict.get('TENANT_ID') + authority_uri = env_dict.get('AZURE_ENDPOINTS').get('activeDirectory') + tenant_id + client_id = env_dict.get('CLIENT_ID') + client_secret = env_dict.get('CLIENT_SECRET') + resource = env_dict.get('AZURE_ENDPOINTS').get('logAnalytics') + aad_token = fetch_aad_token(client_id, client_secret, authority_uri, resource) + if not aad_token: + pytest.fail("failed to fetch AAD token") + + access_token = aad_token.get('accessToken') + if not access_token: + pytest.fail("access_token shouldnt be null or empty") + + # validate e2e workflows by checking data in log analytics workspace through resource centric queries + queryUrl = resource + "/v1" + clusterResourceId + "/query" + Headers = { + "Authorization": str("Bearer " + access_token), + "Content-Type": "application/json" + } + # KubePodInventory + query = constants.KUBE_POD_INVENTORY_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('KUBE_POD_INVENTORY')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} and workflow: {1}".format(clusterResourceId, 'KUBE_POD_INVENTORY')) + + # KubeNodeInventory + query = constants.KUBE_NODE_INVENTORY_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + 
pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('KUBE_NODE_INVENTORY')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'KUBE_NODE_INVENTORY')) + + # KubeServices + query = constants.KUBE_SERVICES_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('KUBE_SERVICES')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'KUBE_SERVICES')) + + # KubeEvents + query = constants.KUBE_EVENTS_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('KUBE_EVENTS')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'KUBE_EVENTS')) + + # Container Node Inventory + query = constants.CONTAINER_NODE_INVENTORY_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('CONTAINER_NODE_INVENTORY')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_NODE_INVENTORY')) + + # Node Perf + # cpu capacity + query = constants.NODE_PERF_CPU_CAPCITY_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('NODE_PERF_CPU_CAPCITY')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'NODE_PERF_CPU_CAPCITY')) + + # memory capacity + query = constants.NODE_PERF_MEMORY_CAPCITY_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('NODE_PERF_MEMORY_CAPCITY')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'NODE_PERF_MEMORY_CAPCITY')) + + # cpu allocatable + query = constants.NODE_PERF_CPU_ALLOCATABLE_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('NODE_PERF_CPU_ALLOCATABLE')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than 
for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'NODE_PERF_CPU_ALLOCATABLE')) + + # memory allocatable + query = constants.NODE_PERF_MEMORY_ALLOCATABLE_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('NODE_PERF_MEMORY_ALLOCATABLE')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'NODE_PERF_MEMORY_ALLOCATABLE')) + + # cpu usage + query = constants.NODE_PERF_CPU_USAGE_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('NODE_PERF_CPU_USAGE')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'NODE_PERF_CPU_USAGE')) + + # memory rss usage + query = constants.NODE_PERF_MEMORY_RSS_USAGE_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('NODE_PERF_MEMORY_RSS_USAGE')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'NODE_PERF_MEMORY_RSS_USAGE')) + + # memory ws usage + query = constants.NODE_PERF_MEMORY_WS_USAGE_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('NODE_PERF_MEMORY_WS_USAGE')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'NODE_PERF_MEMORY_WS_USAGE')) + + # restartime epoch + query = constants.NODE_PERF_RESTART_TIME_EPOCH_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('NODE_PERF_RESTART_TIME_EPOCH')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'NODE_PERF_RESTART_TIME_EPOCH')) + + # Container Perf + # container cpu limits + query = constants.CONTAINER_PERF_CPU_LIMITS_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('CONTAINER_PERF_CPU_LIMITS')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_PERF_CPU_LIMITS')) + + # container memory limits + 
query = constants.CONTAINER_PERF_MEMORY_LIMITS_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('CONTAINER_PERF_MEMORY_LIMITS')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_PERF_MEMORY_LIMITS')) + + # cpu requests + query = constants.CONTAINER_PERF_CPU_REQUESTS_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('CONTAINER_PERF_CPU_REQUESTS')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_PERF_CPU_REQUESTS')) + + # memory requests + query = constants.CONTAINER_PERF_MEMORY_REQUESTS_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('CONTAINER_PERF_MEMORY_REQUESTS_QUERY')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_PERF_MEMORY_REQUESTS')) + + # cpu usage + query = constants.CONTAINER_PERF_CPU_USAGE_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('CONTAINER_PERF_CPU_USAGE')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_PERF_CPU_USAGE')) + + # memory rss usage + query = constants.CONTAINER_PERF_MEMORY_RSS_USAGE_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('CONTAINER_PERF_MEMORY_RSS_USAGE')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_PERF_MEMORY_RSS_USAGE')) + + # memory ws usage + query = constants.CONTAINER_PERF_MEMORY_WS_USAGE_QUERY.format(queryTimeInterval) + params = { 'query': query} + result = requests.get(queryUrl, params=params, headers=Headers, verify=False) + if not result: + pytest.fail("log analytics query response shouldnt be null or empty for workflow: {0}".format('CONTAINER_PERF_MEMORY_WS_USAGE')) + + rowCount = result.json()['tables'][0]['rows'][0][0] + if not rowCount: + pytest.fail("rowCount should be greater than for cluster: {0} for workflow: {1} ".format(clusterResourceId, 'CONTAINER_PERF_MEMORY_WS_USAGE')) + + # restart time epoch + query = constants.CONTAINER_PERF_RESTART_TIME_EPOCH_QUERY.format(queryTimeInterval) + 
+    params = {'query': query}
+    result = requests.get(queryUrl, params=params, headers=Headers, verify=False)
+    if not result:
+        pytest.fail("log analytics query response shouldn't be null or empty for workflow: {0}".format('CONTAINER_PERF_RESTART_TIME_EPOCH'))
+
+    rowCount = result.json()['tables'][0]['rows'][0][0]
+    if not rowCount:
+        pytest.fail("rowCount should be greater than 0 for cluster: {0} for workflow: {1}".format(clusterResourceId, 'CONTAINER_PERF_RESTART_TIME_EPOCH'))
+
+    # Container log
+    query = constants.CONTAINER_LOG_QUERY.format(queryTimeInterval)
+    params = {'query': query}
+    result = requests.get(queryUrl, params=params, headers=Headers, verify=False)
+    if not result:
+        pytest.fail("log analytics query response shouldn't be null or empty for workflow: {0}".format('CONTAINER_LOG'))
+
+    rowCount = result.json()['tables'][0]['rows'][0][0]
+    if not rowCount:
+        pytest.fail("rowCount should be greater than 0 for cluster: {0} for workflow: {1}".format(clusterResourceId, 'CONTAINER_LOG'))
+
+    # InsightsMetrics
+    query = constants.INSIGHTS_METRICS_QUERY.format(queryTimeInterval)
+    params = {'query': query}
+    result = requests.get(queryUrl, params=params, headers=Headers, verify=False)
+    if not result:
+        pytest.fail("log analytics query response shouldn't be null or empty for workflow: {0}".format('INSIGHTS_METRICS'))
+
+    rowCount = result.json()['tables'][0]['rows'][0][0]
+    if not rowCount:
+        pytest.fail("rowCount should be greater than 0 for cluster: {0} for workflow: {1}".format(clusterResourceId, 'INSIGHTS_METRICS'))
+
+    append_result_output("test_e2e_workflows end \n", env_dict['TEST_AGENT_LOG_FILE'])
+    print("Successfully completed e2e workflows test.")
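Each workflow above repeats the same query-then-assert sequence against Log Analytics. If the repetition becomes a maintenance burden, a small helper could express the pattern once. This is a minimal sketch rather than part of the change; the helper name `validate_la_row_count` is illustrative, and it assumes, as the blocks above do, that every `constants.*_QUERY` string projects a single row-count value:

```
import pytest
import requests

def validate_la_row_count(query_url, headers, query, workflow, cluster_resource_id):
    # run one Log Analytics query and fail the test unless it returns a non-zero count
    result = requests.get(query_url, params={'query': query}, headers=headers, verify=False)
    if not result:
        pytest.fail("log analytics query response shouldn't be null or empty for workflow: {0}".format(workflow))
    row_count = result.json()['tables'][0]['rows'][0][0]
    if not row_count:
        pytest.fail("rowCount should be greater than 0 for cluster: {0} for workflow: {1}".format(cluster_resource_id, workflow))

# usage inside the test body, e.g.:
# validate_la_row_count(queryUrl, Headers, constants.CONTAINER_LOG_QUERY.format(queryTimeInterval), 'CONTAINER_LOG', clusterResourceId)
```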
diff --git a/test/e2e/src/tests/test_node_metrics_e2e_workflow.py b/test/e2e/src/tests/test_node_metrics_e2e_workflow.py
new file mode 100755
index 000000000..4346f89a8
--- /dev/null
+++ b/test/e2e/src/tests/test_node_metrics_e2e_workflow.py
@@ -0,0 +1,420 @@
+import pytest
+import constants
+import requests
+
+from arm_rest_utility import fetch_aad_token
+from kubernetes import client, config
+from kubernetes_pod_utility import get_pod_list
+from results_utility import append_result_output
+from datetime import datetime, timedelta
+
+pytestmark = pytest.mark.agentests
+
+# validation of node metrics e2e workflow
+def test_node_metrics_e2e_workflow(env_dict):
+    print("Starting node metrics e2e workflow test.")
+    append_result_output("test_node_metrics_e2e_workflow start \n", env_dict['TEST_AGENT_LOG_FILE'])
+    # Loading in-cluster kube-config
+    try:
+        config.load_incluster_config()
+    except Exception as e:
+        pytest.fail("Error loading the in-cluster config: " + str(e))
+
+    # query time interval for metric queries
+    metricQueryIntervalInMins = env_dict['DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES']
+    if not metricQueryIntervalInMins:
+        pytest.fail("DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES should not be null, empty, or 0")
+
+    # get the cluster resource id from the replicaset pod env vars
+    api_instance = client.CoreV1Api()
+    pod_list = get_pod_list(api_instance, constants.AGENT_RESOURCES_NAMESPACE, constants.AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR)
+
+    if not pod_list:
+        pytest.fail("pod_list shouldn't be null or empty")
+
+    if len(pod_list.items) <= 0:
+        pytest.fail("number of items in pod list should be greater than 0")
+
+    envVars = pod_list.items[0].spec.containers[0].env
+    if not envVars:
+        pytest.fail("environment variables should be defined in the replicaset pod")
+
+    clusterResourceId = ''
+    for env in envVars:
+        if env.name == "AKS_RESOURCE_ID":
+            clusterResourceId = env.value
+            print("cluster resource id: {}".format(clusterResourceId))
+
+    if not clusterResourceId:
+        pytest.fail("failed to get clusterResourceId from replicaset pod environment variables")
+
+    # fetch AAD token for metric queries
+    tenant_id = env_dict.get('TENANT_ID')
+    authority_uri = env_dict.get('AZURE_ENDPOINTS').get('activeDirectory') + tenant_id
+    client_id = env_dict.get('CLIENT_ID')
+    client_secret = env_dict.get('CLIENT_SECRET')
+    resourceManager = env_dict.get('AZURE_ENDPOINTS').get('resourceManager')
+    aad_token = fetch_aad_token(client_id, client_secret, authority_uri, resourceManager)
+    if not aad_token:
+        pytest.fail("failed to fetch AAD token")
+
+    access_token = aad_token.get('accessToken')
+    if not access_token:
+        pytest.fail("access_token shouldn't be null or empty")
+
+    # validate metrics e2e workflow
+    now = datetime.utcnow()
+    endtime = now.isoformat()[:-3] + 'Z'
+    starttime = (now - timedelta(minutes=int(metricQueryIntervalInMins))).isoformat()[:-3] + 'Z'
+    Headers = {
+        "Authorization": "Bearer " + access_token,
+        "Content-Type": "application/json",
+        "content-length": "0"
+    }
+    params = {}
+    # node metric - memoryRssBytes
+    custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format(
+        resourceManager.rstrip("/"),
+        clusterResourceId,
+        starttime,
+        endtime,
+        constants.NODE_MEMORY_RSS_METRIC_NAME,
+        constants.NODE_METRIC_METRIC_AGGREGATION,
+        constants.NODE_METRICS_NAMESPACE,
+        constants.METRICS_API_VERSION)
+
+    response = requests.get(custommetricsUrl, params=params, headers=Headers, verify=False)
+
+    if not response:
+        pytest.fail("response of the metrics query API shouldn't be null or empty")
+
+    if response.status_code != 200:
+        pytest.fail("metrics query API failed with an error code: {}".format(response.status_code))
+
+    responseJSON = response.json()
+    if not responseJSON:
+        pytest.fail("response JSON shouldn't be null or empty")
+
+    namespace = responseJSON['namespace']
+    if namespace != constants.NODE_METRICS_NAMESPACE:
+        pytest.fail("got the namespace: {0} but expected namespace: {1} in the response".format(namespace, constants.NODE_METRICS_NAMESPACE))
+
+    responseValues = responseJSON['value']
+    if not responseValues:
+        pytest.fail("value array in the response shouldn't be null or empty")
+
+    if len(responseValues) <= 0:
+        pytest.fail("length of value array in the response should be greater than 0")
+
+    for responseVal in responseValues:
+        metricName = responseVal['name']['value']
+        if metricName != constants.NODE_MEMORY_RSS_METRIC_NAME:
+            pytest.fail("got the metric name: {0} but expected metric name: {1} in the response".format(metricName, constants.NODE_MEMORY_RSS_METRIC_NAME))
+        timeseries = responseVal['timeseries']
+        if not timeseries:
+            pytest.fail("metric series shouldn't be null or empty for metric: {0} in namespace: {1}".format(constants.NODE_MEMORY_RSS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE))
+        if len(timeseries) <= 0:
+            pytest.fail("length of timeseries should be greater than 0 for metric: {0} in namespace: {1}".format(constants.NODE_MEMORY_RSS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE))
+
+    # node metric - memoryRssPercentage
+    custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format(
+        resourceManager.rstrip("/"),
+        clusterResourceId,
+        starttime,
+        endtime,
+        constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME,
+        constants.NODE_METRIC_METRIC_AGGREGATION,
+        constants.NODE_METRICS_NAMESPACE,
+        constants.METRICS_API_VERSION)
+
+    response = requests.get(custommetricsUrl, params=params, headers=Headers, verify=False)
+
+    if not response:
+        pytest.fail("response of the metrics query API shouldn't be null or empty")
+
+    if response.status_code != 200:
+        pytest.fail("metrics query API failed with an error code: {}".format(response.status_code))
+
+    responseJSON = response.json()
+    if not responseJSON:
+        pytest.fail("response JSON shouldn't be null or empty")
+
+    namespace = responseJSON['namespace']
+    if namespace != constants.NODE_METRICS_NAMESPACE:
+        pytest.fail("got the namespace: {0} but expected namespace: {1} in the response".format(namespace, constants.NODE_METRICS_NAMESPACE))
+
+    responseValues = responseJSON['value']
+    if not responseValues:
+        pytest.fail("value array in the response shouldn't be null or empty")
+
+    if len(responseValues) <= 0:
+        pytest.fail("length of value array in the response should be greater than 0")
+
+    for responseVal in responseValues:
+        metricName = responseVal['name']['value']
+        if metricName != constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME:
+            pytest.fail("got the metric name: {0} but expected metric name: {1} in the response".format(metricName, constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME))
+        timeseries = responseVal['timeseries']
+        if not timeseries:
+            pytest.fail("metric series shouldn't be null or empty for metric: {0} in namespace: {1}".format(constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE))
+        if len(timeseries) <= 0:
+            pytest.fail("length of timeseries should be greater than 0 for metric: {0} in namespace: {1}".format(constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE))
+
+    # node metric - memoryWorkingSetBytes
+    custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format(
+        resourceManager.rstrip("/"),
+        clusterResourceId,
+        starttime,
+        endtime,
+        constants.NODE_MEMORY_WS_METRIC_NAME,
+        constants.NODE_METRIC_METRIC_AGGREGATION,
+        constants.NODE_METRICS_NAMESPACE,
+        constants.METRICS_API_VERSION)
+
+    response = requests.get(custommetricsUrl, params=params, headers=Headers, verify=False)
+
+    if not response:
+        pytest.fail("response of the metrics query API shouldn't be null or empty")
+
+    if response.status_code != 200:
+        pytest.fail("metrics query API failed with an error code: {}".format(response.status_code))
+
+    responseJSON = response.json()
+    if not responseJSON:
+        pytest.fail("response JSON shouldn't be null or empty")
+
+    namespace = responseJSON['namespace']
+    if namespace != constants.NODE_METRICS_NAMESPACE:
+        pytest.fail("got the namespace: {0} but expected namespace: {1} in the response".format(namespace, constants.NODE_METRICS_NAMESPACE))
+
+    responseValues = responseJSON['value']
+    if not responseValues:
+        pytest.fail("value array in the response shouldn't be null or empty")
+
+    if len(responseValues) <= 0:
+        pytest.fail("length of value array in the response should be greater than 0")
+
+    for responseVal in responseValues:
+        metricName = responseVal['name']['value']
+        if metricName != constants.NODE_MEMORY_WS_METRIC_NAME:
+            pytest.fail("got the metric name: {0} but expected metric name: {1} in the response".format(metricName, constants.NODE_MEMORY_WS_METRIC_NAME))
+        timeseries = responseVal['timeseries']
+        if not timeseries:
+            pytest.fail("metric series shouldn't be null or empty for metric: {0} in namespace: {1}".format(constants.NODE_MEMORY_WS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE))
+        if len(timeseries) <= 0:
+            pytest.fail("length of timeseries should be greater than 0 for metric: {0} in namespace: {1}".format(constants.NODE_MEMORY_WS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE))
+
+    # node metric - memoryWorkingSetPercentage
+    custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format(
+        resourceManager.rstrip("/"),
+        clusterResourceId,
+        starttime,
+        endtime,
+        constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME,
+        constants.NODE_METRIC_METRIC_AGGREGATION,
+        constants.NODE_METRICS_NAMESPACE,
+        constants.METRICS_API_VERSION)
+
+    response = requests.get(custommetricsUrl, params=params, headers=Headers, verify=False)
+
+    if not response:
+        pytest.fail("response of the metrics query API shouldn't be null or empty")
+
+    if response.status_code != 200:
+        pytest.fail("metrics query API failed with an error code: {}".format(response.status_code))
+
+    responseJSON = response.json()
+    if not responseJSON:
+        pytest.fail("response JSON shouldn't be null or empty")
+
+    namespace = responseJSON['namespace']
+    if namespace != constants.NODE_METRICS_NAMESPACE:
+        pytest.fail("got the namespace: {0} but expected namespace: {1} in the response".format(namespace, constants.NODE_METRICS_NAMESPACE))
+
+    responseValues = responseJSON['value']
+    if not responseValues:
+        pytest.fail("value array in the response shouldn't be null or empty")
+
+    if len(responseValues) <= 0:
+        pytest.fail("length of value array in the response should be greater than 0")
+
+    for responseVal in responseValues:
+        metricName = responseVal['name']['value']
+        if metricName != constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME:
+            pytest.fail("got the metric name: {0} but expected metric name: {1} in the response".format(metricName, constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME))
+        timeseries = responseVal['timeseries']
+        if not timeseries:
+            pytest.fail("metric series shouldn't be null or empty for metric: {0} in namespace: {1}".format(constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE))
+        if len(timeseries) <= 0:
+            pytest.fail("length of timeseries should be greater than 0 for metric: {0} in namespace: {1}".format(constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE))
+
+    # node metric - cpuUsageMilliCores
+    custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format(
+        resourceManager.rstrip("/"),
+        clusterResourceId,
+        starttime,
+        endtime,
+        constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME,
+        constants.NODE_METRIC_METRIC_AGGREGATION,
+        constants.NODE_METRICS_NAMESPACE,
+        constants.METRICS_API_VERSION)
+
+    response = requests.get(custommetricsUrl, params=params, headers=Headers, verify=False)
+
+    if not response:
+        pytest.fail("response of the metrics query API shouldn't be null or empty")
+
+    if response.status_code != 200:
pytest.fail("metrics query API failed with an error code: {}".format(response.status_code)) + + responseJSON = response.json() + if not responseJSON: + pytest.fail("response JSON shouldnt be null or empty") + + namespace = responseJSON['namespace'] + if namespace != constants.NODE_METRICS_NAMESPACE: + pytest.fail("got the namespace: {0} but expected namespace:{1} in the response".format( + namespace, constants.NODE_METRICS_NAMESPACE)) + + responseValues = responseJSON['value'] + if not responseValues: + pytest.fail("response JSON shouldnt be null or empty") + + if len(responseValues) <= 0: + pytest.fail("length of value array in the response should be greater than 0") + + for responseVal in responseValues: + metricName = responseVal['name']['value'] + if metricName != constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME: + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME)) + timeseries = responseVal['timeseries'] + if not timeseries: + pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( + constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + if len(timeseries) <= 0: + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + + # node metric - cpuUsagePercentage + custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( + resourceManager.rstrip("/"), + clusterResourceId, + starttime, + endtime, + constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME, + constants.NODE_METRIC_METRIC_AGGREGATION, + constants.NODE_METRICS_NAMESPACE, + constants.METRICS_API_VERSION) + + response = requests.get(custommetricsUrl, params=params, + headers=Headers, verify=False) + + if not response: + pytest.fail("response of the metrics query API shouldnt be null or empty") + + if response.status_code != 200: + pytest.fail("metrics query API failed with an error code: {}".format(response.status_code)) + + responseJSON = response.json() + if not responseJSON: + pytest.fail("response JSON shouldnt be null or empty") + + namespace = responseJSON['namespace'] + if namespace != constants.NODE_METRICS_NAMESPACE: + pytest.fail("got the namespace: {0} but expected namespace:{1} in the response".format( + namespace, constants.NODE_METRICS_NAMESPACE)) + + responseValues = responseJSON['value'] + if not responseValues: + pytest.fail("response JSON shouldnt be null or empty") + + if len(responseValues) <= 0: + pytest.fail("length of value array in the response should be greater than 0") + + for responseVal in responseValues: + metricName = responseVal['name']['value'] + if metricName != constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME: + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME)) + timeseries = responseVal['timeseries'] + if not timeseries: + pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( + constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + if len(timeseries) <= 0: + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace 
:{1}".format(constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + + # node metric - nodesCount + custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( + resourceManager.rstrip("/"), + clusterResourceId, + starttime, + endtime, + constants.NODE_COUNT_METRIC_NAME, + constants.NODE_METRIC_METRIC_AGGREGATION, + constants.NODE_METRICS_NAMESPACE, + constants.METRICS_API_VERSION) + + response = requests.get(custommetricsUrl, params=params, + headers=Headers, verify=False) + + if not response: + pytest.fail("response of the metrics query API shouldnt be null or empty") + + if response.status_code != 200: + pytest.fail("metrics query API failed with an error code: {}".format(response.status_code)) + + responseJSON = response.json() + if not responseJSON: + pytest.fail("response JSON shouldnt be null or empty") + + namespace = responseJSON['namespace'] + if namespace != constants.NODE_METRICS_NAMESPACE: + pytest.fail("got the namespace: {0} but expected namespace:{1} in the response".format( + namespace, constants.NODE_METRICS_NAMESPACE)) + + responseValues = responseJSON['value'] + if not responseValues: + pytest.fail("response JSON shouldnt be null or empty") + + if len(responseValues) <= 0: + pytest.fail("length of value array in the response should be greater than 0") + + for responseVal in responseValues: + metricName = responseVal['name']['value'] + if metricName != constants.NODE_COUNT_METRIC_NAME: + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_COUNT_METRIC_NAME)) + timeseries = responseVal['timeseries'] + if not timeseries: + pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( + constants.NODE_COUNT_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + if len(timeseries) <= 0: + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_COUNT_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + + append_result_output("test_node_metrics_e2e_workflow end \n", + env_dict['TEST_AGENT_LOG_FILE']) + print("Successfully completed node metrics e2e workflow test.") diff --git a/test/e2e/src/tests/test_pod_metrics_e2e_workflow.py b/test/e2e/src/tests/test_pod_metrics_e2e_workflow.py new file mode 100755 index 000000000..cd4260f76 --- /dev/null +++ b/test/e2e/src/tests/test_pod_metrics_e2e_workflow.py @@ -0,0 +1,134 @@ +import pytest +import constants +import requests + +from arm_rest_utility import fetch_aad_token +from kubernetes import client, config +from kubernetes_pod_utility import get_pod_list +from results_utility import append_result_output +from datetime import datetime, timedelta + +pytestmark = pytest.mark.agentests + +# validation of pod metrics e2e workflows +def test_pod_metrics_e2e_workflow(env_dict): + print("Starting pod metrics e2e workflows test.") + append_result_output("test_pod_metrics_e2e_workflow start \n", + env_dict['TEST_AGENT_LOG_FILE']) + # Loading in-cluster kube-config + try: + config.load_incluster_config() + except Exception as e: + pytest.fail("Error loading the in-cluster config: " + str(e)) + + # query time interval for metrics queries + metricQueryIntervalInMins = env_dict['DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES'] + if not metricQueryIntervalInMins: + pytest.fail( + "DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES 
diff --git a/test/e2e/src/tests/test_pod_metrics_e2e_workflow.py b/test/e2e/src/tests/test_pod_metrics_e2e_workflow.py
new file mode 100755
index 000000000..cd4260f76
--- /dev/null
+++ b/test/e2e/src/tests/test_pod_metrics_e2e_workflow.py
@@ -0,0 +1,134 @@
+import pytest
+import constants
+import requests
+
+from arm_rest_utility import fetch_aad_token
+from kubernetes import client, config
+from kubernetes_pod_utility import get_pod_list
+from results_utility import append_result_output
+from datetime import datetime, timedelta
+
+pytestmark = pytest.mark.agentests
+
+# validation of pod metrics e2e workflow
+def test_pod_metrics_e2e_workflow(env_dict):
+    print("Starting pod metrics e2e workflow test.")
+    append_result_output("test_pod_metrics_e2e_workflow start \n", env_dict['TEST_AGENT_LOG_FILE'])
+    # Loading in-cluster kube-config
+    try:
+        config.load_incluster_config()
+    except Exception as e:
+        pytest.fail("Error loading the in-cluster config: " + str(e))
+
+    # query time interval for metric queries
+    metricQueryIntervalInMins = env_dict['DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES']
+    if not metricQueryIntervalInMins:
+        pytest.fail("DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES should not be null, empty, or 0")
+
+    # get the cluster resource id from the replicaset pod env vars
+    api_instance = client.CoreV1Api()
+    pod_list = get_pod_list(api_instance, constants.AGENT_RESOURCES_NAMESPACE, constants.AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR)
+
+    if not pod_list:
+        pytest.fail("pod_list shouldn't be null or empty")
+
+    if len(pod_list.items) <= 0:
+        pytest.fail("number of items in pod list should be greater than 0")
+
+    envVars = pod_list.items[0].spec.containers[0].env
+    if not envVars:
+        pytest.fail("environment variables should be defined in the replicaset pod")
+
+    clusterResourceId = ''
+    for env in envVars:
+        if env.name == "AKS_RESOURCE_ID":
+            clusterResourceId = env.value
+            print("cluster resource id: {}".format(clusterResourceId))
+
+    if not clusterResourceId:
+        pytest.fail("failed to get clusterResourceId from replicaset pod environment variables")
+
+    # fetch AAD token for metric queries
+    tenant_id = env_dict.get('TENANT_ID')
+    authority_uri = env_dict.get('AZURE_ENDPOINTS').get('activeDirectory') + tenant_id
+    client_id = env_dict.get('CLIENT_ID')
+    client_secret = env_dict.get('CLIENT_SECRET')
+    resourceManager = env_dict.get('AZURE_ENDPOINTS').get('resourceManager')
+    aad_token = fetch_aad_token(client_id, client_secret, authority_uri, resourceManager)
+    if not aad_token:
+        pytest.fail("failed to fetch AAD token")
+
+    access_token = aad_token.get('accessToken')
+    if not access_token:
+        pytest.fail("access_token shouldn't be null or empty")
+
+    # validate metrics e2e workflow
+    now = datetime.utcnow()
+    endtime = now.isoformat()[:-3] + 'Z'
+    starttime = (now - timedelta(minutes=int(metricQueryIntervalInMins))).isoformat()[:-3] + 'Z'
+    Headers = {
+        "Authorization": "Bearer " + access_token,
+        "Content-Type": "application/json",
+        "content-length": "0"
+    }
+    params = {}
+    # pod metric - podCount
+    custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format(
+        resourceManager.rstrip("/"),
+        clusterResourceId,
+        starttime,
+        endtime,
+        constants.POD_COUNT_METRIC_NAME,
+        constants.POD_METRIC_METRIC_AGGREGATION,
+        constants.POD_METRICS_NAMESPACE,
+        constants.METRICS_API_VERSION)
+
+    response = requests.get(custommetricsUrl, params=params, headers=Headers, verify=False)
+
+    if not response:
+        pytest.fail("response of the metrics query API shouldn't be null or empty")
+
+    if response.status_code != 200:
+        pytest.fail("metrics query API failed with an error code: {}".format(response.status_code))
+
+    responseJSON = response.json()
+    if not responseJSON:
+        pytest.fail("response JSON shouldn't be null or empty")
+
+    namespace = responseJSON['namespace']
+    if namespace != constants.POD_METRICS_NAMESPACE:
+        pytest.fail("got the namespace: {0} but expected namespace: {1} in the response".format(namespace, constants.POD_METRICS_NAMESPACE))
+
+    responseValues = responseJSON['value']
+    if not responseValues:
+        pytest.fail("value array in the response shouldn't be null or empty")
+
+    if len(responseValues) <= 0:
+        pytest.fail("length of value array in the response should be greater than 0")
+
+    for responseVal in responseValues:
+        metricName = responseVal['name']['value']
+        if metricName != constants.POD_COUNT_METRIC_NAME:
+            pytest.fail("got the metric name: {0} but expected metric name: {1} in the response".format(metricName, constants.POD_COUNT_METRIC_NAME))
+        timeseries = responseVal['timeseries']
+        if not timeseries:
+            pytest.fail("metric series shouldn't be null or empty for metric: {0} in namespace: {1}".format(constants.POD_COUNT_METRIC_NAME, constants.POD_METRICS_NAMESPACE))
+        if len(timeseries) <= 0:
+            pytest.fail("length of timeseries should be greater than 0 for metric: {0} in namespace: {1}".format(constants.POD_COUNT_METRIC_NAME, constants.POD_METRICS_NAMESPACE))
+
+    append_result_output("test_pod_metrics_e2e_workflow end \n", env_dict['TEST_AGENT_LOG_FILE'])
+    print("Successfully completed pod metrics e2e workflow test.")
diff --git a/test/e2e/src/tests/test_resource_status.py b/test/e2e/src/tests/test_resource_status.py
new file mode 100755
index 000000000..bb63dac7c
--- /dev/null
+++ b/test/e2e/src/tests/test_resource_status.py
@@ -0,0 +1,43 @@
+import pytest
+import constants
+
+from kubernetes import client, config
+from results_utility import append_result_output
+from helper import check_kubernetes_deployment_status
+from helper import check_kubernetes_daemonset_status
+from helper import check_kubernetes_pods_status
+
+pytestmark = pytest.mark.agentests
+
+# validate that all the critical resources, such as the ds, rs, and their pods, are up and running
+def test_resource_status(env_dict):
+    print("Starting resource status check.")
+    append_result_output("test_resource_status start \n", env_dict['TEST_AGENT_LOG_FILE'])
+    # Loading in-cluster kube-config
+    try:
+        config.load_incluster_config()
+        # config.load_kube_config()
+    except Exception as e:
+        pytest.fail("Error loading the in-cluster config: " + str(e))
+
+    # checking the deployment status
+    check_kubernetes_deployment_status(constants.AGENT_RESOURCES_NAMESPACE, constants.AGENT_DEPLOYMENT_NAME, env_dict['TEST_AGENT_LOG_FILE'])
+
+    # checking the daemonset status
+    check_kubernetes_daemonset_status(constants.AGENT_RESOURCES_NAMESPACE, constants.AGENT_DAEMONSET_NAME, env_dict['TEST_AGENT_LOG_FILE'])
+
+    expectedPodRestartCount = env_dict['AGENT_POD_EXPECTED_RESTART_COUNT']
+    # checking deployment pod status
+    check_kubernetes_pods_status(constants.AGENT_RESOURCES_NAMESPACE, constants.AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR, expectedPodRestartCount, env_dict['TEST_AGENT_LOG_FILE'])
+
+    # checking daemonset pod status
+    check_kubernetes_pods_status(constants.AGENT_RESOURCES_NAMESPACE, constants.AGENT_DAEMON_SET_PODS_LABEL_SELECTOR, expectedPodRestartCount, env_dict['TEST_AGENT_LOG_FILE'])
+
+    append_result_output("test_resource_status end \n", env_dict['TEST_AGENT_LOG_FILE'])
+    print("Successfully completed resource status check.")
diff --git a/test/e2e/src/tests/test_rs_workflows.py b/test/e2e/src/tests/test_rs_workflows.py
new file mode 100755
index 000000000..aef422171
--- /dev/null
+++ b/test/e2e/src/tests/test_rs_workflows.py
@@ -0,0 +1,93 @@
+import pytest
+import constants
+
+from kubernetes import client, config
+from kubernetes_pod_utility import get_pod_list, get_log_file_content
+from results_utility import append_result_output
+from helper import check_kubernetes_deployment_status
+from helper import check_kubernetes_daemonset_status
+from helper import check_kubernetes_pods_status
+from kubernetes.stream import stream
+
+pytestmark = pytest.mark.agentests
+
+# validation of replicaset agent workflows
+def test_rs_workflows(env_dict):
+    print("Starting replicaset agent workflows test.")
+    append_result_output("test_rs_workflows start \n", env_dict['TEST_AGENT_LOG_FILE'])
+    # Loading in-cluster kube-config
+    try:
+        config.load_incluster_config()
+    except Exception as e:
+        pytest.fail("Error loading the in-cluster config: " + str(e))
+
+    print("getting pod list")
+    api_instance = client.CoreV1Api()
+    pod_list = get_pod_list(api_instance, constants.AGENT_RESOURCES_NAMESPACE, constants.AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR)
+    if not pod_list:
+        pytest.fail("pod_list shouldn't be null or empty")
+
+    if len(pod_list.items) <= 0:
+        pytest.fail("number of items in pod list should be greater than 0")
+
+    rspodName = pod_list.items[0].metadata.name
+    if not rspodName:
+        pytest.fail("replicaset pod name should not be null or empty")
+
+    logcontent = get_log_file_content(api_instance, constants.AGENT_RESOURCES_NAMESPACE, rspodName, constants.AGENT_OMSAGENT_LOG_PATH)
+    if not logcontent:
+        pytest.fail("logcontent should not be null or empty for rs pod: {}".format(rspodName))
+    loglines = logcontent.split("\n")
+    if len(loglines) <= 0:
+        pytest.fail("number of log lines should be greater than 0")
+
+    IsKubePodInventorySuccessful = False
+    IsKubeNodeInventorySuccessful = False
+    IsKubeDeploymentInventorySuccessful = False
+    IsKubeContainerPerfInventorySuccessful = False
+    IsKubeServicesInventorySuccessful = False
+    IsContainerNodeInventorySuccessful = False
+    IsKubeEventsSuccessful = False
+    for line in loglines:
+        if line.find(constants.KUBE_POD_INVENTORY_EMIT_STREAM) >= 0:
+            IsKubePodInventorySuccessful = True
+        if line.find(constants.KUBE_NODE_INVENTORY_EMIT_STREAM) >= 0:
+            IsKubeNodeInventorySuccessful = True
+        if line.find(constants.KUBE_DEPLOYMENT_INVENTORY_EMIT_STREAM) >= 0:
+            IsKubeDeploymentInventorySuccessful = True
+        if line.find(constants.KUBE_CONTAINER_PERF_EMIT_STREAM) >= 0:
+            IsKubeContainerPerfInventorySuccessful = True
+        if line.find(constants.KUBE_SERVICES_EMIT_STREAM) >= 0:
+            IsKubeServicesInventorySuccessful = True
+        if line.find(constants.KUBE_CONTAINER_NODE_INVENTORY_EMIT_STREAM) >= 0:
+            IsContainerNodeInventorySuccessful = True
+        if line.find(constants.KUBE_EVENTS_EMIT_STREAM) >= 0:
+            IsKubeEventsSuccessful = True
+
+    if not IsKubePodInventorySuccessful:
+        pytest.fail("KubePodInventory stream not emitted successfully from pod: " + rspodName)
+
+    if not IsKubeNodeInventorySuccessful:
+        pytest.fail("KubeNodeInventory stream not emitted successfully from pod: " + rspodName)
+
+    if not IsKubeDeploymentInventorySuccessful:
+        pytest.fail("KubeDeploymentInventory stream not emitted successfully from pod: " + rspodName)
+
+    if not IsKubeContainerPerfInventorySuccessful:
+        pytest.fail("KubeContainerPerfInventory stream not emitted successfully from pod: " + rspodName)
+
+    if not IsKubeServicesInventorySuccessful:
+        pytest.fail("KubeServicesInventory stream not emitted successfully from pod: " + rspodName)
+
+    if not IsContainerNodeInventorySuccessful:
+        pytest.fail("ContainerNodeInventory stream not emitted successfully from pod: " + rspodName)
+
+    if not IsKubeEventsSuccessful:
+        pytest.fail("KubeEvents stream not emitted successfully from rs pod: " + rspodName)
+
+    append_result_output("test_rs_workflows end \n", env_dict['TEST_AGENT_LOG_FILE'])
+    print("Successfully completed replicaset workflows test.")
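The per-stream boolean flags in test_rs_workflows grow by three lines for every new stream. A table-driven variant could report all missing streams in one failure instead of stopping at the first. This is a minimal sketch of a drop-in replacement for the flag block inside the test, assuming the marker constants keep their current names in constants.py:

```
# expected emit-stream markers mapped to human-readable stream names
expected_streams = {
    constants.KUBE_POD_INVENTORY_EMIT_STREAM: "KubePodInventory",
    constants.KUBE_NODE_INVENTORY_EMIT_STREAM: "KubeNodeInventory",
    constants.KUBE_DEPLOYMENT_INVENTORY_EMIT_STREAM: "KubeDeploymentInventory",
    constants.KUBE_CONTAINER_PERF_EMIT_STREAM: "KubeContainerPerfInventory",
    constants.KUBE_SERVICES_EMIT_STREAM: "KubeServicesInventory",
    constants.KUBE_CONTAINER_NODE_INVENTORY_EMIT_STREAM: "ContainerNodeInventory",
    constants.KUBE_EVENTS_EMIT_STREAM: "KubeEvents",
}

# collect every stream whose marker never appears in the pod log
missing = [name for marker, name in expected_streams.items()
           if not any(marker in line for line in loglines)]
if missing:
    pytest.fail("streams not emitted successfully from rs pod {0}: {1}".format(rspodName, ", ".join(missing)))
```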