From 7b8a07ba4d495cbc39cb9a8bc1b54d7037f22f7e Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sat, 23 Oct 2021 23:29:36 -0700 Subject: [PATCH 01/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 275 +++++++++++++++++++++ 1 file changed, 275 insertions(+) create mode 100644 scripts/troubleshoot/troubleshooterrors.sh diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh new file mode 100644 index 000000000..31b804bdd --- /dev/null +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -0,0 +1,275 @@ +#!/bin/bash +# +# This script troubleshoots errors related to onboarding of Azure Monitor for containers to Kubernetes cluster hosted outside and connected to Azure via Azure Arc cluster +# Prerequisites : +# Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest + +# bash troubelshooterror.sh --resource-id --kube-context --cloudName + +set -e +set -o pipefail + +logFile="TroubleshootDump.log" +clusterType="connectedClusters" +extensionInstanceName="azuremonitor-containers" +# resource type for azure log analytics workspace +workspaceResourceProvider="Microsoft.OperationalInsights/workspaces" +workspaceSolutionResourceProvider="Microsoft.OperationsManagement/solutions" + +write_to_log_file() { + echo "$@" + echo "$@" >> $logFile +} + +login_to_azure() { + if [ "$isUsingServicePrincipal" = true ]; then + write_to_log_file "login to the azure using provided service principal creds" + az login --service-principal --username="$servicePrincipalClientId" --password="$servicePrincipalClientSecret" --tenant="$servicePrincipalTenantId" + else + write_to_log_file "login to the azure interactively" + az login --use-device-code + fi +} + +set_azure_subscription() { + local subscriptionId="$(echo ${1})" + write_to_log_file "setting the subscription id: ${subscriptionId} as current subscription for the azure cli" + az account set -s ${subscriptionId} + write_to_log_file "successfully 
configured subscription id: ${subscriptionId} as current subscription for the azure cli" +} + +usage() { + local basename=$(basename $0) + echo + echo "Troubleshooting Errors related to Azure Monitor for containers:" + echo "$basename --resource-id [--kube-context ]" +} + +parse_args() { + + if [ $# -le 1 ]; then + usage + exit 1 + fi + + # Transform long options to short ones + for arg in "$@"; do + shift + case "$arg" in + "--resource-id") set -- "$@" "-r" ;; + "--kube-context") set -- "$@" "-k" ;; + "--"*) usage ;; + *) set -- "$@" "$arg" ;; + esac + done + + local OPTIND opt + + while getopts 'hk:r:' opt; do + case "$opt" in + h) + usage + ;; + + k) + kubeconfigContext="$OPTARG" + write_to_log_file "name of kube-context is $OPTARG" + ;; + + r) + clusterResourceId="$OPTARG" + write_to_log_file "clusterResourceId is $OPTARG" + ;; + + ?) + usage + exit 1 + ;; + esac + done + shift "$(($OPTIND - 1))" + + local subscriptionId="$(echo ${clusterResourceId} | cut -d'/' -f3)" + local resourceGroup="$(echo ${clusterResourceId} | cut -d'/' -f5)" + + # get resource parts and join back to get the provider name + local providerNameResourcePart1="$(echo ${clusterResourceId} | cut -d'/' -f7)" + local providerNameResourcePart2="$(echo ${clusterResourceId} | cut -d'/' -f8)" + local providerName="$(echo ${providerNameResourcePart1}/${providerNameResourcePart2})" + + local clusterName="$(echo ${clusterResourceId} | cut -d'/' -f9)" + + # convert to lowercase for validation + providerName=$(echo $providerName | tr "[:upper:]" "[:lower:]") + + write_to_log_file "cluster SubscriptionId:" $subscriptionId + write_to_log_file "cluster ResourceGroup:" $resourceGroup + write_to_log_file "cluster ProviderName:" $providerName + write_to_log_file "cluster Name:" $clusterName + + if [ -z "$subscriptionId" -o -z "$resourceGroup" -o -z "$providerName" -o -z "$clusterName" ]; then + write_to_log_file "-e invalid cluster resource id. 
Please try with valid fully qualified resource id of the cluster" + exit 1 + fi + + if [[ $providerName != microsoft.* ]]; then + write_to_log_file "-e invalid azure cluster resource id format." + exit 1 + fi + + # detect the resource provider from the provider name in the cluster resource id + if [ $providerName = "microsoft.kubernetes/connectedclusters" ]; then + write_to_log_file "provider cluster resource is of Azure Arc enabled Kubernetes cluster type" + isArcK8sCluster=true + resourceProvider=$arcK8sResourceProvider + else + write_to_log_file "-e not valid azure arc enabled kubernetes cluster resource id" + exit 1 + fi + + if [ -z "$kubeconfigContext" ]; then + write_to_log_file "using or getting current kube config context since --kube-context parameter not set " + fi + + if [ ! -z "$servicePrincipalClientId" -a ! -z "$servicePrincipalClientSecret" -a ! -z "$servicePrincipalTenantId" ]; then + write_to_log_file "using service principal creds (clientId, secret and tenantId) for azure login since provided" + isUsingServicePrincipal=true + fi +} + +command_exists() { + command -v "$@" > /dev/null 2>&1 +} + +validate_ci_extension () { + extension=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName) + write_to_log_file $extension + configurationSettings=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "configurationSettings.logAnalyticsWorkspaceResourceID") + if [ -z "$configurationSettings" ]; then + write_to_log_file "-e error configurationSettings either null or empty" + exit 1 + fi + logAnalyticsWorkspaceResourceID=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "configurationSettings.logAnalyticsWorkspaceResourceID") + if [ -z "$logAnalyticsWorkspaceResourceID" ]; then + write_to_log_file "-e error logAnalyticsWorkspaceResourceID either null or empty in the config settings" + exit 1 + fi + + provisioningState=$(az k8s-extension show -c ${4} -g 
${3} -t $clusterType -n $extensionInstanceName --query "provisioningState") + if [ -z "$provisioningState" ]; then + write_to_log_file "-e error provisioningState either null or empty in the config settings" + exit 1 + fi + if [ $provisioningState = "Succeeded" ]; then + write_to_log_file "-e error expected state of extension provisioningState MUST be Succeeded state but actual state is ${provisioningState}" + exit 1 + fi + logAnalyticsWorkspaceDomain=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query 'configurationSettings."omsagent.domain"') + if [ -z "$logAnalyticsWorkspaceDomain" ]; then + write_to_log_file "-e error logAnalyticsWorkspaceDomain either null or empty in the config settings" + exit 1 + fi + azureCloudName=${1} + if [ "$azureCloudName" = "azureusgovernment" ]; then + if [ $logAnalyticsWorkspaceDomain = "opinsights.azure.us" ]; then + write_to_log_file "-e error expected value of logAnalyticsWorkspaceDomain MUST opinsights.azure.us but actual value is ${logAnalyticsWorkspaceDomain}" + exit 1 + fi + elif [ "$azureCloudName" = "azurecloud" ]; then + if [ $logAnalyticsWorkspaceDomain = "opinsights.azure.com" ]; then + write_to_log_file "-e error expected value of logAnalyticsWorkspaceDomain MUST opinsights.azure.com but actual value is ${logAnalyticsWorkspaceDomain}" + exit 1 + fi + elif [ "$azureCloudName" = "azurechinacloud" ]; then + if [ $logAnalyticsWorkspaceDomain = "opinsights.azure.cn" ]; then + write_to_log_file "-e error expected value of logAnalyticsWorkspaceDomain MUST opinsights.azure.cn but actual value is ${logAnalyticsWorkspaceDomain}" + exit 1 + fi + fi + + workspaceSubscriptionId="$(echo ${logAnalyticsWorkspaceResourceID} | cut -d'/' -f3 | tr "[:upper:]" "[:lower:]")" + workspaceResourceGroup="$(echo ${logAnalyticsWorkspaceResourceID} | cut -d'/' -f5)" + workspaceName="$(echo ${logAnalyticsWorkspaceResourceID} | cut -d'/' -f9)" + + clusterSubscriptionId=${2} + # set the azure subscription to 
azure cli if the workspace in different sub than cluster + if [[ "$clusterSubscriptionId" != "$workspaceSubscriptionId" ]]; then + write_to_log_file "switch subscription id of workspace as active subscription for azure cli since workspace in different subscription than cluster: ${workspaceSubscriptionId}" + isClusterAndWorkspaceInSameSubscription=false + set_azure_subscription $workspaceSubscriptionId + fi + workspaceList=$(az resource list -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider) + if [ "$workspaceList" = "[]" ]; then + write_to_log_file "-e error workspace:${logAnalyticsWorkspaceResourceID} doesnt exist" + exit 1 + fi + + ciSolutionResourceId="/subscriptions/${workspaceSubscriptionId}/resourceGroups/${workspaceResourceGroup}/Microsoft.OperationsManagement/solutions/ContainerInsights(${workspaceName})" + ciSolutionResourceName=$(az resource show --ids "$ciSolutionResourceId" --query name) + if [[ "$ciSolutionResourceName" != "ContainerInsights(${workspaceName})" ]]; then + write_to_log_file "-e error ContainerInsights solution on workspace ${logAnalyticsWorkspaceResourceID} doesnt exist" + exit 1 + fi + + publicNetworkAccessForIngestion=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.publicNetworkAccessForIngestion) + write_to_log_file "workspace publicNetworkAccessForIngestion: ${publicNetworkAccessForIngestion}" + if [[ "$publicNetworkAccessForIngestion" != "Enabled" ]]; then + write_to_log_file "-e error Unless private link configured, publicNetworkAccessForIngestion MUST be enabled for data ingestion" + exit 1 + fi + publicNetworkAccessForQuery=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.publicNetworkAccessForQuery) + write_to_log_file "workspace publicNetworkAccessForQuery: ${publicNetworkAccessForQuery}" + if [[ "$publicNetworkAccessForIngestion" != "Enabled" ]]; then + write_to_log_file "-e error Unless private link configured, 
publicNetworkAccessForQuery MUST be enabled for data query" + exit 1 + fi + + workspaceCappingDailyQuotaGb=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.workspaceCapping.dailyQuotaGb) + write_to_log_file "workspaceCapping dailyQuotaGb: ${workspaceCappingDailyQuotaGb}" + if [[ "$workspaceCappingDailyQuotaGb" != "1.0" ]]; then + write_to_log_file "-e error workspace configured daily quota and verify ingestion data reaching over the quota: ${workspaceCappingDailyQuotaGb}" + exit 1 + fi +} + +if command_exists az; then + write_to_log_file "detected azure cli installed" + azCLIVersion=$(az -v) + write_to_log_file "azure-cli version: ${azCLIVersion}" + azCLIExtension=$(az extension list --query "[?name=='k8s-extension'].name | [0]") + if [ $azCLIExtension = "k8s-extension" ]; then + azCLIExtensionVersion=$(az extension list --query "[?name=='k8s-extension'].version | [0]") + write_to_log_file "detected k8s-extension and current installed version: ${azCLIExtensionVersion}" + az extension update --name 'k8s-extension' + else + write_to_log_file "adding k8s-extension since k8s-extension doesnt exist as installed" + az extension add --name 'k8s-extension' + fi + azCLIExtensionVersion=$(az extension list --query "[?name=='k8s-extension'].version | [0]") + write_to_log_file "current installed k8s-extension version: ${azCLIExtensionVersion}" +else + write_to_log_file "-e error azure cli doesnt exist as installed" + write_to_log_file "Please install Azure-CLI as per the instructions https://docs.microsoft.com/en-us/cli/azure/install-azure-cli and rerun the troubleshooting script" + exit 1 +fi + +# parse and validate args +parse_args $@ + +# parse cluster resource id +clusterSubscriptionId="$(echo $clusterResourceId | cut -d'/' -f3 | tr "[:upper:]" "[:lower:]")" +clusterResourceGroup="$(echo $clusterResourceId | cut -d'/' -f5)" +providerName="$(echo $clusterResourceId | cut -d'/' -f7)" +clusterName="$(echo $clusterResourceId | cut -d'/' -f9)" + 
+azureCloudName=$(az cloud show --query name -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") +write_to_log_file "azure cloud name: ${azureCloudName}" + +# login to azure interactively +login_to_azure + +# set the cluster subscription id as active sub for azure cli +set_azure_subscription $clusterSubscriptionId + +#validate ci extension +validate_ci_extension $azureCloudName $clusterSubscriptionId $clusterResourceGroup $clusterName \ No newline at end of file From ba9f8909de58a7a97ca6036aee868fcd19532ab4 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 24 Oct 2021 12:30:51 -0700 Subject: [PATCH 02/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 135 ++++++++++++++------- 1 file changed, 88 insertions(+), 47 deletions(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index 31b804bdd..b1661a683 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -15,27 +15,35 @@ extensionInstanceName="azuremonitor-containers" # resource type for azure log analytics workspace workspaceResourceProvider="Microsoft.OperationalInsights/workspaces" workspaceSolutionResourceProvider="Microsoft.OperationsManagement/solutions" - -write_to_log_file() { +contactUSMessage="Please contact us by emailing askcoin@microsoft.com if you need any help with this script captured logs" +dataCapHelpMessage="Please review and increase data cap https://docs.microsoft.com/en-us/azure/azure-monitor/logs/manage-cost-storage" +workspacePrivateLinkMessage="Please review this doc https://docs.microsoft.com/en-us/azure/azure-monitor/logs/private-link-security" +azureCLIInstallLinkMessage="Please install Azure-CLI as per the instructions https://docs.microsoft.com/en-us/cli/azure/install-azure-cli and rerun the troubleshooting script" +kubectlInstallLinkMessage="Please install kubectl as per the instructions https://kubernetes.io/docs/tasks/tools/#kubectl and rerun the 
troubleshooting script" +jqInstallLinkMessage="Please install jq as per instructions https://stedolan.github.io/jq/download/ and rerun the troubleshooting script" + +log_message() { echo "$@" + echo "" echo "$@" >> $logFile } + login_to_azure() { if [ "$isUsingServicePrincipal" = true ]; then - write_to_log_file "login to the azure using provided service principal creds" + log_message "login to the azure using provided service principal creds" az login --service-principal --username="$servicePrincipalClientId" --password="$servicePrincipalClientSecret" --tenant="$servicePrincipalTenantId" else - write_to_log_file "login to the azure interactively" + log_message "login to the azure interactively" az login --use-device-code fi } set_azure_subscription() { local subscriptionId="$(echo ${1})" - write_to_log_file "setting the subscription id: ${subscriptionId} as current subscription for the azure cli" + log_message "setting the subscription id: ${subscriptionId} as current subscription for the azure cli" az account set -s ${subscriptionId} - write_to_log_file "successfully configured subscription id: ${subscriptionId} as current subscription for the azure cli" + log_message "successfully configured subscription id: ${subscriptionId} as current subscription for the azure cli" } usage() { @@ -73,12 +81,12 @@ parse_args() { k) kubeconfigContext="$OPTARG" - write_to_log_file "name of kube-context is $OPTARG" + log_message "name of kube-context is $OPTARG" ;; r) clusterResourceId="$OPTARG" - write_to_log_file "clusterResourceId is $OPTARG" + log_message "clusterResourceId is $OPTARG" ;; ?) 
@@ -102,37 +110,37 @@ parse_args() { # convert to lowercase for validation providerName=$(echo $providerName | tr "[:upper:]" "[:lower:]") - write_to_log_file "cluster SubscriptionId:" $subscriptionId - write_to_log_file "cluster ResourceGroup:" $resourceGroup - write_to_log_file "cluster ProviderName:" $providerName - write_to_log_file "cluster Name:" $clusterName + log_message "cluster SubscriptionId:" $subscriptionId + log_message "cluster ResourceGroup:" $resourceGroup + log_message "cluster ProviderName:" $providerName + log_message "cluster Name:" $clusterName if [ -z "$subscriptionId" -o -z "$resourceGroup" -o -z "$providerName" -o -z "$clusterName" ]; then - write_to_log_file "-e invalid cluster resource id. Please try with valid fully qualified resource id of the cluster" + log_message "-e invalid cluster resource id. Please try with valid fully qualified resource id of the cluster" exit 1 fi if [[ $providerName != microsoft.* ]]; then - write_to_log_file "-e invalid azure cluster resource id format." + log_message "-e invalid azure cluster resource id format." exit 1 fi # detect the resource provider from the provider name in the cluster resource id if [ $providerName = "microsoft.kubernetes/connectedclusters" ]; then - write_to_log_file "provider cluster resource is of Azure Arc enabled Kubernetes cluster type" + log_message "provider cluster resource is of Azure Arc enabled Kubernetes cluster type" isArcK8sCluster=true resourceProvider=$arcK8sResourceProvider else - write_to_log_file "-e not valid azure arc enabled kubernetes cluster resource id" + log_message "-e not valid azure arc enabled kubernetes cluster resource id" exit 1 fi if [ -z "$kubeconfigContext" ]; then - write_to_log_file "using or getting current kube config context since --kube-context parameter not set " + log_message "using or getting current kube config context since --kube-context parameter not set " fi if [ ! -z "$servicePrincipalClientId" -a ! 
-z "$servicePrincipalClientSecret" -a ! -z "$servicePrincipalTenantId" ]; then - write_to_log_file "using service principal creds (clientId, secret and tenantId) for azure login since provided" + log_message "using service principal creds (clientId, secret and tenantId) for azure login since provided" isUsingServicePrincipal=true fi } @@ -141,48 +149,56 @@ command_exists() { command -v "$@" > /dev/null 2>&1 } -validate_ci_extension () { +validate_ci_extension() { extension=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName) - write_to_log_file $extension + log_message $extension configurationSettings=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "configurationSettings.logAnalyticsWorkspaceResourceID") if [ -z "$configurationSettings" ]; then - write_to_log_file "-e error configurationSettings either null or empty" + log_message "-e error configurationSettings either null or empty" + log_message ${contactUSMessage} exit 1 fi logAnalyticsWorkspaceResourceID=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "configurationSettings.logAnalyticsWorkspaceResourceID") if [ -z "$logAnalyticsWorkspaceResourceID" ]; then - write_to_log_file "-e error logAnalyticsWorkspaceResourceID either null or empty in the config settings" + log_message "-e error logAnalyticsWorkspaceResourceID either null or empty in the config settings" + log_message ${contactUSMessage} exit 1 fi provisioningState=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "provisioningState") if [ -z "$provisioningState" ]; then - write_to_log_file "-e error provisioningState either null or empty in the config settings" + log_message "-e error provisioningState either null or empty in the config settings" + log_message ${contactUSMessage} exit 1 fi if [ $provisioningState = "Succeeded" ]; then - write_to_log_file "-e error expected state of extension 
provisioningState MUST be Succeeded state but actual state is ${provisioningState}" + log_message "-e error expected state of extension provisioningState MUST be Succeeded state but actual state is ${provisioningState}" + log_message ${contactUSMessage} exit 1 fi logAnalyticsWorkspaceDomain=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query 'configurationSettings."omsagent.domain"') if [ -z "$logAnalyticsWorkspaceDomain" ]; then - write_to_log_file "-e error logAnalyticsWorkspaceDomain either null or empty in the config settings" + log_message "-e error logAnalyticsWorkspaceDomain either null or empty in the config settings" + log_message ${contactUSMessage} exit 1 fi azureCloudName=${1} if [ "$azureCloudName" = "azureusgovernment" ]; then if [ $logAnalyticsWorkspaceDomain = "opinsights.azure.us" ]; then - write_to_log_file "-e error expected value of logAnalyticsWorkspaceDomain MUST opinsights.azure.us but actual value is ${logAnalyticsWorkspaceDomain}" + log_message "-e error expected value of logAnalyticsWorkspaceDomain MUST opinsights.azure.us but actual value is ${logAnalyticsWorkspaceDomain}" + log_message ${contactUSMessage} exit 1 fi elif [ "$azureCloudName" = "azurecloud" ]; then if [ $logAnalyticsWorkspaceDomain = "opinsights.azure.com" ]; then - write_to_log_file "-e error expected value of logAnalyticsWorkspaceDomain MUST opinsights.azure.com but actual value is ${logAnalyticsWorkspaceDomain}" + log_message "-e error expected value of logAnalyticsWorkspaceDomain MUST opinsights.azure.com but actual value is ${logAnalyticsWorkspaceDomain}" + log_message ${contactUSMessage} exit 1 fi elif [ "$azureCloudName" = "azurechinacloud" ]; then if [ $logAnalyticsWorkspaceDomain = "opinsights.azure.cn" ]; then - write_to_log_file "-e error expected value of logAnalyticsWorkspaceDomain MUST opinsights.azure.cn but actual value is ${logAnalyticsWorkspaceDomain}" + log_message "-e error expected value of 
logAnalyticsWorkspaceDomain MUST opinsights.azure.cn but actual value is ${logAnalyticsWorkspaceDomain}" + log_message ${contactUSMessage} exit 1 fi fi @@ -194,62 +210,70 @@ validate_ci_extension () { clusterSubscriptionId=${2} # set the azure subscription to azure cli if the workspace in different sub than cluster if [[ "$clusterSubscriptionId" != "$workspaceSubscriptionId" ]]; then - write_to_log_file "switch subscription id of workspace as active subscription for azure cli since workspace in different subscription than cluster: ${workspaceSubscriptionId}" + log_message "switch subscription id of workspace as active subscription for azure cli since workspace in different subscription than cluster: ${workspaceSubscriptionId}" isClusterAndWorkspaceInSameSubscription=false set_azure_subscription $workspaceSubscriptionId fi workspaceList=$(az resource list -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider) if [ "$workspaceList" = "[]" ]; then - write_to_log_file "-e error workspace:${logAnalyticsWorkspaceResourceID} doesnt exist" + log_message "-e error workspace:${logAnalyticsWorkspaceResourceID} doesnt exist" exit 1 fi ciSolutionResourceId="/subscriptions/${workspaceSubscriptionId}/resourceGroups/${workspaceResourceGroup}/Microsoft.OperationsManagement/solutions/ContainerInsights(${workspaceName})" ciSolutionResourceName=$(az resource show --ids "$ciSolutionResourceId" --query name) if [[ "$ciSolutionResourceName" != "ContainerInsights(${workspaceName})" ]]; then - write_to_log_file "-e error ContainerInsights solution on workspace ${logAnalyticsWorkspaceResourceID} doesnt exist" + log_message "-e error ContainerInsights solution on workspace ${logAnalyticsWorkspaceResourceID} doesnt exist" + log_message ${contactUSMessage} exit 1 fi publicNetworkAccessForIngestion=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.publicNetworkAccessForIngestion) - write_to_log_file "workspace 
publicNetworkAccessForIngestion: ${publicNetworkAccessForIngestion}" + log_message "workspace publicNetworkAccessForIngestion: ${publicNetworkAccessForIngestion}" if [[ "$publicNetworkAccessForIngestion" != "Enabled" ]]; then - write_to_log_file "-e error Unless private link configured, publicNetworkAccessForIngestion MUST be enabled for data ingestion" + log_message "-e error Unless private link configured, publicNetworkAccessForIngestion MUST be enabled for data ingestion" + log_message ${workspacePrivateLinkMessage} exit 1 fi publicNetworkAccessForQuery=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.publicNetworkAccessForQuery) - write_to_log_file "workspace publicNetworkAccessForQuery: ${publicNetworkAccessForQuery}" + log_message "workspace publicNetworkAccessForQuery: ${publicNetworkAccessForQuery}" if [[ "$publicNetworkAccessForIngestion" != "Enabled" ]]; then - write_to_log_file "-e error Unless private link configured, publicNetworkAccessForQuery MUST be enabled for data query" + log_message "-e error Unless private link configured, publicNetworkAccessForQuery MUST be enabled for data query" + log_message ${workspacePrivateLinkMessage} exit 1 fi workspaceCappingDailyQuotaGb=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.workspaceCapping.dailyQuotaGb) - write_to_log_file "workspaceCapping dailyQuotaGb: ${workspaceCappingDailyQuotaGb}" + log_message "workspaceCapping dailyQuotaGb: ${workspaceCappingDailyQuotaGb}" if [[ "$workspaceCappingDailyQuotaGb" != "1.0" ]]; then - write_to_log_file "-e error workspace configured daily quota and verify ingestion data reaching over the quota: ${workspaceCappingDailyQuotaGb}" + log_message "-e error workspace configured daily quota and verify ingestion data reaching over the quota: ${workspaceCappingDailyQuotaGb}" + log_message ${dataCapHelpMessage} exit 1 fi } +validate_ci_agent_pods() { + +} + if command_exists az; then - write_to_log_file "detected azure 
cli installed" + log_message "detected azure cli installed" azCLIVersion=$(az -v) - write_to_log_file "azure-cli version: ${azCLIVersion}" + log_message "azure-cli version: ${azCLIVersion}" azCLIExtension=$(az extension list --query "[?name=='k8s-extension'].name | [0]") if [ $azCLIExtension = "k8s-extension" ]; then azCLIExtensionVersion=$(az extension list --query "[?name=='k8s-extension'].version | [0]") - write_to_log_file "detected k8s-extension and current installed version: ${azCLIExtensionVersion}" + log_message "detected k8s-extension and current installed version: ${azCLIExtensionVersion}" az extension update --name 'k8s-extension' else - write_to_log_file "adding k8s-extension since k8s-extension doesnt exist as installed" + log_message "adding k8s-extension since k8s-extension doesnt exist as installed" az extension add --name 'k8s-extension' fi azCLIExtensionVersion=$(az extension list --query "[?name=='k8s-extension'].version | [0]") - write_to_log_file "current installed k8s-extension version: ${azCLIExtensionVersion}" + log_message "current installed k8s-extension version: ${azCLIExtensionVersion}" else - write_to_log_file "-e error azure cli doesnt exist as installed" - write_to_log_file "Please install Azure-CLI as per the instructions https://docs.microsoft.com/en-us/cli/azure/install-azure-cli and rerun the troubleshooting script" + log_message "-e error azure cli doesnt exist as installed" + log_message ${azureCLIInstallLinkMessage} exit 1 fi @@ -263,7 +287,7 @@ providerName="$(echo $clusterResourceId | cut -d'/' -f7)" clusterName="$(echo $clusterResourceId | cut -d'/' -f9)" azureCloudName=$(az cloud show --query name -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") -write_to_log_file "azure cloud name: ${azureCloudName}" +log_message "azure cloud name: ${azureCloudName}" # login to azure interactively login_to_azure @@ -271,5 +295,22 @@ login_to_azure # set the cluster subscription id as active sub for azure cli set_azure_subscription 
$clusterSubscriptionId -#validate ci extension -validate_ci_extension $azureCloudName $clusterSubscriptionId $clusterResourceGroup $clusterName \ No newline at end of file +# validate ci extension +validate_ci_extension $azureCloudName $clusterSubscriptionId $clusterResourceGroup $clusterName + +# validate ci agent pods +if command_exists kubectl; then + if command_exists jq; then + log_message "-e error jq doesnt exist as installed" + log_message $jqInstallLinkMessage + exit 1 + fi + validate_ci_agent_pods +else + log_message "-e error kubectl doesnt exist as installed" + log_message ${kubectlInstallLinkMessage} + exit 1 +fi + +log_message "Everything looks good according to this script." +log_message $contactUSMessage From 8f3b942a08db591780f0d356caf1f2f92e4bc0e3 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 24 Oct 2021 14:27:13 -0700 Subject: [PATCH 03/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 121 ++++++++++++++++----- 1 file changed, 96 insertions(+), 25 deletions(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index b1661a683..b7f978b35 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -15,12 +15,19 @@ extensionInstanceName="azuremonitor-containers" # resource type for azure log analytics workspace workspaceResourceProvider="Microsoft.OperationalInsights/workspaces" workspaceSolutionResourceProvider="Microsoft.OperationsManagement/solutions" +agentK8sNamespace="kube-system" +agentK8sSecretName="omsagent-secret" +agentK8sDeploymentName="omsagent-rs" +agentK8sLinuxDaemonsetName="omsagent" +workspaceId="" +workspacePrimarySharedKey="" contactUSMessage="Please contact us by emailing askcoin@microsoft.com if you need any help with this script captured logs" dataCapHelpMessage="Please review and increase data cap https://docs.microsoft.com/en-us/azure/azure-monitor/logs/manage-cost-storage" 
workspacePrivateLinkMessage="Please review this doc https://docs.microsoft.com/en-us/azure/azure-monitor/logs/private-link-security" azureCLIInstallLinkMessage="Please install Azure-CLI as per the instructions https://docs.microsoft.com/en-us/cli/azure/install-azure-cli and rerun the troubleshooting script" kubectlInstallLinkMessage="Please install kubectl as per the instructions https://kubernetes.io/docs/tasks/tools/#kubectl and rerun the troubleshooting script" jqInstallLinkMessage="Please install jq as per instructions https://stedolan.github.io/jq/download/ and rerun the troubleshooting script" +ciExtensionReOnboarding="Please reinstall extension as per instructions https://docs.microsoft.com/en-us/azure/azure-monitor/containers/container-insights-enable-arc-enabled-clusters?toc=/azure/azure-arc/kubernetes/toc.json" log_message() { echo "$@" @@ -172,7 +179,7 @@ validate_ci_extension() { exit 1 fi if [ $provisioningState = "Succeeded" ]; then - log_message "-e error expected state of extension provisioningState MUST be Succeeded state but actual state is ${provisioningState}" + log_message "-e error expected state of extension provisioningState MUST be Succeeded state but actual state is ${provisioningState}" log_message ${contactUSMessage} exit 1 fi @@ -250,32 +257,96 @@ validate_ci_extension() { log_message ${dataCapHelpMessage} exit 1 fi + + + workspaceId=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.customerId) + log_message "workspaceId: ${workspaceId}" + + workspaceKey=$(az rest --method post --uri $logAnalyticsWorkspaceResourceID/sharedKeys?api-version=2015-11-01-preview --query primarySharedKey -o json) + workspacePrimarySharedKey=$(echo $workspaceKey | tr -d '"') +} + +validate_az_cli_installed_or_not() { + if command_exists az; then + log_message "detected azure cli installed" + azCLIVersion=$(az -v) + log_message "azure-cli version: ${azCLIVersion}" + azCLIExtension=$(az extension list --query 
"[?name=='k8s-extension'].name | [0]") + if [ $azCLIExtension = "k8s-extension" ]; then + azCLIExtensionVersion=$(az extension list --query "[?name=='k8s-extension'].version | [0]") + log_message "detected k8s-extension and current installed version: ${azCLIExtensionVersion}" + log_message "updating the k8s-extension version to latest available one" + az extension update --name 'k8s-extension' + else + log_message "adding k8s-extension since k8s-extension doesnt exist as installed" + az extension add --name 'k8s-extension' + fi + azCLIExtensionVersion=$(az extension list --query "[?name=='k8s-extension'].version | [0]") + log_message "current installed k8s-extension version: ${azCLIExtensionVersion}" + else + log_message "-e error azure cli doesnt exist as installed" + log_message ${azureCLIInstallLinkMessage} + exit 1 + fi } -validate_ci_agent_pods() { +validate_ci_agent_pods() { + # verify the id and key of the workspace matches with workspace key value in the secret + wsID=$(kubectl get secrets ${omsagent-secret} -n ${agentK8sNamespace} -o json | jq -r ".data.WSID") + wsID=$(echo $wsID | base64 -d) + + wsKEY=$(kubectl get secrets ${omsagent-secret} -n ${agentK8sNamespace} -o json | jq -r ".data.KEY") + wsKEY=$(echo $wsKEY | base64 -d) + + if [[ "$workspaceId" != "$wsID" ]]; then + log_message "-e error workspaceId: ${workspaceID} of the workspace doesnt match with workspaceId: ${wsID} value in the omsagent secret" + log_message $ciExtensionReOnboarding + exit 1 + fi + if [[ "$workspacePrimarySharedKey" != "$wsKEY" ]]; then + log_message "-e error workspacePrimarySharedKey of the workspace doesnt match with workspacekey value value in the omsagent secret" + log_message $ciExtensionReOnboarding + exit 1 + fi + + # verify state of agent deployment + readyReplicas=$(kubectl get deployments -n kube-system ${agentK8sDeploymentName} -o json | jq '.status.readyReplicas') + if [[ "$readyReplicas" != "1" ]]; then + log_message "-e error number of readyReplicas of agent 
deployment MUST be 1" + exit 1 + fi + replicas=$(kubectl get deployments -n kube-system ${agentK8sDeploymentName} -o json | jq '.status.replicas') + if [[ "$replicas" != "1" ]]; then + log_message "-e error number of replicas of agent deployment MUST be 1" + exit 1 + fi + + # verify state of agent ds + currentNumberScheduled=$(kubectl get ds -n kube-system ${agentK8sLinuxDaemonsetName} -o json | jq '.status.currentNumberScheduled') + desiredNumberScheduled=$(kubectl get ds -n kube-system ${agentK8sLinuxDaemonsetName} -o json | jq '.status.desiredNumberScheduled') + if [[ "$currentNumberScheduled" != "$desiredNumberScheduled" ]]; then + log_message "-e error desiredNumberScheduled: ${desiredNumberScheduled} doesnt match with currentNumberScheduled: ${currentNumberScheduled}" + log_message "-e error please fix the pod scheduling issues of omsagent daemonset pods in namespace: ${agentK8sNamespace}" + exit 1 + fi + + numberAvailable=$(kubectl get ds -n kube-system ${agentK8sLinuxDaemonsetName} -o json | jq '.status.numberAvailable') + if [[ "$numberAvailable" != "$currentNumberScheduled" ]]; then + log_message "-e error numberAvailable: ${numberAvailable} doesnt match with currentNumberScheduled: ${currentNumberScheduled}" + log_message "-e error please fix the pod scheduling issues of omsagent daemonset pods in namespace: ${agentK8sNamespace}" + exit 1 + fi + numberReady=$(kubectl get ds -n kube-system ${agentK8sLinuxDaemonsetName} -o json | jq '.status.numberReady') + if [[ "$numberAvailable" != "$numberReady" ]]; then + log_message "-e error numberAvailable: ${numberAvailable} doesnt match with numberReady: ${numberReady}" + log_message "-e error please fix the pod scheduling issues of omsagent daemonset pods in namespace: ${agentK8sNamespace}" + exit 1 + fi } -if command_exists az; then - log_message "detected azure cli installed" - azCLIVersion=$(az -v) - log_message "azure-cli version: ${azCLIVersion}" - azCLIExtension=$(az extension list --query 
"[?name=='k8s-extension'].name | [0]") - if [ $azCLIExtension = "k8s-extension" ]; then - azCLIExtensionVersion=$(az extension list --query "[?name=='k8s-extension'].version | [0]") - log_message "detected k8s-extension and current installed version: ${azCLIExtensionVersion}" - az extension update --name 'k8s-extension' - else - log_message "adding k8s-extension since k8s-extension doesnt exist as installed" - az extension add --name 'k8s-extension' - fi - azCLIExtensionVersion=$(az extension list --query "[?name=='k8s-extension'].version | [0]") - log_message "current installed k8s-extension version: ${azCLIExtensionVersion}" -else - log_message "-e error azure cli doesnt exist as installed" - log_message ${azureCLIInstallLinkMessage} - exit 1 -fi +# verify azure cli installed or not +validate_az_cli_installed_or_not # parse and validate args parse_args $@ @@ -300,13 +371,13 @@ validate_ci_extension $azureCloudName $clusterSubscriptionId $clusterResourceGro # validate ci agent pods if command_exists kubectl; then - if command_exists jq; then + if command_exists jq; then log_message "-e error jq doesnt exist as installed" log_message $jqInstallLinkMessage exit 1 fi - validate_ci_agent_pods -else + validate_ci_agent_pods +else log_message "-e error kubectl doesnt exist as installed" log_message ${kubectlInstallLinkMessage} exit 1 From b3666f49c2576929bcf4416f2908b2ddc924ad22 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 24 Oct 2021 16:51:33 -0700 Subject: [PATCH 04/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index b7f978b35..6b7b469ca 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -178,7 +178,7 @@ validate_ci_extension() { log_message ${contactUSMessage} exit 1 fi - if [ $provisioningState = "Succeeded" 
]; then + if [ $provisioningState != "Succeeded" ]; then log_message "-e error expected state of extension provisioningState MUST be Succeeded state but actual state is ${provisioningState}" log_message ${contactUSMessage} exit 1 @@ -238,14 +238,14 @@ validate_ci_extension() { publicNetworkAccessForIngestion=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.publicNetworkAccessForIngestion) log_message "workspace publicNetworkAccessForIngestion: ${publicNetworkAccessForIngestion}" if [[ "$publicNetworkAccessForIngestion" != "Enabled" ]]; then - log_message "-e error Unless private link configured, publicNetworkAccessForIngestion MUST be enabled for data ingestion" + log_message "-e error Unless private link configuration, publicNetworkAccessForIngestion MUST be enabled for data ingestion" log_message ${workspacePrivateLinkMessage} exit 1 fi publicNetworkAccessForQuery=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.publicNetworkAccessForQuery) log_message "workspace publicNetworkAccessForQuery: ${publicNetworkAccessForQuery}" - if [[ "$publicNetworkAccessForIngestion" != "Enabled" ]]; then - log_message "-e error Unless private link configured, publicNetworkAccessForQuery MUST be enabled for data query" + if [[ "$publicNetworkAccessForQuery" != "Enabled" ]]; then + log_message "-e error Unless private link configuration, publicNetworkAccessForQuery MUST be enabled for data query" log_message ${workspacePrivateLinkMessage} exit 1 fi @@ -258,7 +258,6 @@ validate_ci_extension() { exit 1 fi - workspaceId=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.customerId) log_message "workspaceId: ${workspaceId}" @@ -310,33 +309,33 @@ validate_ci_agent_pods() { fi # verify state of agent deployment - readyReplicas=$(kubectl get deployments -n kube-system ${agentK8sDeploymentName} -o json | jq '.status.readyReplicas') + readyReplicas=$(kubectl get deployments -n ${agentK8sNamespace} 
${agentK8sDeploymentName} -o json | jq '.status.readyReplicas') if [[ "$readyReplicas" != "1" ]]; then log_message "-e error number of readyReplicas of agent deployment MUST be 1" exit 1 fi - replicas=$(kubectl get deployments -n kube-system ${agentK8sDeploymentName} -o json | jq '.status.replicas') + replicas=$(kubectl get deployments -n ${agentK8sNamespace} ${agentK8sDeploymentName} -o json | jq '.status.replicas') if [[ "$replicas" != "1" ]]; then log_message "-e error number of replicas of agent deployment MUST be 1" exit 1 fi # verify state of agent ds - currentNumberScheduled=$(kubectl get ds -n kube-system ${agentK8sLinuxDaemonsetName} -o json | jq '.status.currentNumberScheduled') - desiredNumberScheduled=$(kubectl get ds -n kube-system ${agentK8sLinuxDaemonsetName} -o json | jq '.status.desiredNumberScheduled') + currentNumberScheduled=$(kubectl get ds -n ${agentK8sNamespace} ${agentK8sLinuxDaemonsetName} -o json | jq '.status.currentNumberScheduled') + desiredNumberScheduled=$(kubectl get ds -n ${agentK8sNamespace} ${agentK8sLinuxDaemonsetName} -o json | jq '.status.desiredNumberScheduled') if [[ "$currentNumberScheduled" != "$desiredNumberScheduled" ]]; then log_message "-e error desiredNumberScheduled: ${desiredNumberScheduled} doesnt match with currentNumberScheduled: ${currentNumberScheduled}" log_message "-e error please fix the pod scheduling issues of omsagent daemonset pods in namespace: ${agentK8sNamespace}" exit 1 fi - numberAvailable=$(kubectl get ds -n kube-system ${agentK8sLinuxDaemonsetName} -o json | jq '.status.numberAvailable') + numberAvailable=$(kubectl get ds -n ${agentK8sNamespace} ${agentK8sLinuxDaemonsetName} -o json | jq '.status.numberAvailable') if [[ "$numberAvailable" != "$currentNumberScheduled" ]]; then log_message "-e error numberAvailable: ${numberAvailable} doesnt match with currentNumberScheduled: ${currentNumberScheduled}" log_message "-e error please fix the pod scheduling issues of omsagent daemonset pods in namespace: 
${agentK8sNamespace}" exit 1 fi - numberReady=$(kubectl get ds -n kube-system ${agentK8sLinuxDaemonsetName} -o json | jq '.status.numberReady') + numberReady=$(kubectl get ds -n ${agentK8sNamespace} ${agentK8sLinuxDaemonsetName} -o json | jq '.status.numberReady') if [[ "$numberAvailable" != "$numberReady" ]]; then log_message "-e error numberAvailable: ${numberAvailable} doesnt match with numberReady: ${numberReady}" log_message "-e error please fix the pod scheduling issues of omsagent daemonset pods in namespace: ${agentK8sNamespace}" @@ -357,6 +356,7 @@ clusterResourceGroup="$(echo $clusterResourceId | cut -d'/' -f5)" providerName="$(echo $clusterResourceId | cut -d'/' -f7)" clusterName="$(echo $clusterResourceId | cut -d'/' -f9)" +# get the current active azure cloud of the az cli azureCloudName=$(az cloud show --query name -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") log_message "azure cloud name: ${azureCloudName}" From d67a65bdd31c5f0e14039d7c05599e60c2f538f8 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 24 Oct 2021 17:04:55 -0700 Subject: [PATCH 05/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index 6b7b469ca..67a561009 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -173,12 +173,13 @@ validate_ci_extension() { fi provisioningState=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "provisioningState") + log_message "Extension provisioningState: ${provisioningState}" if [ -z "$provisioningState" ]; then log_message "-e error provisioningState either null or empty in the config settings" log_message ${contactUSMessage} exit 1 fi - if [ $provisioningState != "Succeeded" ]; then + if [ "$provisioningState" != "Succeeded" ]; then log_message "-e error expected state of 
extension provisioningState MUST be Succeeded state but actual state is ${provisioningState}" log_message ${contactUSMessage} exit 1 From b6a10ec246d45fb330ab7b201ef1aaf64dc32746 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 24 Oct 2021 17:35:13 -0700 Subject: [PATCH 06/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index 67a561009..07f5638bb 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -172,19 +172,20 @@ validate_ci_extension() { exit 1 fi - provisioningState=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "provisioningState") + provisioningState=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "provisioningState" -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") log_message "Extension provisioningState: ${provisioningState}" if [ -z "$provisioningState" ]; then log_message "-e error provisioningState either null or empty in the config settings" log_message ${contactUSMessage} exit 1 fi - if [ "$provisioningState" != "Succeeded" ]; then - log_message "-e error expected state of extension provisioningState MUST be Succeeded state but actual state is ${provisioningState}" + if [ "$provisioningState" != "succeeded" ]; then + log_message "-e error expected state of extension provisioningState MUST be succeeded state but actual state is ${provisioningState}" log_message ${contactUSMessage} exit 1 fi logAnalyticsWorkspaceDomain=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query 'configurationSettings."omsagent.domain"') + log_message "Extension logAnalyticsWorkspaceDomain: ${logAnalyticsWorkspaceDomain}" if [ -z "$logAnalyticsWorkspaceDomain" ]; then log_message "-e error 
logAnalyticsWorkspaceDomain either null or empty in the config settings" log_message ${contactUSMessage} @@ -192,18 +193,21 @@ validate_ci_extension() { fi azureCloudName=${1} if [ "$azureCloudName" = "azureusgovernment" ]; then + log_message "az cli configured cloud name:$azureCloudName" if [ $logAnalyticsWorkspaceDomain = "opinsights.azure.us" ]; then log_message "-e error expected value of logAnalyticsWorkspaceDomain MUST opinsights.azure.us but actual value is ${logAnalyticsWorkspaceDomain}" log_message ${contactUSMessage} exit 1 fi elif [ "$azureCloudName" = "azurecloud" ]; then + log_message "az cli configured cloud name:$azureCloudName" if [ $logAnalyticsWorkspaceDomain = "opinsights.azure.com" ]; then log_message "-e error expected value of logAnalyticsWorkspaceDomain MUST opinsights.azure.com but actual value is ${logAnalyticsWorkspaceDomain}" log_message ${contactUSMessage} exit 1 fi elif [ "$azureCloudName" = "azurechinacloud" ]; then + log_message "az cli configured cloud name:$azureCloudName" if [ $logAnalyticsWorkspaceDomain = "opinsights.azure.cn" ]; then log_message "-e error expected value of logAnalyticsWorkspaceDomain MUST opinsights.azure.cn but actual value is ${logAnalyticsWorkspaceDomain}" log_message ${contactUSMessage} @@ -214,6 +218,7 @@ validate_ci_extension() { workspaceSubscriptionId="$(echo ${logAnalyticsWorkspaceResourceID} | cut -d'/' -f3 | tr "[:upper:]" "[:lower:]")" workspaceResourceGroup="$(echo ${logAnalyticsWorkspaceResourceID} | cut -d'/' -f5)" workspaceName="$(echo ${logAnalyticsWorkspaceResourceID} | cut -d'/' -f9)" + log_message "workspaceSubscriptionId:${workspaceSubscriptionId} workspaceResourceGroup:${workspaceResourceGroup} workspaceName:${workspaceName}" clusterSubscriptionId=${2} # set the azure subscription to azure cli if the workspace in different sub than cluster From e4953f7e5a4ae825ab6a4f0f69a4bc8ef66877d5 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 24 Oct 2021 17:42:20 -0700 Subject: [PATCH 
07/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index 07f5638bb..734af64a0 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -227,7 +227,8 @@ validate_ci_extension() { isClusterAndWorkspaceInSameSubscription=false set_azure_subscription $workspaceSubscriptionId fi - workspaceList=$(az resource list -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider) + workspaceList=$(az resource list -g "$workspaceResourceGroup" -n "$workspaceName" --resource-type $workspaceResourceProvider) + log_message "workspace info:${workspaceList}" if [ "$workspaceList" = "[]" ]; then log_message "-e error workspace:${logAnalyticsWorkspaceResourceID} doesnt exist" exit 1 From afeca562fb41d8bffcc07ce60958a3b22c0f8e02 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 24 Oct 2021 17:50:27 -0700 Subject: [PATCH 08/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index 734af64a0..d13d34eb5 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -159,13 +159,14 @@ command_exists() { validate_ci_extension() { extension=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName) log_message $extension - configurationSettings=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "configurationSettings.logAnalyticsWorkspaceResourceID") + configurationSettings=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "configurationSettings") if [ -z "$configurationSettings" ]; then log_message "-e error configurationSettings either 
null or empty" log_message ${contactUSMessage} exit 1 fi - logAnalyticsWorkspaceResourceID=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "configurationSettings.logAnalyticsWorkspaceResourceID") + logAnalyticsWorkspaceResourceID=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "configurationSettings.logAnalyticsWorkspaceResourceID" -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") + log_message "Extension logAnalyticsWorkspaceResourceID: ${logAnalyticsWorkspaceResourceID}" if [ -z "$logAnalyticsWorkspaceResourceID" ]; then log_message "-e error logAnalyticsWorkspaceResourceID either null or empty in the config settings" log_message ${contactUSMessage} From d2e5741d1af69148b73510288577d3cf78559e13 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 24 Oct 2021 18:06:29 -0700 Subject: [PATCH 09/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index d13d34eb5..33e337cc5 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -235,11 +235,11 @@ validate_ci_extension() { exit 1 fi - ciSolutionResourceId="/subscriptions/${workspaceSubscriptionId}/resourceGroups/${workspaceResourceGroup}/Microsoft.OperationsManagement/solutions/ContainerInsights(${workspaceName})" - ciSolutionResourceName=$(az resource show --ids "$ciSolutionResourceId" --query name) - if [[ "$ciSolutionResourceName" != "ContainerInsights(${workspaceName})" ]]; then - log_message "-e error ContainerInsights solution on workspace ${logAnalyticsWorkspaceResourceID} doesnt exist" - log_message ${contactUSMessage} + ciSolutionResourceName="ContainerInsights(${workspaceName})" + workspaceSolutionList=$(az resource list -g $workspaceResourceGroup -n $ciSolutionResourceName --resource-type 
$workspaceSolutionResourceProvider) + log_message "workspace solution info:${workspaceSolutionList}" + if [ "$workspaceSolutionList" = "[]" ]; then + log_message "-e error ContainerInsights solution on workspace:${logAnalyticsWorkspaceResourceID} doesnt exist" exit 1 fi From a3244552b9075c68b2971f6b2fbe3eebd8f5de6e Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 24 Oct 2021 19:36:12 -0700 Subject: [PATCH 10/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index 33e337cc5..4f861a667 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -243,30 +243,30 @@ validate_ci_extension() { exit 1 fi - publicNetworkAccessForIngestion=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.publicNetworkAccessForIngestion) + publicNetworkAccessForIngestion=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.publicNetworkAccessForIngestion -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") log_message "workspace publicNetworkAccessForIngestion: ${publicNetworkAccessForIngestion}" - if [[ "$publicNetworkAccessForIngestion" != "Enabled" ]]; then + if [ "$publicNetworkAccessForIngestion" != "enabled" ]; then log_message "-e error Unless private link configuration, publicNetworkAccessForIngestion MUST be enabled for data ingestion" log_message ${workspacePrivateLinkMessage} exit 1 fi - publicNetworkAccessForQuery=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.publicNetworkAccessForQuery) + publicNetworkAccessForQuery=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.publicNetworkAccessForQuery -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") log_message "workspace publicNetworkAccessForQuery: 
${publicNetworkAccessForQuery}" - if [[ "$publicNetworkAccessForQuery" != "Enabled" ]]; then + if [ "$publicNetworkAccessForQuery" != "enabled" ]; then log_message "-e error Unless private link configuration, publicNetworkAccessForQuery MUST be enabled for data query" log_message ${workspacePrivateLinkMessage} exit 1 fi - workspaceCappingDailyQuotaGb=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.workspaceCapping.dailyQuotaGb) + workspaceCappingDailyQuotaGb=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.workspaceCapping.dailyQuotaGb -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") log_message "workspaceCapping dailyQuotaGb: ${workspaceCappingDailyQuotaGb}" - if [[ "$workspaceCappingDailyQuotaGb" != "1.0" ]]; then + if [ "$workspaceCappingDailyQuotaGb" != "1.0" ]; then log_message "-e error workspace configured daily quota and verify ingestion data reaching over the quota: ${workspaceCappingDailyQuotaGb}" log_message ${dataCapHelpMessage} exit 1 fi - workspaceId=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.customerId) + workspaceId=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.customerId -o tsv | tr -d "[:space:]") log_message "workspaceId: ${workspaceId}" workspaceKey=$(az rest --method post --uri $logAnalyticsWorkspaceResourceID/sharedKeys?api-version=2015-11-01-preview --query primarySharedKey -o json) From 6dbc51871a7dd9fb678e3dc747bcbd298c99cc5d Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 24 Oct 2021 19:38:40 -0700 Subject: [PATCH 11/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index 4f861a667..45ef036ca 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -260,7 +260,7 @@ 
validate_ci_extension() { workspaceCappingDailyQuotaGb=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.workspaceCapping.dailyQuotaGb -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") log_message "workspaceCapping dailyQuotaGb: ${workspaceCappingDailyQuotaGb}" - if [ "$workspaceCappingDailyQuotaGb" != "1.0" ]; then + if [ "$workspaceCappingDailyQuotaGb" != "-1.0" ]; then log_message "-e error workspace configured daily quota and verify ingestion data reaching over the quota: ${workspaceCappingDailyQuotaGb}" log_message ${dataCapHelpMessage} exit 1 From e5fa561e2889bd338eb8a3eade07d381d9154f31 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 24 Oct 2021 19:43:53 -0700 Subject: [PATCH 12/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index 45ef036ca..4298b2982 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -258,10 +258,10 @@ validate_ci_extension() { exit 1 fi - workspaceCappingDailyQuotaGb=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.workspaceCapping.dailyQuotaGb -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") + workspaceCappingDailyQuotaGb=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.workspaceCapping.dailyQuotaGb -o tsv | tr -d "[:space:]") log_message "workspaceCapping dailyQuotaGb: ${workspaceCappingDailyQuotaGb}" if [ "$workspaceCappingDailyQuotaGb" != "-1.0" ]; then - log_message "-e error workspace configured daily quota and verify ingestion data reaching over the quota: ${workspaceCappingDailyQuotaGb}" + log_message "-e error workspace configured daily quota and verify ingestion data reaching over the quota:${workspaceCappingDailyQuotaGb}" log_message ${dataCapHelpMessage} exit 1 fi From 
f77ca7048f74d9b362d5aadd465c942029f923e2 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 24 Oct 2021 19:51:14 -0700 Subject: [PATCH 13/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index 4298b2982..d1aa75941 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -380,11 +380,12 @@ validate_ci_extension $azureCloudName $clusterSubscriptionId $clusterResourceGro # validate ci agent pods if command_exists kubectl; then if command_exists jq; then + validate_ci_agent_pods + else log_message "-e error jq doesnt exist as installed" log_message $jqInstallLinkMessage exit 1 fi - validate_ci_agent_pods else log_message "-e error kubectl doesnt exist as installed" log_message ${kubectlInstallLinkMessage} From 04bd3ef222c6c994cc31f7c707d719f817ba3b1e Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 24 Oct 2021 19:57:36 -0700 Subject: [PATCH 14/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index d1aa75941..c2a1540fa 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -16,7 +16,7 @@ extensionInstanceName="azuremonitor-containers" workspaceResourceProvider="Microsoft.OperationalInsights/workspaces" workspaceSolutionResourceProvider="Microsoft.OperationsManagement/solutions" agentK8sNamespace="kube-system" -agentK8sSecretName="omsagent-secret" +agentK8sSecretName="agentK8sSecretName" agentK8sDeploymentName="omsagent-rs" agentK8sLinuxDaemonsetName="omsagent" workspaceId="" @@ -244,14 +244,14 @@ validate_ci_extension() { fi publicNetworkAccessForIngestion=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} 
--query properties.publicNetworkAccessForIngestion -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") - log_message "workspace publicNetworkAccessForIngestion: ${publicNetworkAccessForIngestion}" + log_message "workspace publicNetworkAccessForIngestion:${publicNetworkAccessForIngestion}" if [ "$publicNetworkAccessForIngestion" != "enabled" ]; then log_message "-e error Unless private link configuration, publicNetworkAccessForIngestion MUST be enabled for data ingestion" log_message ${workspacePrivateLinkMessage} exit 1 fi publicNetworkAccessForQuery=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.publicNetworkAccessForQuery -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") - log_message "workspace publicNetworkAccessForQuery: ${publicNetworkAccessForQuery}" + log_message "workspace publicNetworkAccessForQuery:${publicNetworkAccessForQuery}" if [ "$publicNetworkAccessForQuery" != "enabled" ]; then log_message "-e error Unless private link configuration, publicNetworkAccessForQuery MUST be enabled for data query" log_message ${workspacePrivateLinkMessage} @@ -259,7 +259,7 @@ validate_ci_extension() { fi workspaceCappingDailyQuotaGb=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.workspaceCapping.dailyQuotaGb -o tsv | tr -d "[:space:]") - log_message "workspaceCapping dailyQuotaGb: ${workspaceCappingDailyQuotaGb}" + log_message "workspaceCapping dailyQuotaGb:${workspaceCappingDailyQuotaGb}" if [ "$workspaceCappingDailyQuotaGb" != "-1.0" ]; then log_message "-e error workspace configured daily quota and verify ingestion data reaching over the quota:${workspaceCappingDailyQuotaGb}" log_message ${dataCapHelpMessage} @@ -299,10 +299,11 @@ validate_az_cli_installed_or_not() { validate_ci_agent_pods() { # verify the id and key of the workspace matches with workspace key value in the secret - wsID=$(kubectl get secrets ${omsagent-secret} -n ${agentK8sNamespace} -o json | jq -r ".data.WSID") + 
wsID=$(kubectl get secrets ${agentK8sSecretName} -n ${agentK8sNamespace} -o json | jq -r ".data.WSID") wsID=$(echo $wsID | base64 -d) + log_message "workspaceId: ${wsID} value in the ${agentK8sSecretName}" - wsKEY=$(kubectl get secrets ${omsagent-secret} -n ${agentK8sNamespace} -o json | jq -r ".data.KEY") + wsKEY=$(kubectl get secrets ${agentK8sSecretName} -n ${agentK8sNamespace} -o json | jq -r ".data.KEY") wsKEY=$(echo $wsKEY | base64 -d) if [[ "$workspaceId" != "$wsID" ]]; then From 42629c3d08f0fb672b6e52f4d37af68da5756f90 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 24 Oct 2021 20:00:50 -0700 Subject: [PATCH 15/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index c2a1540fa..e0f06c203 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -16,7 +16,7 @@ extensionInstanceName="azuremonitor-containers" workspaceResourceProvider="Microsoft.OperationalInsights/workspaces" workspaceSolutionResourceProvider="Microsoft.OperationsManagement/solutions" agentK8sNamespace="kube-system" -agentK8sSecretName="agentK8sSecretName" +agentK8sSecretName="omsagent-secret" agentK8sDeploymentName="omsagent-rs" agentK8sLinuxDaemonsetName="omsagent" workspaceId="" From 28a34aea8e1ce39ac2e233e6e16959178fd72d7d Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 24 Oct 2021 20:17:59 -0700 Subject: [PATCH 16/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 30 +++++++++++++++------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index e0f06c203..1d30b6b70 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -21,7 +21,7 @@ agentK8sDeploymentName="omsagent-rs" 
agentK8sLinuxDaemonsetName="omsagent" workspaceId="" workspacePrimarySharedKey="" -contactUSMessage="Please contact us by emailing askcoin@microsoft.com if you need any help with this script captured logs" +contactUSMessage="Please contact us by emailing askcoin@microsoft.com if you need any help with TroubleshootDump.log generated by this script" dataCapHelpMessage="Please review and increase data cap https://docs.microsoft.com/en-us/azure/azure-monitor/logs/manage-cost-storage" workspacePrivateLinkMessage="Please review this doc https://docs.microsoft.com/en-us/azure/azure-monitor/logs/private-link-security" azureCLIInstallLinkMessage="Please install Azure-CLI as per the instructions https://docs.microsoft.com/en-us/cli/azure/install-azure-cli and rerun the troubleshooting script" @@ -243,19 +243,26 @@ validate_ci_extension() { exit 1 fi + privateLinkScopedResources=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.privateLinkScopedResources -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") + log_message "workspace privateLinkScopedResources:${privateLinkScopedResources}" + publicNetworkAccessForIngestion=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.publicNetworkAccessForIngestion -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") log_message "workspace publicNetworkAccessForIngestion:${publicNetworkAccessForIngestion}" - if [ "$publicNetworkAccessForIngestion" != "enabled" ]; then - log_message "-e error Unless private link configuration, publicNetworkAccessForIngestion MUST be enabled for data ingestion" - log_message ${workspacePrivateLinkMessage} - exit 1 + if [ -z "$privateLinkScopedResources" ]; then + if [ "$publicNetworkAccessForIngestion" != "enabled" ]; then + log_message "-e error Unless private link configuration, publicNetworkAccessForIngestion MUST be enabled for data ingestion" + log_message ${workspacePrivateLinkMessage} + exit 1 + fi fi publicNetworkAccessForQuery=$(az 
resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.publicNetworkAccessForQuery -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") log_message "workspace publicNetworkAccessForQuery:${publicNetworkAccessForQuery}" - if [ "$publicNetworkAccessForQuery" != "enabled" ]; then - log_message "-e error Unless private link configuration, publicNetworkAccessForQuery MUST be enabled for data query" - log_message ${workspacePrivateLinkMessage} - exit 1 + if [ -z "$privateLinkScopedResources" ]; then + if [ "$publicNetworkAccessForQuery" != "enabled" ]; then + log_message "-e error Unless private link configuration, publicNetworkAccessForQuery MUST be enabled for data query" + log_message ${workspacePrivateLinkMessage} + exit 1 + fi fi workspaceCappingDailyQuotaGb=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.workspaceCapping.dailyQuotaGb -o tsv | tr -d "[:space:]") @@ -319,11 +326,13 @@ validate_ci_agent_pods() { # verify state of agent deployment readyReplicas=$(kubectl get deployments -n ${agentK8sNamespace} ${agentK8sDeploymentName} -o json | jq '.status.readyReplicas') + log_message "number of deployment ready replicas:${readyReplicas}" if [[ "$readyReplicas" != "1" ]]; then log_message "-e error number of readyReplicas of agent deployment MUST be 1" exit 1 fi replicas=$(kubectl get deployments -n ${agentK8sNamespace} ${agentK8sDeploymentName} -o json | jq '.status.replicas') + log_message "number of deployment replicas:${replicas}" if [[ "$replicas" != "1" ]]; then log_message "-e error number of replicas of agent deployment MUST be 1" exit 1 @@ -332,6 +341,7 @@ validate_ci_agent_pods() { # verify state of agent ds currentNumberScheduled=$(kubectl get ds -n ${agentK8sNamespace} ${agentK8sLinuxDaemonsetName} -o json | jq '.status.currentNumberScheduled') desiredNumberScheduled=$(kubectl get ds -n ${agentK8sNamespace} ${agentK8sLinuxDaemonsetName} -o json | jq '.status.desiredNumberScheduled') + log_message 
"number of linux deamonset pods currentNumberScheduled:${currentNumberScheduled} and currentNumberScheduled:${currentNumberScheduled}" if [[ "$currentNumberScheduled" != "$desiredNumberScheduled" ]]; then log_message "-e error desiredNumberScheduled: ${desiredNumberScheduled} doesnt match with currentNumberScheduled: ${currentNumberScheduled}" log_message "-e error please fix the pod scheduling issues of omsagent daemonset pods in namespace: ${agentK8sNamespace}" @@ -339,12 +349,14 @@ validate_ci_agent_pods() { fi numberAvailable=$(kubectl get ds -n ${agentK8sNamespace} ${agentK8sLinuxDaemonsetName} -o json | jq '.status.numberAvailable') + log_message "number of linux deamonset pods numberAvailable:${numberAvailable}" if [[ "$numberAvailable" != "$currentNumberScheduled" ]]; then log_message "-e error numberAvailable: ${numberAvailable} doesnt match with currentNumberScheduled: ${currentNumberScheduled}" log_message "-e error please fix the pod scheduling issues of omsagent daemonset pods in namespace: ${agentK8sNamespace}" exit 1 fi numberReady=$(kubectl get ds -n ${agentK8sNamespace} ${agentK8sLinuxDaemonsetName} -o json | jq '.status.numberReady') + log_message "number of linux deamonset pods numberReady:${numberReady}" if [[ "$numberAvailable" != "$numberReady" ]]; then log_message "-e error numberAvailable: ${numberAvailable} doesnt match with numberReady: ${numberReady}" log_message "-e error please fix the pod scheduling issues of omsagent daemonset pods in namespace: ${agentK8sNamespace}" From 99dceafdf2fd4377e1d1a3107f4ebd1b5b1f2297 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 24 Oct 2021 20:31:13 -0700 Subject: [PATCH 17/24] wip --- scripts/troubleshoot/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scripts/troubleshoot/README.md b/scripts/troubleshoot/README.md index 5ffa07639..fa2ffbe60 100644 --- a/scripts/troubleshoot/README.md +++ b/scripts/troubleshoot/README.md @@ -1,5 +1,14 @@ # Troubleshoot Guide for Azure 
Monitor for containers +# Azure Arc-enabled Kubernetes +The table below summarizes known issues you may face while using Azure Monitor for containers. + +| Issues and Error Messages | Action | +| ---- | --- | +| Error Message `No data for selected filters` | It may take some time to establish monitoring data flow for newly created clusters. Please allow at least 10-15 minutes for data to appear for your cluster. | +| Error Message `Error retrieving data` | While an Azure Arc-enabled Kubernetes cluster is being set up for health and performance monitoring, a connection is established between the cluster and an Azure Log Analytics workspace. The Log Analytics workspace is used to store all monitoring data for your cluster. This error may occur when your Log Analytics workspace has been deleted or lost. Please check whether your Log Analytics workspace is available. To find your Log Analytics workspace, go [here](https://docs.microsoft.com/en-us/azure/log-analytics/log-analytics-manage-access) and verify that your workspace is available. If the workspace is missing, you will have to delete and re-create the Microsoft.AzureMonitor.Containers extension as described at https://docs.microsoft.com/en-us/azure/azure-monitor/containers/container-insights-enable-arc-enabled-clusters?toc=/azure/azure-arc/kubernetes/toc.json. | + + # Azure Kubernetes Service (AKS) The table below summarizes known issues you may face while using Azure Monitor for containers . 
From 6ee24cc8f9a3f510d77c2a6c9e65cba5ebfbdfe3 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 24 Oct 2021 21:19:20 -0700 Subject: [PATCH 18/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 26 ++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index 1d30b6b70..7507d1958 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -16,6 +16,7 @@ extensionInstanceName="azuremonitor-containers" workspaceResourceProvider="Microsoft.OperationalInsights/workspaces" workspaceSolutionResourceProvider="Microsoft.OperationsManagement/solutions" agentK8sNamespace="kube-system" +azureArcK8sNamespace="azure-arc" agentK8sSecretName="omsagent-secret" agentK8sDeploymentName="omsagent-rs" agentK8sLinuxDaemonsetName="omsagent" @@ -365,6 +366,28 @@ validate_ci_agent_pods() { } +get_nodes_pods_crds_info() { + + log_message "nodes" + kubectl get nodes >> $logFile + + log_message "kube-system pods" + kubectl get get po -n ${agentK8sNamespace} >> $logFile + + log_message "azurearck8spods" + kubectl get po -n ${azureArcK8sNamespace} >> $logFile + + log_message "crds" + kubectl get crds -A >> $logFile + + log_message "azureclusteridentityrequests crds" + kubectl get crds azureclusteridentityrequests.clusterconfig.azure.com >> $logFile + kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} >> $logFile + + log_message "container-insights-clusteridentityrequest crd" + kubectl describe azureclusteridentityrequests -n ${azureArcK8sNamespace} container-insights-clusteridentityrequest >> $logFile +} + # verify azure cli installed or not validate_az_cli_installed_or_not @@ -405,5 +428,8 @@ else exit 1 fi +# get nodes and pods status +get_nodes_pods_crds_info + log_message "Everything looks good according to this script." 
log_message $contactUSMessage From 61ef20ac0f7e44ea9d989f1ff6c5e01d6cee607a Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 25 Oct 2021 07:59:53 -0700 Subject: [PATCH 19/24] doc updates --- scripts/troubleshoot/README.md | 21 +++++++++++ scripts/troubleshoot/troubleshooterrors.sh | 41 ++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/scripts/troubleshoot/README.md b/scripts/troubleshoot/README.md index fa2ffbe60..3f098736a 100644 --- a/scripts/troubleshoot/README.md +++ b/scripts/troubleshoot/README.md @@ -78,3 +78,24 @@ For more details on Azure Resource Manager template deployment via cli refer to If steps above did not help to resolve your issue, you can use either of the following methods to contact us for help: * File a [GitHub Issue](https://github.com/Microsoft/OMS-docker/issues) * Email [AskCoin](mailto:askcoin@microsoft.com) : Please attach the TroubleshootErrorDump.txt in the email generated by the troubleshooting script if you had tried running the script to solve your problem. + +# Azure Arc-enabled Kubernetes + +You can use the troubleshooting script provided [here](https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_dev/scripts/troubleshoot/troubleshooterrors.sh) to diagnose the problem. + +Steps: +- Before executing the Troubleshooting script, please install the following prerequisites if you don't already have them + - Install [Azure-CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli) + - Install [kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl) + - Install [jq](https://stedolan.github.io/jq/download/) +- Download and execute the script + ``` bash + curl -LO https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_dev/scripts/troubleshoot/troubleshooterrors.sh + bash troubleshooterrors.sh --resource-id --kube-context + ``` +- This script will generate a TroubleshootDump.log which collects detailed information about container health onboarding. 
+Please send this file to [AskCoin](mailto:askcoin@microsoft.com). We will respond back to you. + +If steps above did not help to resolve your issue, you can use either of the following methods to contact us for help: +* File a [GitHub Issue](https://github.com/Microsoft/OMS-docker/issues) +* Email [AskCoin](mailto:askcoin@microsoft.com) : Please attach the TroubleshootErrorDump.log in the email generated by the troubleshooting script if you had tried running the script to solve your problem. \ No newline at end of file diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index 7507d1958..d0288f9e0 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -20,6 +20,7 @@ azureArcK8sNamespace="azure-arc" agentK8sSecretName="omsagent-secret" agentK8sDeploymentName="omsagent-rs" agentK8sLinuxDaemonsetName="omsagent" +agentArcK8sIdentityCRDName="container-insights-clusteridentityrequest" workspaceId="" workspacePrimarySharedKey="" contactUSMessage="Please contact us by emailing askcoin@microsoft.com if you need any help with TroubleshootDump.log generated by this script" @@ -29,6 +30,7 @@ azureCLIInstallLinkMessage="Please install Azure-CLI as per the instructions htt kubectlInstallLinkMessage="Please install kubectl as per the instructions https://kubernetes.io/docs/tasks/tools/#kubectl and rerun the troubleshooting script" jqInstallLinkMessage="Please install jq as per instructions https://stedolan.github.io/jq/download/ and rerun the troubleshooting script" ciExtensionReOnboarding="Please reinstall extension as per instructions https://docs.microsoft.com/en-us/azure/azure-monitor/containers/container-insights-enable-arc-enabled-clusters?toc=/azure/azure-arc/kubernetes/toc.json" +timesyncHelpMessage="Please check if you have any timesync issues on your cluster nodes" log_message() { echo "$@" @@ -366,6 +368,42 @@ validate_ci_agent_pods() { } 
+validate_ci_agent_identity_status() { + + log_message "Info of ${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}" + kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json >> $logFile + status=$(kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json | jq -r '.status') + if [ -z "$status" ]; then + log_message "-e error status field empty for the CRD ${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}" + log_message $timesyncHelpMessage + exit 1 + fi + expirationTime=$(kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json | jq -r '.status.expirationTime') + if [ -z "$expirationTime" ]; then + log_message "-e error expirationTime field empty for the CRD ${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}" + log_message $timesyncHelpMessage + exit 1 + fi + tokenReference=$(kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json | jq -r '.status.tokenReference') + if [ -z "$tokenReference" ]; then + log_message "-e error tokenReference field empty for the CRD ${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}" + log_message $timesyncHelpMessage + exit 1 + fi + dataName=$(kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json | jq -r '.status.tokenReference.dataName') + if [ -z "$dataName" ]; then + log_message "-e error dataName field of tokenReference empty for the CRD ${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}" + log_message $timesyncHelpMessage + exit 1 + fi + secretName=$(kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json | jq -r '.status.tokenReference.secretName') + if [ -z "$secretName" ]; then + log_message "-e error secretName field of tokenReference empty for the CRD 
${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}" + log_message $timesyncHelpMessage + exit 1 + fi +} + get_nodes_pods_crds_info() { log_message "nodes" @@ -428,6 +466,9 @@ else exit 1 fi +# validate ci cluster identity token +validate_ci_agent_identity_status + # get nodes and pods status get_nodes_pods_crds_info From cca290edb098e5c1fca8428fcc72a2bdb5446daa Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 25 Oct 2021 08:07:06 -0700 Subject: [PATCH 20/24] doc updates --- scripts/troubleshoot/troubleshooterrors.sh | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index d0288f9e0..6dbdadbac 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -160,6 +160,7 @@ command_exists() { } validate_ci_extension() { + log_message "START:validate_ci_extension" extension=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName) log_message $extension configurationSettings=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "configurationSettings") @@ -281,6 +282,8 @@ validate_ci_extension() { workspaceKey=$(az rest --method post --uri $logAnalyticsWorkspaceResourceID/sharedKeys?api-version=2015-11-01-preview --query primarySharedKey -o json) workspacePrimarySharedKey=$(echo $workspaceKey | tr -d '"') + + log_message "END:validate_ci_extension" } validate_az_cli_installed_or_not() { @@ -308,6 +311,7 @@ validate_az_cli_installed_or_not() { } validate_ci_agent_pods() { + log_message "START:validate_ci_agent_pods" # verify the id and key of the workspace matches with workspace key value in the secret wsID=$(kubectl get secrets ${agentK8sSecretName} -n ${agentK8sNamespace} -o json | jq -r ".data.WSID") wsID=$(echo $wsID | base64 -d) @@ -365,11 +369,11 @@ validate_ci_agent_pods() { log_message "-e error please fix the 
pod scheduling issues of omsagent daemonset pods in namespace: ${agentK8sNamespace}" exit 1 fi - + log_message "END:validate_ci_agent_pods" } validate_ci_agent_identity_status() { - + log_message "START:validate_ci_agent_identity_status" log_message "Info of ${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}" kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json >> $logFile status=$(kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json | jq -r '.status') @@ -402,18 +406,19 @@ validate_ci_agent_identity_status() { log_message $timesyncHelpMessage exit 1 fi + log_message "END:validate_ci_agent_identity_status" } get_nodes_pods_crds_info() { - + log_message "START:get_nodes_pods_crds_info" log_message "nodes" kubectl get nodes >> $logFile log_message "kube-system pods" - kubectl get get po -n ${agentK8sNamespace} >> $logFile + kubectl get pods -n ${agentK8sNamespace} >> $logFile log_message "azurearck8spods" - kubectl get po -n ${azureArcK8sNamespace} >> $logFile + kubectl get pods -n ${azureArcK8sNamespace} >> $logFile log_message "crds" kubectl get crds -A >> $logFile @@ -424,6 +429,7 @@ get_nodes_pods_crds_info() { log_message "container-insights-clusteridentityrequest crd" kubectl describe azureclusteridentityrequests -n ${azureArcK8sNamespace} container-insights-clusteridentityrequest >> $logFile + log_message "END:get_nodes_pods_crds_info" } # verify azure cli installed or not From 266b470d59610ac5788be7ce1b7df5a000b5cce6 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 25 Oct 2021 12:06:49 -0700 Subject: [PATCH 21/24] wip --- scripts/troubleshoot/troubleshooterrors.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index 6dbdadbac..68177118d 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ 
b/scripts/troubleshoot/troubleshooterrors.sh @@ -283,7 +283,7 @@ validate_ci_extension() { workspaceKey=$(az rest --method post --uri $logAnalyticsWorkspaceResourceID/sharedKeys?api-version=2015-11-01-preview --query primarySharedKey -o json) workspacePrimarySharedKey=$(echo $workspaceKey | tr -d '"') - log_message "END:validate_ci_extension" + log_message "END:validate_ci_extension:SUCCESS" } validate_az_cli_installed_or_not() { @@ -369,7 +369,7 @@ validate_ci_agent_pods() { log_message "-e error please fix the pod scheduling issues of omsagent daemonset pods in namespace: ${agentK8sNamespace}" exit 1 fi - log_message "END:validate_ci_agent_pods" + log_message "END:validate_ci_agent_pods:SUCCESS" } validate_ci_agent_identity_status() { @@ -406,7 +406,7 @@ validate_ci_agent_identity_status() { log_message $timesyncHelpMessage exit 1 fi - log_message "END:validate_ci_agent_identity_status" + log_message "END:validate_ci_agent_identity_status:SUCCESS" } get_nodes_pods_crds_info() { @@ -429,9 +429,12 @@ get_nodes_pods_crds_info() { log_message "container-insights-clusteridentityrequest crd" kubectl describe azureclusteridentityrequests -n ${azureArcK8sNamespace} container-insights-clusteridentityrequest >> $logFile - log_message "END:get_nodes_pods_crds_info" + log_message "END:get_nodes_pods_crds_info:SUCCESS" } +datetime=$(date -u) +log_message "Script Execution start @ ${datetime}" + # verify azure cli installed or not validate_az_cli_installed_or_not From f4ef80f5186d34b171ba45187e4a60055b089299 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 25 Oct 2021 12:18:58 -0700 Subject: [PATCH 22/24] wip --- scripts/troubleshoot/README.md | 2 +- scripts/troubleshoot/troubleshooterrors.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/troubleshoot/README.md b/scripts/troubleshoot/README.md index 3f098736a..c62f5c3f3 100644 --- a/scripts/troubleshoot/README.md +++ b/scripts/troubleshoot/README.md @@ -91,7 +91,7 @@ Steps: - 
Download and execute the script ``` bash curl -LO https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_dev/scripts/troubleshoot/troubleshooterrors.sh - bash troubleshooterrors.sh --resource-id --kube-context + bash troubleshooterrors.sh --resource-id --kube-context ``` - This script will generate a TroubleshootDump.log which collects detailed information about container health onboarding. Please send this file to [AskCoin](mailto:askcoin@microsoft.com). We will respond back to you. diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index 68177118d..56a482f9f 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -4,7 +4,7 @@ # Prerequisites : # Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest -# bash troubelshooterror.sh --resource-id --kube-context --cloudName +# bash troubleshooterrors.sh --resource-id --kube-context set -e set -o pipefail @@ -433,7 +433,7 @@ get_nodes_pods_crds_info() { } datetime=$(date -u) -log_message "Script Execution start @ ${datetime}" +log_message "*** Script Execution start @ ${datetime} ***" # verify azure cli installed or not validate_az_cli_installed_or_not From 317c3532b189d346a9cea659318c9652ade7507a Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 25 Oct 2021 13:03:12 -0700 Subject: [PATCH 23/24] update repo for issues --- scripts/troubleshoot/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/troubleshoot/README.md b/scripts/troubleshoot/README.md index c62f5c3f3..650a5df6f 100644 --- a/scripts/troubleshoot/README.md +++ b/scripts/troubleshoot/README.md @@ -76,7 +76,7 @@ Please send this file to [AskCoin](mailto:askcoin@microsoft.com). 
We will respon For more details on Azure Resource Manager template deployment via cli refer to [this documentation](https://docs.microsoft.com/en-us/azure/azure-resource-manager/resource-group-template-deploy-cli). If steps above did not help to resolve your issue, you can use either of the following methods to contact us for help: -* File a [GitHub Issue](https://github.com/Microsoft/OMS-docker/issues) +* File a [GitHub Issue](https://github.com/microsoft/Docker-Provider/issues) * Email [AskCoin](mailto:askcoin@microsoft.com) : Please attach the TroubleshootErrorDump.txt in the email generated by the troubleshooting script if you had tried running the script to solve your problem. # Azure Arc-enabled Kubernetes @@ -97,5 +97,5 @@ Steps: Please send this file to [AskCoin](mailto:askcoin@microsoft.com). We will respond back to you. If steps above did not help to resolve your issue, you can use either of the following methods to contact us for help: -* File a [GitHub Issue](https://github.com/Microsoft/OMS-docker/issues) +* File a [GitHub Issue](https://github.com/microsoft/Docker-Provider/issues) * Email [AskCoin](mailto:askcoin@microsoft.com) : Please attach the TroubleshootErrorDump.log in the email generated by the troubleshooting script if you had tried running the script to solve your problem. 
\ No newline at end of file From d6c2aa97efd29f387e56613c3be86d89dca486d3 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 26 Oct 2021 17:20:42 -0700 Subject: [PATCH 24/24] fix minor one --- scripts/troubleshoot/troubleshooterrors.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh index 56a482f9f..ac08d7afc 100644 --- a/scripts/troubleshoot/troubleshooterrors.sh +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -292,7 +292,7 @@ validate_az_cli_installed_or_not() { azCLIVersion=$(az -v) log_message "azure-cli version: ${azCLIVersion}" azCLIExtension=$(az extension list --query "[?name=='k8s-extension'].name | [0]") - if [ $azCLIExtension = "k8s-extension" ]; then + if [ "$azCLIExtension" = "k8s-extension" ]; then azCLIExtensionVersion=$(az extension list --query "[?name=='k8s-extension'].version | [0]") log_message "detected k8s-extension and current installed version: ${azCLIExtensionVersion}" log_message "updating the k8s-extension version to latest available one"