From 314c19d358c3860ee05dedb792a1819918f9c66e Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Tue, 6 Dec 2022 10:57:11 +0530 Subject: [PATCH 01/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- src/connectedk8s/azext_connectedk8s/_constants.py | 4 ++-- src/connectedk8s/azext_connectedk8s/custom.py | 12 +++--------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 75fe471ed81..af033a6ceab 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -6,8 +6,8 @@ # pylint: disable=line-too-long -Distribution_Enum_Values = ["auto", "generic", "openshift", "rancher_rke", "kind", "k3s", "minikube", "gke", "eks", "aks", "aks_management", "aks_workload", "capz", "aks_engine", "tkg", "canonical", "karbon"] -Infrastructure_Enum_Values = ["auto", "generic", "azure", "aws", "gcp", "azure_stack_hci", "azure_stack_hub", "azure_stack_edge", "vsphere", "windows_server"] +Distribution_Enum_Values = ["generic", "openshift", "rancher_rke", "kind", "k3s", "minikube", "gke", "eks", "aks", "aks_management", "aks_workload", "capz", "aks_engine", "tkg", "canonical", "karbon"] +Infrastructure_Enum_Values = ["generic", "azure", "aws", "gcp", "azure_stack_hci", "azure_stack_hub", "azure_stack_edge", "vsphere", "windows_server"] AHB_Enum_Values = ["True", "False", "NotApplicable"] Feature_Values = ["cluster-connect", "azure-rbac", "custom-locations"] CRD_FOR_FORCE_DELETE = ["arccertificates.clusterconfig.azure.com", "azureclusteridentityrequests.clusterconfig.azure.com", "azureextensionidentities.clusterconfig.azure.com", "connectedclusters.arc.azure.com", "customlocationsettings.clusterconfig.azure.com", "extensionconfigs.clusterconfig.azure.com", "gitconfigs.clusterconfig.azure.com"] diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 62648b80726..162f493e99b 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -65,7 +65,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlation_id=None, https_proxy="", http_proxy="", no_proxy="", proxy_cert="", location=None, - kube_config=None, kube_context=None, no_wait=False, tags=None, distribution='auto', infrastructure='auto', + kube_config=None, kube_context=None, no_wait=False, tags=None, distribution='generic', infrastructure='generic', disable_auto_upgrade=False, cl_oid=None, onboarding_timeout="600", enable_private_link=None, private_link_scope_resource_id=None, distribution_version=None, azure_hybrid_benefit=None, yes=False, container_log_path=None): logger.warning("This operation might take a while...\n") @@ -151,14 +151,8 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat raise ValidationError("Your credentials doesn't have permission to create clusterrolebindings on this kubernetes cluster. Please check your permissions.") # Get kubernetes cluster info - if distribution == 'auto': - kubernetes_distro = get_kubernetes_distro(node_api_response) # (cluster heuristics) - else: - kubernetes_distro = distribution - if infrastructure == 'auto': - kubernetes_infra = get_kubernetes_infra(node_api_response) # (cluster heuristics) - else: - kubernetes_infra = infrastructure + kubernetes_distro = distribution + kubernetes_infra = infrastructure kubernetes_properties = { 'Context.Default.AzureCLI.KubernetesVersion': kubernetes_version, From d34a3cbd9839f028552e2368ca53eeb5f931203a Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Tue, 6 Dec 2022 11:30:18 +0530 Subject: [PATCH 02/62] modified: src/connectedk8s/azext_connectedk8s/custom.py --- src/connectedk8s/azext_connectedk8s/custom.py | 59 ------------------- 1 file changed, 59 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 162f493e99b..2c845167152 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -516,65 +516,6 @@ def get_private_key(key_pair): privKey_DER = key_pair.exportKey(format='DER') return PEM.encode(privKey_DER, "RSA PRIVATE KEY") - -def get_kubernetes_distro(api_response): # Heuristic - if api_response is None: - return "generic" - try: - for node in api_response.items: - labels = node.metadata.labels - provider_id = str(node.spec.provider_id) - annotations = node.metadata.annotations - if labels.get("node.openshift.io/os_id"): - return "openshift" - if labels.get("kubernetes.azure.com/node-image-version"): - return "aks" - if labels.get("cloud.google.com/gke-nodepool") or labels.get("cloud.google.com/gke-os-distribution"): - return "gke" - if labels.get("eks.amazonaws.com/nodegroup"): - return "eks" - if labels.get("minikube.k8s.io/version"): - return "minikube" - if provider_id.startswith("kind://"): - return "kind" - if provider_id.startswith("k3s://"): - return "k3s" - if annotations.get("rke.cattle.io/external-ip") or annotations.get("rke.cattle.io/internal-ip"): - return "rancher_rke" - return "generic" - except Exception as e: # pylint: disable=broad-except - logger.debug("Error occured while trying to fetch kubernetes distribution: " + str(e)) - utils.kubernetes_exception_handler(e, consts.Get_Kubernetes_Distro_Fault_Type, 'Unable to fetch kubernetes distribution', - raise_error=False) - return "generic" - - -def get_kubernetes_infra(api_response): # Heuristic - if api_response is None: - return "generic" - try: - for node in api_response.items: - provider_id = str(node.spec.provider_id) - infra = provider_id.split(':')[0] - if infra == "k3s" or infra == "kind": - return "generic" - if infra == "azure": - return "azure" - if infra == "gce": - return "gcp" - if infra == "aws": - return "aws" - k8s_infra = utils.validate_infrastructure_type(infra) - if k8s_infra is not None: - return k8s_infra - return "generic" - except Exception as e: # pylint: disable=broad-except - logger.debug("Error occured while trying to fetch kubernetes infrastructure: " + str(e)) - utils.kubernetes_exception_handler(e, consts.Get_Kubernetes_Infra_Fault_Type, 'Unable to fetch kubernetes infrastructure', - raise_error=False) - return "generic" - - def check_linux_amd64_node(api_response): try: for item in api_response.items: From 6842e24836d115a824c708d6c91214838b94e8a1 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Tue, 6 Dec 2022 11:32:04 +0530 Subject: [PATCH 03/62] modified: src/connectedk8s/azext_connectedk8s/custom.py --- src/connectedk8s/azext_connectedk8s/custom.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 2c845167152..162f493e99b 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -516,6 +516,65 @@ def get_private_key(key_pair): privKey_DER = key_pair.exportKey(format='DER') return PEM.encode(privKey_DER, "RSA PRIVATE KEY") + +def get_kubernetes_distro(api_response): # Heuristic + if api_response is None: + return "generic" + try: + for node in api_response.items: + labels = node.metadata.labels + provider_id = str(node.spec.provider_id) + annotations = node.metadata.annotations + if labels.get("node.openshift.io/os_id"): + return "openshift" + if labels.get("kubernetes.azure.com/node-image-version"): + return "aks" + if labels.get("cloud.google.com/gke-nodepool") or labels.get("cloud.google.com/gke-os-distribution"): + return "gke" + if labels.get("eks.amazonaws.com/nodegroup"): + return "eks" + if labels.get("minikube.k8s.io/version"): + return "minikube" + if provider_id.startswith("kind://"): + return "kind" + if provider_id.startswith("k3s://"): + return "k3s" + if annotations.get("rke.cattle.io/external-ip") or annotations.get("rke.cattle.io/internal-ip"): + return "rancher_rke" + return "generic" + except Exception as e: # pylint: disable=broad-except + logger.debug("Error occured while trying to fetch kubernetes distribution: " + str(e)) + utils.kubernetes_exception_handler(e, consts.Get_Kubernetes_Distro_Fault_Type, 'Unable to fetch kubernetes distribution', + raise_error=False) + return "generic" + + +def get_kubernetes_infra(api_response): # Heuristic + if api_response is None: + return "generic" + try: + for node in api_response.items: + provider_id = str(node.spec.provider_id) + infra = provider_id.split(':')[0] + if infra == "k3s" or infra == "kind": + return "generic" + if infra == "azure": + return "azure" + if infra == "gce": + return "gcp" + if infra == "aws": + return "aws" + k8s_infra = utils.validate_infrastructure_type(infra) + if k8s_infra is not None: + return k8s_infra + return "generic" + except Exception as e: # pylint: disable=broad-except + logger.debug("Error occured while trying to fetch kubernetes infrastructure: " + str(e)) + utils.kubernetes_exception_handler(e, consts.Get_Kubernetes_Infra_Fault_Type, 'Unable to fetch kubernetes infrastructure', + raise_error=False) + return "generic" + + def check_linux_amd64_node(api_response): try: for item in api_response.items: From 9aafd26e5e92c57188ff7205dae23df16aae4a35 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Tue, 6 Dec 2022 11:34:50 +0530 Subject: [PATCH 04/62] modified: src/connectedk8s/HISTORY.rst modified: src/connectedk8s/setup.py --- src/connectedk8s/HISTORY.rst | 5 +++++ src/connectedk8s/setup.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/connectedk8s/HISTORY.rst b/src/connectedk8s/HISTORY.rst index fba0257f319..7159d3e4974 100644 --- a/src/connectedk8s/HISTORY.rst +++ b/src/connectedk8s/HISTORY.rst @@ -2,6 +2,11 @@ Release History =============== +1.3.7 +++++++ + +* Removing detection of infrasturcture and distribution in connectedk8s connect + 1.3.6 ++++++ diff --git a/src/connectedk8s/setup.py b/src/connectedk8s/setup.py index baabd2ac952..7f26d7b3abd 100644 --- a/src/connectedk8s/setup.py +++ b/src/connectedk8s/setup.py @@ -17,7 +17,7 @@ # TODO: Confirm this is the right version number you want and it matches your # HISTORY.rst entry. -VERSION = '1.3.6' +VERSION = '1.3.7' # The full list of classifiers is available at # https://pypi.python.org/pypi?%3Aaction=list_classifiers From 69ce330dc2ffa6c61ec3728a6fcc38f31088afc7 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Mon, 26 Dec 2022 11:27:31 +0530 Subject: [PATCH 05/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py new file: src/connectedk8s/azext_connectedk8s/_precheckutils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- .../azext_connectedk8s/_constants.py | 2 + .../azext_connectedk8s/_precheckutils.py | 369 ++++++++++++++++++ src/connectedk8s/azext_connectedk8s/custom.py | 30 ++ 3 files changed, 401 insertions(+) create mode 100644 src/connectedk8s/azext_connectedk8s/_precheckutils.py diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index af033a6ceab..65ba651fc20 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -176,6 +176,8 @@ K8s_Cluster_Info = "k8s_cluster_info.txt" Outbound_Network_Connectivity_Check = "outbound_network_connectivity_check.txt" Events_of_Incomplete_Diagnoser_Job = "diagnoser_failure_events.txt" +Connect_Precheck_Job_Registry_Path = "connectprecheck.azurecr.io/helm/connect-precheck-diagnoser:0.1.0" +Connect_Precheck_Job_Version= "0.1.0" # Diagnostic Results Name Outbound_Connectivity_Check_Result_String = "Outbound Network Connectivity Result:" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py new file mode 100644 index 00000000000..d828580efc5 --- /dev/null +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -0,0 +1,369 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + + +from argparse import Namespace +from pydoc import cli +from kubernetes import client, config, watch, utils +from logging import exception +import yaml +import json +import datetime +from subprocess import Popen, PIPE, run, STDOUT, call, DEVNULL +import shutil +from knack.log import get_logger +from azure.cli.core import telemetry +import azext_connectedk8s._constants as consts +logger = get_logger(__name__) +# pylint: disable=unused-argument, too-many-locals, too-many-branches, too-many-statements, line-too-long +import os +import shutil +import subprocess +from subprocess import Popen, PIPE +import time +import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry +import json +from kubernetes import client, config, watch, utils +from knack.util import CLIError +from knack.log import get_logger +from knack.prompting import NoTTYException, prompt_y_n +from azure.cli.core.commands.client_factory import get_subscription_id +from azure.cli.core.util import send_raw_request +from azure.cli.core import telemetry +from azure.core.exceptions import ResourceNotFoundError, HttpResponseError +from msrest.exceptions import AuthenticationError, HttpOperationError, TokenExpiredError +from msrest.exceptions import ValidationError as MSRestValidationError +from kubernetes.client.rest import ApiException +from azext_connectedk8s._client_factory import _resource_client_factory, _resource_providers_client +import azext_connectedk8s._constants as consts +from kubernetes import client as kube_client +from azure.cli.core import get_default_cli +from azure.cli.core.azclierror import CLIInternalError, ClientRequestError, ArgumentUsageError, ManualInterrupt, AzureResponseError, AzureInternalError, ValidationError + +logger = get_logger(__name__) + +# pylint: disable=unused-argument, too-many-locals, too-many-branches, too-many-statements, line-too-long +# pylint: disable +def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert): + try: + # Setting DNS and Outbound Check as working + dns_check = "Starting" + + outbound_connectivity_check = "Starting" + # Executing the Diagnoser job and fetching diagnoser logs obtained + diagnoser_container_log = executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert) + # print(diagnoser_container_log) + # If diagnoser_container_log is not empty then only we will check for the results + if(diagnoser_container_log is not None and diagnoser_container_log != ""): + diagnoser_container_log_list = diagnoser_container_log.split("\n") + diagnoser_container_log_list.pop(-1) + dns_check_log = "" + counter_container_logs = 1 + # For retrieving only diagnoser logs from the diagnoser output + for outputs in diagnoser_container_log_list: + if consts.Outbound_Connectivity_Check_Result_String in outputs: + counter_container_logs = 1 + elif consts.DNS_Check_Result_String in outputs: + dns_check_log += outputs + counter_container_logs = 0 + elif counter_container_logs == 0: + dns_check_log += " " + outputs + # print(dns_check_log) + dns_check = check_cluster_DNS(dns_check_log) + # print("after dns") + # print(diagnoser_container_log_list[-1]) + outbound_connectivity_check= check_cluster_outbound_connectivity(diagnoser_container_log_list[-1]) + else: + # print("if test cannot start") + return consts.Diagnostic_Check_Incomplete + + # If both the check passed then we will return Diagnoser checks Passed + if(dns_check == consts.Diagnostic_Check_Passed and outbound_connectivity_check == consts.Diagnostic_Check_Passed): + # print("if 1") + return consts.Diagnostic_Check_Passed + # If any of the check remain Incomplete than we will return Incomplete + elif(dns_check == consts.Diagnostic_Check_Incomplete or outbound_connectivity_check == consts.Diagnostic_Check_Incomplete): + # print("if 2") + if dns_check == consts.Diagnostic_Check_Incomplete : + print("DNS DIDNT WORK") + if outbound_connectivity_check == consts.Diagnostic_Check_Incomplete : + print("DNS DIDNT WORK") + return consts.Diagnostic_Check_Incomplete + else: + # print("if 3") + return consts.Diagnostic_Check_Failed + + # To handle any exception that may occur during the execution + except Exception as e: + logger.warning("An exception has occured while trying to perform diagnoser container check on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Diagnoser_Container_Check_Failed_Fault_Type, summary="Error occured while performing the diagnoser container checks") + + return consts.Diagnostic_Check_Incomplete + +def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert): + job_name = "connect-precheck-diagnoser-job" + # yaml_file_path = os.path.join(absolute_path, "connect-precheck-diagnoser-file.yaml") + # Setting the log output as Empty + diagnoser_container_log = "" + + # cmd_delete_job = [kubectl_client_location, "delete", "-f", ""] + # if kube_config: + # cmd_delete_job.extend(["--kubeconfig", kube_config]) + # if kube_context: + # cmd_delete_job.extend(["--context", kube_context]) + + cmd_helm_delete = [helm_client_location, "uninstall", "connect-precheck-diagnoser"] + if kube_config: + cmd_helm_delete.extend(["--kubeconfig", kube_config]) + if kube_context: + cmd_helm_delete.extend(["--context", kube_context]) + + # print("deleteing connect-precheck helm release if present") + # To handle the user keyboard Interrupt + try: + # Executing the diagnoser_job.yaml + config.load_kube_config(kube_config, kube_context) + k8s_client = client.ApiClient() + # Attempting deletion of diagnoser resources to handle the scenario if any stale resources are present + response_kubectl_delete_helm = Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) + output_kubectl_delete_helm, error_kubectl_delete_helm = response_kubectl_delete_helm.communicate() + # If any error occured while execution of delete command + if (response_kubectl_delete_helm != 0): + # Converting the string of multiple errors to list + error_msg_list = error_kubectl_delete_helm.decode("ascii").split("\n") + error_msg_list.pop(-1) + valid_exception_list = [] + # Checking if any exception occured or not + exception_occured_counter = 0 + for ind_errors in error_msg_list: + if('not found' in ind_errors or 'deleted' in ind_errors): + pass + else: + valid_exception_list.append(ind_errors) + exception_occured_counter = 1 + # If any exception occured we will print the exception and return + if exception_occured_counter == 1: + # print(valid_exception_list) + logger.warning("An error occured while installing the connect precheck helm release in the cluster. Exception:") + # telemetry.set_exception(exception=error_helm_get_values.decode("ascii"), fault_type=consts.Diagnoser_Job_Failed_Fault_Type, summary="Error while executing Diagnoser Job") + return + # print("installing connect-precheck helm release") + try: + chart_path = get_chart_path(consts.Connect_Precheck_Job_Registry_Path, kube_config, kube_context, helm_client_location) + + helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, + kube_context, helm_client_location) + # To handle the Exception that occured + except Exception as e: + # print("helm not installed and job not applied") + logger.warning("An error occured while deploying the connect precheck diagnoser job in the cluster. Exception:") + logger.warning(str(e)) + # telemetry.set_exception(exception=error_helm_get_values.decode("ascii"), fault_type=consts.Diagnoser_Job_Failed_Fault_Type, summary="Error while executing Diagnoser Job") + # Deleting all the stale resources that got created + Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) + return + # Watching for diagnoser container to reach in completed stage + w = watch.Watch() + is_job_complete = False + is_job_scheduled = False + # To watch for changes in the pods states till it reach completed state or exit if it takes more than 180 seconds + for event in w.stream(batchv1_api_instance.list_namespaced_job, namespace='default', label_selector="", timeout_seconds=90): + try: + # Checking if job get scheduled or not + if event["object"].metadata.name == "connect-precheck-diagnoser-job": + # print("job scheduled") + is_job_scheduled = True + # Checking if job reached completed stage or not + if event["object"].metadata.name == "connect-precheck-diagnoser-job" and event["object"].status.conditions[0].type == "Complete": + # print("job complete") + is_job_complete = True + w.stop() + except Exception as e: + + # print("exception") + # print(e) + continue + else: + # print("passed") + continue + + if (is_job_scheduled is False): + logger.warning("Unable to schedule the connect precheck diagnoser job in the kubernetes cluster. The possible reasons can be presence of a security policy or security context constraint (SCC) or it may happen becuase of lack of ResourceQuota.\n") + Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) + return + elif (is_job_scheduled is True and is_job_complete is False): + # print("scheduled not completed") + Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) + return + else: + # print("Scheduled and finished job") + # Fetching the Diagnoser Container logs + all_pods = corev1_api_instance.list_namespaced_pod('default') + # Traversing through all agents + for each_pod in all_pods.items: + # Fetching the current Pod name and creating a folder with that name inside the timestamp folder + pod_name = each_pod.metadata.name + if(pod_name.startswith(job_name)): + # print("inside making diagnoser container log") + # Creating a text file with the name of the container and adding that containers logs in it + diagnoser_container_log = corev1_api_instance.read_namespaced_pod_log(name=pod_name, container="connect-precheck-diagnoser-container", namespace='default') + print(diagnoser_container_log) + # Clearing all the resources after fetching the diagnoser container logs + # Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) + + # To handle any exception that may occur during the execution + except Exception as e: + logger.warning("An exception has occured while trying to execute the diagnoser job in the cluster. Exception: {}".format(str(e)) + "\n") + Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) + telemetry.set_exception(exception=e, fault_type=consts.Diagnoser_Job_Failed_Fault_Type, summary="Error while executing Diagnoser Job") + return + + return diagnoser_container_log + +def check_cluster_DNS(dns_check_log): + + try: + if consts.DNS_Check_Result_String not in dns_check_log: + # print("dns prob") + return consts.Diagnostic_Check_Incomplete + formatted_dns_log = dns_check_log.replace('\t', '') + # Validating if DNS is working or not and displaying proper result + if("NXDOMAIN" in formatted_dns_log or "connection timed out" in formatted_dns_log): + logger.warning("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") + return consts.Diagnostic_Check_Failed + else: + return consts.Diagnostic_Check_Passed + + # For handling storage or OS exception that may occur during the execution + except OSError as e: + logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing cluster DNS check") + + # To handle any exception that may occur during the execution + except Exception as e: + logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing cluster DNS check") + + return consts.Diagnostic_Check_Incomplete + + +def check_cluster_outbound_connectivity(outbound_connectivity_check_log): + + global diagnoser_output + try: + outbound_connectivity_response = outbound_connectivity_check_log[-1:-4:-1] + outbound_connectivity_response = outbound_connectivity_response[::-1] + if consts.Outbound_Connectivity_Check_Result_String not in outbound_connectivity_check_log: + # print("outbound prob") + return consts.Diagnostic_Check_Incomplete + # Validating if outbound connectiivty is working or not and displaying proper result + if(outbound_connectivity_response != "000"): + return consts.Diagnostic_Check_Passed + else: + logger.warning("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") + return consts.Diagnostic_Check_Failed + + # For handling storage or OS exception that may occur during the execution + except OSError as e: + logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing outbound connectivity check in the cluster") + + # To handle any exception that may occur during the execution + except Exception as e: + logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing outbound connectivity check in the cluster") + + return consts.Diagnostic_Check_Incomplete + +def get_chart_path(registry_path, kube_config, kube_context, helm_client_location): + # print("getting chart path") + # Pulling helm chart from registry + os.environ['HELM_EXPERIMENTAL_OCI'] = '1' + pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location) + + # Exporting helm chart after cleanup + chart_export_path = os.path.join(os.path.expanduser('~'), '.azure', 'ConnectPrecheckCharts') + try: + if os.path.isdir(chart_export_path): + # print("found the chart") + shutil.rmtree(chart_export_path) + except: + logger.warning("Unable to cleanup the connect-precheck helm charts already present on the machine. In case of failure, please cleanup the directory '%s' and try again.", chart_export_path) + export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location) + + # Returning helm chart path + helm_chart_path = os.path.join(chart_export_path, 'connect-precheck-diagnoser') + chart_path = os.getenv('HELMCHART') if os.getenv('HELMCHART') else helm_chart_path + return chart_path + +def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location): + # print("pulling helm chart") + cmd_helm_chart_pull = [helm_client_location, "chart", "pull", registry_path] + # cmd_helm_chart_pull = [helm_client_location, "fetch", registry_path] + # cmd_helm_chart_pull.extend(["--version", consts.Connect_Precheck_Job_Version]) + if kube_config: + cmd_helm_chart_pull.extend(["--kubeconfig", kube_config]) + if kube_context: + cmd_helm_chart_pull.extend(["--kube-context", kube_context]) + response_helm_chart_pull = subprocess.Popen(cmd_helm_chart_pull, stdout=PIPE, stderr=PIPE) + _, error_helm_chart_pull = response_helm_chart_pull.communicate() + if response_helm_chart_pull.returncode != 0: + telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pull_HelmChart_Fault_Type, + summary='Unable to pull helm chart from the registry') + raise CLIInternalError("Unable to pull helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) + + +def export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location): + # print("export chart ") + cmd_helm_chart_export = [helm_client_location, "chart", "export", registry_path, "--destination", chart_export_path] + if kube_config: + cmd_helm_chart_export.extend(["--kubeconfig", kube_config]) + if kube_context: + cmd_helm_chart_export.extend(["--kube-context", kube_context]) + response_helm_chart_export = subprocess.Popen(cmd_helm_chart_export, stdout=PIPE, stderr=PIPE) + _, error_helm_chart_export = response_helm_chart_export.communicate() + if response_helm_chart_export.returncode != 0: + telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Export_HelmChart_Fault_Type, + summary='Unable to export helm chart from the registry') + raise CLIInternalError("Unable to export helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) + + +def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, + kube_config, kube_context, helm_client_location, onboarding_timeout="200"): + # print("installing release") + # print(chart_path) + cmd_helm_install = [helm_client_location, "upgrade", "--install", "connect-precheck-diagnoser", chart_path , "--debug"] + # print("before cmd helm install") + # To set some other helm parameters through file + if https_proxy: + cmd_helm_install.extend(["--set", "global.httpsProxy={}".format(https_proxy)]) + if http_proxy: + cmd_helm_install.extend(["--set", "global.httpProxy={}".format(http_proxy)]) + if no_proxy: + cmd_helm_install.extend(["--set", "global.noProxy={}".format(no_proxy)]) + if proxy_cert: + cmd_helm_install.extend(["--set-file", "global.proxyCert={}".format(proxy_cert)]) + + if kube_config: + cmd_helm_install.extend(["--kubeconfig", kube_config]) + if kube_context: + cmd_helm_install.extend(["--kube-context", kube_context]) + + # if not no_wait: + # # Change --timeout format for helm client to understand + # onboarding_timeout = onboarding_timeout + "s" + # cmd_helm_install.extend(["--wait", "--timeout", "{}".format(onboarding_timeout)]) + + response_helm_install = Popen(cmd_helm_install, stdout=PIPE, stderr=PIPE) + _, error_helm_install = response_helm_install.communicate() + if response_helm_install.returncode != 0: + if ('forbidden' in error_helm_install.decode("ascii") or 'timed out waiting for the condition' in error_helm_install.decode("ascii")): + telemetry.set_user_fault() + telemetry.set_exception(exception=error_helm_install.decode("ascii"), fault_type=consts.Install_HelmRelease_Fault_Type, + summary='Unable to install helm release') + logger.warning("Please check if the azure-arc namespace was deployed and run 'kubectl get pods -n azure-arc' to check if all the pods are in running state. A possible cause for pods stuck in pending state could be insufficient resources on the kubernetes cluster to onboard to arc.") + raise CLIInternalError("Unable to install helm release: " + error_helm_install.decode("ascii")) diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 162f493e99b..a53ed5c796e 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -48,6 +48,7 @@ import azext_connectedk8s._utils as utils import azext_connectedk8s._clientproxyutils as clientproxyutils import azext_connectedk8s._troubleshootutils as troubleshootutils +import azext_connectedk8s._precheckutils as precheckutils from glob import glob from .vendored_sdks.models import ConnectedCluster, ConnectedClusterIdentity, ConnectedClusterPatch, ListClusterUserCredentialProperties from .vendored_sdks.preview_2022_10_01.models import ConnectedCluster as ConnectedClusterPreview @@ -137,6 +138,35 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat api_instance = kube_client.CoreV1Api() node_api_response = utils.validate_node_api_response(api_instance, None) + try: + absolute_path = os.path.abspath(os.path.dirname(__file__)) + kubectl_client_location = install_kubectl_client() + helm_client_location=install_helm_client() + release_namespace = get_release_namespace(kube_config, kube_context, helm_client_location) + diagnostic_checks = "Failed" + + batchv1_api_instance = kube_client.BatchV1Api() + corev1_api_instance = kube_client.CoreV1Api() + # Performing diagnoser container check + diagnostic_checks = precheckutils.check_diagnoser_container(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert) + # print(diagnostic_checks) + # If all the checks passed then display no error found + all_checks_passed = True + # for checks in diagnostic_checks: + if diagnostic_checks != consts.Diagnostic_Check_Passed: + all_checks_passed = False + + + # Handling the user manual interrupt + except KeyboardInterrupt: + # except Exception as e: # pylint: disable=broad-except + # logger.warning("An exception has occured") + raise ManualInterrupt('Process terminated .') + + if(all_checks_passed == False): + logger.warning("connect prechecks failed (dns or outbound)") + return + required_node_exists = check_linux_amd64_node(node_api_response) if not required_node_exists: telemetry.set_user_fault() From 8b7f6d3b328f0a1462fad5821bf02f694e63b296 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Mon, 26 Dec 2022 11:33:39 +0530 Subject: [PATCH 06/62] modified: src/connectedk8s/HISTORY.rst --- src/connectedk8s/HISTORY.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectedk8s/HISTORY.rst b/src/connectedk8s/HISTORY.rst index 7159d3e4974..31716a1179c 100644 --- a/src/connectedk8s/HISTORY.rst +++ b/src/connectedk8s/HISTORY.rst @@ -5,7 +5,7 @@ Release History 1.3.7 ++++++ -* Removing detection of infrasturcture and distribution in connectedk8s connect +* Added DNS and outbound connectivity prechecks in connect command 1.3.6 ++++++ From 03e59e8e1f0bf912c23f2d610c77d2be84a925c1 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Mon, 26 Dec 2022 11:38:47 +0530 Subject: [PATCH 07/62] modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py --- .../azext_connectedk8s/_precheckutils.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index d828580efc5..3ab82bcf420 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -52,7 +52,6 @@ def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, absolut try: # Setting DNS and Outbound Check as working dns_check = "Starting" - outbound_connectivity_check = "Starting" # Executing the Diagnoser job and fetching diagnoser logs obtained diagnoser_container_log = executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert) @@ -103,13 +102,13 @@ def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, absolut telemetry.set_exception(exception=e, fault_type=consts.Diagnoser_Container_Check_Failed_Fault_Type, summary="Error occured while performing the diagnoser container checks") return consts.Diagnostic_Check_Incomplete - + def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert): job_name = "connect-precheck-diagnoser-job" # yaml_file_path = os.path.join(absolute_path, "connect-precheck-diagnoser-file.yaml") # Setting the log output as Empty diagnoser_container_log = "" - + # cmd_delete_job = [kubectl_client_location, "delete", "-f", ""] # if kube_config: # cmd_delete_job.extend(["--kubeconfig", kube_config]) @@ -121,7 +120,7 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ cmd_helm_delete.extend(["--kubeconfig", kube_config]) if kube_context: cmd_helm_delete.extend(["--context", kube_context]) - + # print("deleteing connect-precheck helm release if present") # To handle the user keyboard Interrupt try: @@ -183,14 +182,13 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ is_job_complete = True w.stop() except Exception as e: - # print("exception") # print(e) continue else: # print("passed") continue - + if (is_job_scheduled is False): logger.warning("Unable to schedule the connect precheck diagnoser job in the kubernetes cluster. The possible reasons can be presence of a security policy or security context constraint (SCC) or it may happen becuase of lack of ResourceQuota.\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) @@ -214,16 +212,16 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ print(diagnoser_container_log) # Clearing all the resources after fetching the diagnoser container logs # Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) - + # To handle any exception that may occur during the execution except Exception as e: logger.warning("An exception has occured while trying to execute the diagnoser job in the cluster. Exception: {}".format(str(e)) + "\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) telemetry.set_exception(exception=e, fault_type=consts.Diagnoser_Job_Failed_Fault_Type, summary="Error while executing Diagnoser Job") return - + return diagnoser_container_log - + def check_cluster_DNS(dns_check_log): try: @@ -250,7 +248,6 @@ def check_cluster_DNS(dns_check_log): return consts.Diagnostic_Check_Incomplete - def check_cluster_outbound_connectivity(outbound_connectivity_check_log): global diagnoser_output @@ -316,7 +313,6 @@ def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_locati summary='Unable to pull helm chart from the registry') raise CLIInternalError("Unable to pull helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) - def export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location): # print("export chart ") cmd_helm_chart_export = [helm_client_location, "chart", "export", registry_path, "--destination", chart_export_path] @@ -331,7 +327,6 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex summary='Unable to export helm chart from the registry') raise CLIInternalError("Unable to export helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) - def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="200"): # print("installing release") From 64c1935691f1a20f94da3160e98a9168086f5b7c Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Mon, 26 Dec 2022 13:46:26 +0530 Subject: [PATCH 08/62] modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- .../azext_connectedk8s/_precheckutils.py | 51 +++++++++---------- src/connectedk8s/azext_connectedk8s/custom.py | 6 +-- 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 3ab82bcf420..1409cabd695 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -3,21 +3,6 @@ # Licensed under the MIT License. See License.txt in the project root for license information. # -------------------------------------------------------------------------------------------- - -from argparse import Namespace -from pydoc import cli -from kubernetes import client, config, watch, utils -from logging import exception -import yaml -import json -import datetime -from subprocess import Popen, PIPE, run, STDOUT, call, DEVNULL -import shutil -from knack.log import get_logger -from azure.cli.core import telemetry -import azext_connectedk8s._constants as consts -logger = get_logger(__name__) -# pylint: disable=unused-argument, too-many-locals, too-many-branches, too-many-statements, line-too-long import os import shutil import subprocess @@ -43,11 +28,24 @@ from kubernetes import client as kube_client from azure.cli.core import get_default_cli from azure.cli.core.azclierror import CLIInternalError, ClientRequestError, ArgumentUsageError, ManualInterrupt, AzureResponseError, AzureInternalError, ValidationError - +from argparse import Namespace +from pydoc import cli +from kubernetes import client, config, watch, utils +from logging import exception +import yaml +import json +import datetime +from subprocess import Popen, PIPE, run, STDOUT, call, DEVNULL +import shutil +from knack.log import get_logger +from azure.cli.core import telemetry +import azext_connectedk8s._constants as consts logger = get_logger(__name__) - # pylint: disable=unused-argument, too-many-locals, too-many-branches, too-many-statements, line-too-long # pylint: disable +logger = get_logger(__name__) + + def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert): try: # Setting DNS and Outbound Check as working @@ -75,7 +73,7 @@ def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, absolut dns_check = check_cluster_DNS(dns_check_log) # print("after dns") # print(diagnoser_container_log_list[-1]) - outbound_connectivity_check= check_cluster_outbound_connectivity(diagnoser_container_log_list[-1]) + outbound_connectivity_check = check_cluster_outbound_connectivity(diagnoser_container_log_list[-1]) else: # print("if test cannot start") return consts.Diagnostic_Check_Incomplete @@ -87,9 +85,9 @@ def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, absolut # If any of the check remain Incomplete than we will return Incomplete elif(dns_check == consts.Diagnostic_Check_Incomplete or outbound_connectivity_check == consts.Diagnostic_Check_Incomplete): # print("if 2") - if dns_check == consts.Diagnostic_Check_Incomplete : + if dns_check == consts.Diagnostic_Check_Incomplete: print("DNS DIDNT WORK") - if outbound_connectivity_check == consts.Diagnostic_Check_Incomplete : + if outbound_connectivity_check == consts.Diagnostic_Check_Incomplete: print("DNS DIDNT WORK") return consts.Diagnostic_Check_Incomplete else: @@ -103,6 +101,7 @@ def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, absolut return consts.Diagnostic_Check_Incomplete + def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert): job_name = "connect-precheck-diagnoser-job" # yaml_file_path = os.path.join(absolute_path, "connect-precheck-diagnoser-file.yaml") @@ -126,7 +125,6 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ try: # Executing the diagnoser_job.yaml config.load_kube_config(kube_config, kube_context) - k8s_client = client.ApiClient() # Attempting deletion of diagnoser resources to handle the scenario if any stale resources are present response_kubectl_delete_helm = Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) output_kubectl_delete_helm, error_kubectl_delete_helm = response_kubectl_delete_helm.communicate() @@ -154,8 +152,7 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ try: chart_path = get_chart_path(consts.Connect_Precheck_Job_Registry_Path, kube_config, kube_context, helm_client_location) - helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, - kube_context, helm_client_location) + helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location) # To handle the Exception that occured except Exception as e: # print("helm not installed and job not applied") @@ -223,7 +220,6 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ return diagnoser_container_log def check_cluster_DNS(dns_check_log): - try: if consts.DNS_Check_Result_String not in dns_check_log: # print("dns prob") @@ -248,8 +244,8 @@ def check_cluster_DNS(dns_check_log): return consts.Diagnostic_Check_Incomplete -def check_cluster_outbound_connectivity(outbound_connectivity_check_log): +def check_cluster_outbound_connectivity(outbound_connectivity_check_log): global diagnoser_output try: outbound_connectivity_response = outbound_connectivity_check_log[-1:-4:-1] @@ -276,6 +272,7 @@ def check_cluster_outbound_connectivity(outbound_connectivity_check_log): return consts.Diagnostic_Check_Incomplete + def get_chart_path(registry_path, kube_config, kube_context, helm_client_location): # print("getting chart path") # Pulling helm chart from registry @@ -313,6 +310,7 @@ def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_locati summary='Unable to pull helm chart from the registry') raise CLIInternalError("Unable to pull helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) + def export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location): # print("export chart ") cmd_helm_chart_export = [helm_client_location, "chart", "export", registry_path, "--destination", chart_export_path] @@ -327,6 +325,7 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex summary='Unable to export helm chart from the registry') raise CLIInternalError("Unable to export helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) + def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="200"): # print("installing release") @@ -361,4 +360,4 @@ def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_ce telemetry.set_exception(exception=error_helm_install.decode("ascii"), fault_type=consts.Install_HelmRelease_Fault_Type, summary='Unable to install helm release') logger.warning("Please check if the azure-arc namespace was deployed and run 'kubectl get pods -n azure-arc' to check if all the pods are in running state. A possible cause for pods stuck in pending state could be insufficient resources on the kubernetes cluster to onboard to arc.") - raise CLIInternalError("Unable to install helm release: " + error_helm_install.decode("ascii")) + raise CLIInternalError("Unable to install helm release: " + error_helm_install.decode("ascii")) \ No newline at end of file diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index a53ed5c796e..59a313ae670 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -141,10 +141,9 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat try: absolute_path = os.path.abspath(os.path.dirname(__file__)) kubectl_client_location = install_kubectl_client() - helm_client_location=install_helm_client() + helm_client_location = install_helm_client() release_namespace = get_release_namespace(kube_config, kube_context, helm_client_location) diagnostic_checks = "Failed" - batchv1_api_instance = kube_client.BatchV1Api() corev1_api_instance = kube_client.CoreV1Api() # Performing diagnoser container check @@ -155,7 +154,6 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat # for checks in diagnostic_checks: if diagnostic_checks != consts.Diagnostic_Check_Passed: all_checks_passed = False - # Handling the user manual interrupt except KeyboardInterrupt: @@ -163,7 +161,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat # logger.warning("An exception has occured") raise ManualInterrupt('Process terminated .') - if(all_checks_passed == False): + if all_checks_passed == False: logger.warning("connect prechecks failed (dns or outbound)") return From 970830d5f6f2a58ab821f7470a37d9ddec849ce1 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Mon, 26 Dec 2022 15:08:22 +0530 Subject: [PATCH 09/62] modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- .../azext_connectedk8s/_precheckutils.py | 15 +++++++++------ src/connectedk8s/azext_connectedk8s/custom.py | 2 +- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 1409cabd695..bc061d3688e 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -219,6 +219,7 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ return diagnoser_container_log + def check_cluster_DNS(dns_check_log): try: if consts.DNS_Check_Result_String not in dns_check_log: @@ -294,6 +295,7 @@ def get_chart_path(registry_path, kube_config, kube_context, helm_client_locatio chart_path = os.getenv('HELMCHART') if os.getenv('HELMCHART') else helm_chart_path return chart_path + def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location): # print("pulling helm chart") cmd_helm_chart_pull = [helm_client_location, "chart", "pull", registry_path] @@ -327,10 +329,10 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, - kube_config, kube_context, helm_client_location, onboarding_timeout="200"): +kube_config, kube_context, helm_client_location, onboarding_timeout="200"): # print("installing release") # print(chart_path) - cmd_helm_install = [helm_client_location, "upgrade", "--install", "connect-precheck-diagnoser", chart_path , "--debug"] + cmd_helm_install = [helm_client_location, "upgrade", "--install", "connect-precheck-diagnoser", chart_path, "--debug"] # print("before cmd helm install") # To set some other helm parameters through file if https_proxy: @@ -346,12 +348,12 @@ def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_ce cmd_helm_install.extend(["--kubeconfig", kube_config]) if kube_context: cmd_helm_install.extend(["--kube-context", kube_context]) - + # if not no_wait: # # Change --timeout format for helm client to understand # onboarding_timeout = onboarding_timeout + "s" - # cmd_helm_install.extend(["--wait", "--timeout", "{}".format(onboarding_timeout)]) - + # cmd_helm_install.extend(["--wait", "--timeout", "{}".format(onboarding_timeout)]) + response_helm_install = Popen(cmd_helm_install, stdout=PIPE, stderr=PIPE) _, error_helm_install = response_helm_install.communicate() if response_helm_install.returncode != 0: @@ -360,4 +362,5 @@ def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_ce telemetry.set_exception(exception=error_helm_install.decode("ascii"), fault_type=consts.Install_HelmRelease_Fault_Type, summary='Unable to install helm release') logger.warning("Please check if the azure-arc namespace was deployed and run 'kubectl get pods -n azure-arc' to check if all the pods are in running state. A possible cause for pods stuck in pending state could be insufficient resources on the kubernetes cluster to onboard to arc.") - raise CLIInternalError("Unable to install helm release: " + error_helm_install.decode("ascii")) \ No newline at end of file + raise CLIInternalError("Unable to install helm release: " + error_helm_install.decode("ascii")) + \ No newline at end of file diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 59a313ae670..b576e9be12a 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -161,7 +161,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat # logger.warning("An exception has occured") raise ManualInterrupt('Process terminated .') - if all_checks_passed == False: + if all_checks_passed is False: logger.warning("connect prechecks failed (dns or outbound)") return From d92e0f17817908aba6aed6e596c6d90c1279611e Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Mon, 26 Dec 2022 15:30:58 +0530 Subject: [PATCH 10/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py --- src/connectedk8s/azext_connectedk8s/_constants.py | 2 +- src/connectedk8s/azext_connectedk8s/_precheckutils.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 65ba651fc20..cb54dc06cce 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -177,7 +177,7 @@ Outbound_Network_Connectivity_Check = "outbound_network_connectivity_check.txt" Events_of_Incomplete_Diagnoser_Job = "diagnoser_failure_events.txt" Connect_Precheck_Job_Registry_Path = "connectprecheck.azurecr.io/helm/connect-precheck-diagnoser:0.1.0" -Connect_Precheck_Job_Version= "0.1.0" +Connect_Precheck_Job_Version = "0.1.0" # Diagnostic Results Name Outbound_Connectivity_Check_Result_String = "Outbound Network Connectivity Result:" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index bc061d3688e..12099357b03 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -352,7 +352,7 @@ def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_ce # if not no_wait: # # Change --timeout format for helm client to understand # onboarding_timeout = onboarding_timeout + "s" - # cmd_helm_install.extend(["--wait", "--timeout", "{}".format(onboarding_timeout)]) + # cmd_helm_install.extend(["--wait", "--timeout", "{}".format(onboarding_timeout)]) response_helm_install = Popen(cmd_helm_install, stdout=PIPE, stderr=PIPE) _, error_helm_install = response_helm_install.communicate() @@ -363,4 +363,3 @@ def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_ce summary='Unable to install helm release') logger.warning("Please check if the azure-arc namespace was deployed and run 'kubectl get pods -n azure-arc' to check if all the pods are in running state. A possible cause for pods stuck in pending state could be insufficient resources on the kubernetes cluster to onboard to arc.") raise CLIInternalError("Unable to install helm release: " + error_helm_install.decode("ascii")) - \ No newline at end of file From 3560a39efd7c8e7d2fe76aa91c9f2f077716b085 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Mon, 26 Dec 2022 15:47:13 +0530 Subject: [PATCH 11/62] modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py --- src/connectedk8s/azext_connectedk8s/_precheckutils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 12099357b03..273c7b90f0b 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -328,8 +328,7 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex raise CLIInternalError("Unable to export helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) -def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, -kube_config, kube_context, helm_client_location, onboarding_timeout="200"): +def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="200"): # print("installing release") # print(chart_path) cmd_helm_install = [helm_client_location, "upgrade", "--install", "connect-precheck-diagnoser", chart_path, "--debug"] From 199f8b140ff72b289228b0bdf11ecc7c3e04b451 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Tue, 27 Dec 2022 11:23:02 +0530 Subject: [PATCH 12/62] modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- .../azext_connectedk8s/_precheckutils.py | 56 ++----------------- src/connectedk8s/azext_connectedk8s/custom.py | 9 ++- 2 files changed, 10 insertions(+), 55 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 273c7b90f0b..a4781378519 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -53,7 +53,6 @@ def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, absolut outbound_connectivity_check = "Starting" # Executing the Diagnoser job and fetching diagnoser logs obtained diagnoser_container_log = executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert) - # print(diagnoser_container_log) # If diagnoser_container_log is not empty then only we will check for the results if(diagnoser_container_log is not None and diagnoser_container_log != ""): diagnoser_container_log_list = diagnoser_container_log.split("\n") @@ -69,29 +68,18 @@ def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, absolut counter_container_logs = 0 elif counter_container_logs == 0: dns_check_log += " " + outputs - # print(dns_check_log) dns_check = check_cluster_DNS(dns_check_log) - # print("after dns") - # print(diagnoser_container_log_list[-1]) outbound_connectivity_check = check_cluster_outbound_connectivity(diagnoser_container_log_list[-1]) else: - # print("if test cannot start") return consts.Diagnostic_Check_Incomplete # If both the check passed then we will return Diagnoser checks Passed if(dns_check == consts.Diagnostic_Check_Passed and outbound_connectivity_check == consts.Diagnostic_Check_Passed): - # print("if 1") return consts.Diagnostic_Check_Passed # If any of the check remain Incomplete than we will return Incomplete elif(dns_check == consts.Diagnostic_Check_Incomplete or outbound_connectivity_check == consts.Diagnostic_Check_Incomplete): - # print("if 2") - if dns_check == consts.Diagnostic_Check_Incomplete: - print("DNS DIDNT WORK") - if outbound_connectivity_check == consts.Diagnostic_Check_Incomplete: - print("DNS DIDNT WORK") return consts.Diagnostic_Check_Incomplete else: - # print("if 3") return consts.Diagnostic_Check_Failed # To handle any exception that may occur during the execution @@ -104,23 +92,15 @@ def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, absolut def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert): job_name = "connect-precheck-diagnoser-job" - # yaml_file_path = os.path.join(absolute_path, "connect-precheck-diagnoser-file.yaml") # Setting the log output as Empty diagnoser_container_log = "" - # cmd_delete_job = [kubectl_client_location, "delete", "-f", ""] - # if kube_config: - # cmd_delete_job.extend(["--kubeconfig", kube_config]) - # if kube_context: - # cmd_delete_job.extend(["--context", kube_context]) - cmd_helm_delete = [helm_client_location, "uninstall", "connect-precheck-diagnoser"] if kube_config: cmd_helm_delete.extend(["--kubeconfig", kube_config]) if kube_context: cmd_helm_delete.extend(["--context", kube_context]) - # print("deleteing connect-precheck helm release if present") # To handle the user keyboard Interrupt try: # Executing the diagnoser_job.yaml @@ -144,18 +124,15 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ exception_occured_counter = 1 # If any exception occured we will print the exception and return if exception_occured_counter == 1: - # print(valid_exception_list) logger.warning("An error occured while installing the connect precheck helm release in the cluster. Exception:") # telemetry.set_exception(exception=error_helm_get_values.decode("ascii"), fault_type=consts.Diagnoser_Job_Failed_Fault_Type, summary="Error while executing Diagnoser Job") return - # print("installing connect-precheck helm release") try: chart_path = get_chart_path(consts.Connect_Precheck_Job_Registry_Path, kube_config, kube_context, helm_client_location) helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location) # To handle the Exception that occured except Exception as e: - # print("helm not installed and job not applied") logger.warning("An error occured while deploying the connect precheck diagnoser job in the cluster. Exception:") logger.warning(str(e)) # telemetry.set_exception(exception=error_helm_get_values.decode("ascii"), fault_type=consts.Diagnoser_Job_Failed_Fault_Type, summary="Error while executing Diagnoser Job") @@ -171,19 +148,14 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ try: # Checking if job get scheduled or not if event["object"].metadata.name == "connect-precheck-diagnoser-job": - # print("job scheduled") is_job_scheduled = True # Checking if job reached completed stage or not if event["object"].metadata.name == "connect-precheck-diagnoser-job" and event["object"].status.conditions[0].type == "Complete": - # print("job complete") is_job_complete = True w.stop() except Exception as e: - # print("exception") - # print(e) continue else: - # print("passed") continue if (is_job_scheduled is False): @@ -191,11 +163,9 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return elif (is_job_scheduled is True and is_job_complete is False): - # print("scheduled not completed") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return else: - # print("Scheduled and finished job") # Fetching the Diagnoser Container logs all_pods = corev1_api_instance.list_namespaced_pod('default') # Traversing through all agents @@ -203,12 +173,10 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ # Fetching the current Pod name and creating a folder with that name inside the timestamp folder pod_name = each_pod.metadata.name if(pod_name.startswith(job_name)): - # print("inside making diagnoser container log") # Creating a text file with the name of the container and adding that containers logs in it diagnoser_container_log = corev1_api_instance.read_namespaced_pod_log(name=pod_name, container="connect-precheck-diagnoser-container", namespace='default') - print(diagnoser_container_log) # Clearing all the resources after fetching the diagnoser container logs - # Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) + Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) # To handle any exception that may occur during the execution except Exception as e: @@ -223,7 +191,6 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ def check_cluster_DNS(dns_check_log): try: if consts.DNS_Check_Result_String not in dns_check_log: - # print("dns prob") return consts.Diagnostic_Check_Incomplete formatted_dns_log = dns_check_log.replace('\t', '') # Validating if DNS is working or not and displaying proper result @@ -247,12 +214,10 @@ def check_cluster_DNS(dns_check_log): def check_cluster_outbound_connectivity(outbound_connectivity_check_log): - global diagnoser_output try: outbound_connectivity_response = outbound_connectivity_check_log[-1:-4:-1] outbound_connectivity_response = outbound_connectivity_response[::-1] if consts.Outbound_Connectivity_Check_Result_String not in outbound_connectivity_check_log: - # print("outbound prob") return consts.Diagnostic_Check_Incomplete # Validating if outbound connectiivty is working or not and displaying proper result if(outbound_connectivity_response != "000"): @@ -275,7 +240,7 @@ def check_cluster_outbound_connectivity(outbound_connectivity_check_log): def get_chart_path(registry_path, kube_config, kube_context, helm_client_location): - # print("getting chart path") + # Pulling helm chart from registry os.environ['HELM_EXPERIMENTAL_OCI'] = '1' pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location) @@ -284,7 +249,6 @@ def get_chart_path(registry_path, kube_config, kube_context, helm_client_locatio chart_export_path = os.path.join(os.path.expanduser('~'), '.azure', 'ConnectPrecheckCharts') try: if os.path.isdir(chart_export_path): - # print("found the chart") shutil.rmtree(chart_export_path) except: logger.warning("Unable to cleanup the connect-precheck helm charts already present on the machine. In case of failure, please cleanup the directory '%s' and try again.", chart_export_path) @@ -297,10 +261,7 @@ def get_chart_path(registry_path, kube_config, kube_context, helm_client_locatio def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location): - # print("pulling helm chart") cmd_helm_chart_pull = [helm_client_location, "chart", "pull", registry_path] - # cmd_helm_chart_pull = [helm_client_location, "fetch", registry_path] - # cmd_helm_chart_pull.extend(["--version", consts.Connect_Precheck_Job_Version]) if kube_config: cmd_helm_chart_pull.extend(["--kubeconfig", kube_config]) if kube_context: @@ -314,7 +275,6 @@ def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_locati def export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location): - # print("export chart ") cmd_helm_chart_export = [helm_client_location, "chart", "export", registry_path, "--destination", chart_export_path] if kube_config: cmd_helm_chart_export.extend(["--kubeconfig", kube_config]) @@ -328,11 +288,8 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex raise CLIInternalError("Unable to export helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) -def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="200"): - # print("installing release") - # print(chart_path) +def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="120"): cmd_helm_install = [helm_client_location, "upgrade", "--install", "connect-precheck-diagnoser", chart_path, "--debug"] - # print("before cmd helm install") # To set some other helm parameters through file if https_proxy: cmd_helm_install.extend(["--set", "global.httpsProxy={}".format(https_proxy)]) @@ -348,10 +305,9 @@ def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_ce if kube_context: cmd_helm_install.extend(["--kube-context", kube_context]) - # if not no_wait: - # # Change --timeout format for helm client to understand - # onboarding_timeout = onboarding_timeout + "s" - # cmd_helm_install.extend(["--wait", "--timeout", "{}".format(onboarding_timeout)]) + # Change --timeout format for helm client to understand + onboarding_timeout = onboarding_timeout + "s" + cmd_helm_install.extend(["--wait", "--timeout", "{}".format(onboarding_timeout)]) response_helm_install = Popen(cmd_helm_install, stdout=PIPE, stderr=PIPE) _, error_helm_install = response_helm_install.communicate() diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index b576e9be12a..08e268e7ca8 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -148,21 +148,20 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat corev1_api_instance = kube_client.CoreV1Api() # Performing diagnoser container check diagnostic_checks = precheckutils.check_diagnoser_container(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert) - # print(diagnostic_checks) # If all the checks passed then display no error found all_checks_passed = True # for checks in diagnostic_checks: if diagnostic_checks != consts.Diagnostic_Check_Passed: all_checks_passed = False + except Exception as e: + logger.warning("Exception occured : {}".format(str(e))) + # Handling the user manual interrupt except KeyboardInterrupt: - # except Exception as e: # pylint: disable=broad-except - # logger.warning("An exception has occured") - raise ManualInterrupt('Process terminated .') + raise ManualInterrupt('Process terminated externally.') if all_checks_passed is False: - logger.warning("connect prechecks failed (dns or outbound)") return required_node_exists = check_linux_amd64_node(node_api_response) From a2786100de0099a50a9a39ab3b2fe4cba297b3d0 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Tue, 27 Dec 2022 13:33:16 +0530 Subject: [PATCH 13/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py --- .../azext_connectedk8s/_constants.py | 8 ++- .../azext_connectedk8s/_precheckutils.py | 51 ++++++++++--------- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index cb54dc06cce..ac4dfa0423b 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -176,9 +176,15 @@ K8s_Cluster_Info = "k8s_cluster_info.txt" Outbound_Network_Connectivity_Check = "outbound_network_connectivity_check.txt" Events_of_Incomplete_Diagnoser_Job = "diagnoser_failure_events.txt" +#Precheck Diagnoser consts Connect_Precheck_Job_Registry_Path = "connectprecheck.azurecr.io/helm/connect-precheck-diagnoser:0.1.0" Connect_Precheck_Job_Version = "0.1.0" - +Precheck_Diagnoser_Container_Check_Failed_Fault_Type = "Error occured while performing the connect precheck diagnoser diagnoser" +Precheck_Diagnoser_Helm_Release_Failed_Fault_Type = "Error while installing Precheck Diagnoser helm release" +Precheck_Diagnoser_Failed_Fault_Type = "Error while executing connect precheck Diagnoser Job" +Precheck_Diagnoser_Pull_HelmChart_Fault_Type = 'precheck-diagnoser-helm-chart-pull-error' +Precheck_Diagnoser_Export_HelmChart_Fault_Type = 'precheck-diagnoser-helm-chart-export-error' +Precheck_Diagnoser_Install_HelmRelease_Fault_Type = 'precheck-diagnoser-helm-release-install-error' # Diagnostic Results Name Outbound_Connectivity_Check_Result_String = "Outbound Network Connectivity Result:" DNS_Check_Result_String = "DNS Result:" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index a4781378519..a3162e07817 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -84,8 +84,8 @@ def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, absolut # To handle any exception that may occur during the execution except Exception as e: - logger.warning("An exception has occured while trying to perform diagnoser container check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Diagnoser_Container_Check_Failed_Fault_Type, summary="Error occured while performing the diagnoser container checks") + logger.warning("An exception has occured while trying to perform prechecks diagnoser container on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Precheck_Diagnoser_Container_Check_Failed_Fault_Type, summary="Error occured while performing the prechecks diagnoser container") return consts.Diagnostic_Check_Incomplete @@ -105,7 +105,7 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ try: # Executing the diagnoser_job.yaml config.load_kube_config(kube_config, kube_context) - # Attempting deletion of diagnoser resources to handle the scenario if any stale resources are present + # Attempting deletion of precheck diagnoser resources to handle the scenario if any stale resources are present response_kubectl_delete_helm = Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) output_kubectl_delete_helm, error_kubectl_delete_helm = response_kubectl_delete_helm.communicate() # If any error occured while execution of delete command @@ -125,7 +125,7 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ # If any exception occured we will print the exception and return if exception_occured_counter == 1: logger.warning("An error occured while installing the connect precheck helm release in the cluster. Exception:") - # telemetry.set_exception(exception=error_helm_get_values.decode("ascii"), fault_type=consts.Diagnoser_Job_Failed_Fault_Type, summary="Error while executing Diagnoser Job") + telemetry.set_exception(exception=error_kubectl_delete_helm.decode("ascii"), fault_type=consts.Precheck_Diagnoser_Failed_Fault_Type, summary="Error while executing connect precheck Diagnoser Job") return try: chart_path = get_chart_path(consts.Connect_Precheck_Job_Registry_Path, kube_config, kube_context, helm_client_location) @@ -133,9 +133,9 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location) # To handle the Exception that occured except Exception as e: - logger.warning("An error occured while deploying the connect precheck diagnoser job in the cluster. Exception:") + logger.warning("An error occured while installing helm release of connect precheck diagnoser in the cluster. Exception:") logger.warning(str(e)) - # telemetry.set_exception(exception=error_helm_get_values.decode("ascii"), fault_type=consts.Diagnoser_Job_Failed_Fault_Type, summary="Error while executing Diagnoser Job") + telemetry.set_exception(exception=e, fault_type=consts.Precheck_Diagnoser_Helm_Release_Failed_Fault_Type, summary="Error while installing Precheck Diagnoser helm release") # Deleting all the stale resources that got created Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return @@ -144,7 +144,7 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ is_job_complete = False is_job_scheduled = False # To watch for changes in the pods states till it reach completed state or exit if it takes more than 180 seconds - for event in w.stream(batchv1_api_instance.list_namespaced_job, namespace='default', label_selector="", timeout_seconds=90): + for event in w.stream(batchv1_api_instance.list_namespaced_job, namespace='default', label_selector="", timeout_seconds=60): try: # Checking if job get scheduled or not if event["object"].metadata.name == "connect-precheck-diagnoser-job": @@ -180,9 +180,9 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ # To handle any exception that may occur during the execution except Exception as e: - logger.warning("An exception has occured while trying to execute the diagnoser job in the cluster. Exception: {}".format(str(e)) + "\n") + logger.warning("An exception has occured while trying to execute the connect precheck diagnoser in the cluster. Exception: {}".format(str(e)) + "\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) - telemetry.set_exception(exception=e, fault_type=consts.Diagnoser_Job_Failed_Fault_Type, summary="Error while executing Diagnoser Job") + telemetry.set_exception(exception=e, fault_type=consts.Precheck_Diagnoser_Failed_Fault_Type, summary="Error while executing Precheck Diagnoser Job") return return diagnoser_container_log @@ -203,12 +203,14 @@ def check_cluster_DNS(dns_check_log): # For handling storage or OS exception that may occur during the execution except OSError as e: logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing cluster DNS check") + telemetry.set_user_fault() + telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing Precheck - cluster DNS") # To handle any exception that may occur during the execution except Exception as e: logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing cluster DNS check") + telemetry.set_user_fault() + telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing Precheck - cluster DNS") return consts.Diagnostic_Check_Incomplete @@ -229,12 +231,14 @@ def check_cluster_outbound_connectivity(outbound_connectivity_check_log): # For handling storage or OS exception that may occur during the execution except OSError as e: logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing outbound connectivity check in the cluster") + telemetry.set_user_fault() + telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing Precheck - outbound connectivity in the cluster") # To handle any exception that may occur during the execution except Exception as e: logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing outbound connectivity check in the cluster") + telemetry.set_user_fault() + telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing Precheck - outbound connectivity in the cluster") return consts.Diagnostic_Check_Incomplete @@ -269,9 +273,9 @@ def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_locati response_helm_chart_pull = subprocess.Popen(cmd_helm_chart_pull, stdout=PIPE, stderr=PIPE) _, error_helm_chart_pull = response_helm_chart_pull.communicate() if response_helm_chart_pull.returncode != 0: - telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pull_HelmChart_Fault_Type, - summary='Unable to pull helm chart from the registry') - raise CLIInternalError("Unable to pull helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) + telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Precheck_Diagnoser_Pull_HelmChart_Fault_Type, + summary='Unable to pull connect precheck helm chart from the registry') + raise CLIInternalError("Unable to pull connect precheck helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) def export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location): @@ -283,12 +287,12 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex response_helm_chart_export = subprocess.Popen(cmd_helm_chart_export, stdout=PIPE, stderr=PIPE) _, error_helm_chart_export = response_helm_chart_export.communicate() if response_helm_chart_export.returncode != 0: - telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Export_HelmChart_Fault_Type, - summary='Unable to export helm chart from the registry') - raise CLIInternalError("Unable to export helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) + telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Precheck_Diagnoser_Export_HelmChart_Fault_Type, + summary='Unable to export connect precheck helm chart from the registry') + raise CLIInternalError("Unable to export connect precheck helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) -def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="120"): +def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="60"): cmd_helm_install = [helm_client_location, "upgrade", "--install", "connect-precheck-diagnoser", chart_path, "--debug"] # To set some other helm parameters through file if https_proxy: @@ -314,7 +318,6 @@ def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_ce if response_helm_install.returncode != 0: if ('forbidden' in error_helm_install.decode("ascii") or 'timed out waiting for the condition' in error_helm_install.decode("ascii")): telemetry.set_user_fault() - telemetry.set_exception(exception=error_helm_install.decode("ascii"), fault_type=consts.Install_HelmRelease_Fault_Type, - summary='Unable to install helm release') - logger.warning("Please check if the azure-arc namespace was deployed and run 'kubectl get pods -n azure-arc' to check if all the pods are in running state. A possible cause for pods stuck in pending state could be insufficient resources on the kubernetes cluster to onboard to arc.") - raise CLIInternalError("Unable to install helm release: " + error_helm_install.decode("ascii")) + telemetry.set_exception(exception=error_helm_install.decode("ascii"), fault_type=consts.Precheck_Diagnoser_Install_HelmRelease_Fault_Type, + summary='Unable to install connect precheck helm release') + raise CLIInternalError("Unable to install connect precheck helm release: " + error_helm_install.decode("ascii")) From 07469aeeb8dec464818ad327a110e2fc37f9944f Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Tue, 27 Dec 2022 13:50:36 +0530 Subject: [PATCH 14/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py --- src/connectedk8s/azext_connectedk8s/_constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index ac4dfa0423b..b92af7f0b0c 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -176,7 +176,7 @@ K8s_Cluster_Info = "k8s_cluster_info.txt" Outbound_Network_Connectivity_Check = "outbound_network_connectivity_check.txt" Events_of_Incomplete_Diagnoser_Job = "diagnoser_failure_events.txt" -#Precheck Diagnoser consts +# Connect Precheck Diagnoser constants Connect_Precheck_Job_Registry_Path = "connectprecheck.azurecr.io/helm/connect-precheck-diagnoser:0.1.0" Connect_Precheck_Job_Version = "0.1.0" Precheck_Diagnoser_Container_Check_Failed_Fault_Type = "Error occured while performing the connect precheck diagnoser diagnoser" From 278556de0ba33a4fe5e777f1620a2b284dfc5f06 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Tue, 3 Jan 2023 00:31:55 +0530 Subject: [PATCH 15/62] modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py --- src/connectedk8s/azext_connectedk8s/_precheckutils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index a3162e07817..193af62de3f 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -293,7 +293,7 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="60"): - cmd_helm_install = [helm_client_location, "upgrade", "--install", "connect-precheck-diagnoser", chart_path, "--debug"] + cmd_helm_install = [helm_client_location, "upgrade", "--install", "connect-precheck-diagnoser", chart_path] # To set some other helm parameters through file if https_proxy: cmd_helm_install.extend(["--set", "global.httpsProxy={}".format(https_proxy)]) From e7896f21a3a318b2bb7a1fe0f6d8a63417fa1cc5 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Tue, 3 Jan 2023 10:42:26 +0530 Subject: [PATCH 16/62] modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py --- src/connectedk8s/azext_connectedk8s/_precheckutils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 193af62de3f..c8d66f12dc3 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -159,10 +159,11 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ continue if (is_job_scheduled is False): - logger.warning("Unable to schedule the connect precheck diagnoser job in the kubernetes cluster. The possible reasons can be presence of a security policy or security context constraint (SCC) or it may happen becuase of lack of ResourceQuota.\n") + logger.warning("Unable to schedule the cluster precheck diagnoser job in the kubernetes cluster. The possible reasons can be presence of a security policy or security context constraint (SCC) or it may happen becuase of lack of ResourceQuota.\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return elif (is_job_scheduled is True and is_job_complete is False): + logger.warning("Unable to finish the cluster precheck diagnoser job in the kubernetes cluster. The possible reasons can be presence of lack of Resources on the cluster.\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return else: From 617abfe8cac312e5a3189ff483b4669e68281ab9 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Tue, 3 Jan 2023 12:07:41 +0530 Subject: [PATCH 17/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- src/connectedk8s/azext_connectedk8s/_constants.py | 4 ++-- src/connectedk8s/azext_connectedk8s/custom.py | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index b92af7f0b0c..834b510c2c6 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -6,8 +6,8 @@ # pylint: disable=line-too-long -Distribution_Enum_Values = ["generic", "openshift", "rancher_rke", "kind", "k3s", "minikube", "gke", "eks", "aks", "aks_management", "aks_workload", "capz", "aks_engine", "tkg", "canonical", "karbon"] -Infrastructure_Enum_Values = ["generic", "azure", "aws", "gcp", "azure_stack_hci", "azure_stack_hub", "azure_stack_edge", "vsphere", "windows_server"] +Distribution_Enum_Values = ["auto", "generic", "openshift", "rancher_rke", "kind", "k3s", "minikube", "gke", "eks", "aks", "aks_management", "aks_workload", "capz", "aks_engine", "tkg", "canonical", "karbon"] +Infrastructure_Enum_Values = ["auto", "generic", "azure", "aws", "gcp", "azure_stack_hci", "azure_stack_hub", "azure_stack_edge", "vsphere", "windows_server"] AHB_Enum_Values = ["True", "False", "NotApplicable"] Feature_Values = ["cluster-connect", "azure-rbac", "custom-locations"] CRD_FOR_FORCE_DELETE = ["arccertificates.clusterconfig.azure.com", "azureclusteridentityrequests.clusterconfig.azure.com", "azureextensionidentities.clusterconfig.azure.com", "connectedclusters.arc.azure.com", "customlocationsettings.clusterconfig.azure.com", "extensionconfigs.clusterconfig.azure.com", "gitconfigs.clusterconfig.azure.com"] diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 08e268e7ca8..2d6a2afab5e 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -66,7 +66,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlation_id=None, https_proxy="", http_proxy="", no_proxy="", proxy_cert="", location=None, - kube_config=None, kube_context=None, no_wait=False, tags=None, distribution='generic', infrastructure='generic', + kube_config=None, kube_context=None, no_wait=False, tags=None, distribution='auto', infrastructure='auto', disable_auto_upgrade=False, cl_oid=None, onboarding_timeout="600", enable_private_link=None, private_link_scope_resource_id=None, distribution_version=None, azure_hybrid_benefit=None, yes=False, container_log_path=None): logger.warning("This operation might take a while...\n") @@ -178,8 +178,14 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat raise ValidationError("Your credentials doesn't have permission to create clusterrolebindings on this kubernetes cluster. Please check your permissions.") # Get kubernetes cluster info - kubernetes_distro = distribution - kubernetes_infra = infrastructure + if distribution == 'auto': + kubernetes_distro = get_kubernetes_distro(node_api_response) # (cluster heuristics) + else: + kubernetes_distro = distribution + if infrastructure == 'auto': + kubernetes_infra = get_kubernetes_infra(node_api_response) # (cluster heuristics) + else: + kubernetes_infra = infrastructure kubernetes_properties = { 'Context.Default.AzureCLI.KubernetesVersion': kubernetes_version, From 83799006cf557b8863a9fd1cf55f62da24e8c06e Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Mon, 9 Jan 2023 17:33:29 +0530 Subject: [PATCH 18/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py modified: src/connectedk8s/azext_connectedk8s/_troubleshootutils.py modified: src/connectedk8s/azext_connectedk8s/_utils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- .../azext_connectedk8s/_constants.py | 15 +- .../azext_connectedk8s/_precheckutils.py | 126 ++++++------ .../azext_connectedk8s/_troubleshootutils.py | 92 +-------- src/connectedk8s/azext_connectedk8s/_utils.py | 193 ++++++++++++++++-- src/connectedk8s/azext_connectedk8s/custom.py | 5 +- 5 files changed, 254 insertions(+), 177 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 834b510c2c6..40096268a98 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -177,14 +177,13 @@ Outbound_Network_Connectivity_Check = "outbound_network_connectivity_check.txt" Events_of_Incomplete_Diagnoser_Job = "diagnoser_failure_events.txt" # Connect Precheck Diagnoser constants -Connect_Precheck_Job_Registry_Path = "connectprecheck.azurecr.io/helm/connect-precheck-diagnoser:0.1.0" -Connect_Precheck_Job_Version = "0.1.0" -Precheck_Diagnoser_Container_Check_Failed_Fault_Type = "Error occured while performing the connect precheck diagnoser diagnoser" -Precheck_Diagnoser_Helm_Release_Failed_Fault_Type = "Error while installing Precheck Diagnoser helm release" -Precheck_Diagnoser_Failed_Fault_Type = "Error while executing connect precheck Diagnoser Job" -Precheck_Diagnoser_Pull_HelmChart_Fault_Type = 'precheck-diagnoser-helm-chart-pull-error' -Precheck_Diagnoser_Export_HelmChart_Fault_Type = 'precheck-diagnoser-helm-chart-export-error' -Precheck_Diagnoser_Install_HelmRelease_Fault_Type = 'precheck-diagnoser-helm-release-install-error' +Pre_Onboarding_Inspector_Job_Registry_Path = "connectprecheck.azurecr.io/helm/connect-precheck-diagnoser:0.1.0" +Pre_Onboarding_Inspector_Check_Failed_Fault_Type = "Error occured while running the pre onboarding inspector" +Pre_Onboarding_Inspector_Helm_Release_Failed_Fault_Type = "Error while installing pre onboarding inspector helm release" +Pre_Onboarding_Inspector_Failed_Fault_Type = "Error while executing pre onboarding inspector Job" +Pre_Onboarding_Inspector_Pull_HelmChart_Fault_Type = 'pre-onboarding-inspector-helm-chart-pull-error' +Pre_Onboarding_Inspector_Export_HelmChart_Fault_Type = 'pre-onboarding-inspector-helm-chart-export-error' +Pre_Onboarding_Inspector_Install_HelmRelease_Fault_Type = 'pre-onboarding-inspector-helm-release-install-error' # Diagnostic Results Name Outbound_Connectivity_Check_Result_String = "Outbound Network Connectivity Result:" DNS_Check_Result_String = "DNS Result:" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index c8d66f12dc3..e3fe14bd078 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -25,12 +25,12 @@ from kubernetes.client.rest import ApiException from azext_connectedk8s._client_factory import _resource_client_factory, _resource_providers_client import azext_connectedk8s._constants as consts +import azext_connectedk8s._utils as azext_utils from kubernetes import client as kube_client from azure.cli.core import get_default_cli from azure.cli.core.azclierror import CLIInternalError, ClientRequestError, ArgumentUsageError, ManualInterrupt, AzureResponseError, AzureInternalError, ValidationError from argparse import Namespace from pydoc import cli -from kubernetes import client, config, watch, utils from logging import exception import yaml import json @@ -43,24 +43,23 @@ logger = get_logger(__name__) # pylint: disable=unused-argument, too-many-locals, too-many-branches, too-many-statements, line-too-long # pylint: disable -logger = get_logger(__name__) -def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert): +def check_preonboarding_inspector_container(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert): try: # Setting DNS and Outbound Check as working dns_check = "Starting" outbound_connectivity_check = "Starting" - # Executing the Diagnoser job and fetching diagnoser logs obtained - diagnoser_container_log = executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert) - # If diagnoser_container_log is not empty then only we will check for the results - if(diagnoser_container_log is not None and diagnoser_container_log != ""): - diagnoser_container_log_list = diagnoser_container_log.split("\n") - diagnoser_container_log_list.pop(-1) + # Executing the pre onboarding inspector job and fetching the logs obtained + preonboarding_inspector_container_log = executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert) + # If preonboarding_inspector_container_log is not empty then only we will check for the results + if(preonboarding_inspector_container_log is not None and preonboarding_inspector_container_log != ""): + preonboarding_inspector_container_log_list = preonboarding_inspector_container_log.split("\n") + preonboarding_inspector_container_log_list.pop(-1) dns_check_log = "" counter_container_logs = 1 - # For retrieving only diagnoser logs from the diagnoser output - for outputs in diagnoser_container_log_list: + # For retrieving only preonboarding inspector logs from the inspector output + for outputs in preonboarding_inspector_container_log_list: if consts.Outbound_Connectivity_Check_Result_String in outputs: counter_container_logs = 1 elif consts.DNS_Check_Result_String in outputs: @@ -68,12 +67,12 @@ def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, absolut counter_container_logs = 0 elif counter_container_logs == 0: dns_check_log += " " + outputs - dns_check = check_cluster_DNS(dns_check_log) - outbound_connectivity_check = check_cluster_outbound_connectivity(diagnoser_container_log_list[-1]) + dns_check = azext_utils.check_cluster_DNS(dns_check_log, True) + outbound_connectivity_check = azext_utils.check_cluster_outbound_connectivity(preonboarding_inspector_container_log_list[-1], True) else: return consts.Diagnostic_Check_Incomplete - # If both the check passed then we will return Diagnoser checks Passed + # If both the check passed then we will return pre onboarding inspector checks Passed if(dns_check == consts.Diagnostic_Check_Passed and outbound_connectivity_check == consts.Diagnostic_Check_Passed): return consts.Diagnostic_Check_Passed # If any of the check remain Incomplete than we will return Incomplete @@ -84,18 +83,18 @@ def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, absolut # To handle any exception that may occur during the execution except Exception as e: - logger.warning("An exception has occured while trying to perform prechecks diagnoser container on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Precheck_Diagnoser_Container_Check_Failed_Fault_Type, summary="Error occured while performing the prechecks diagnoser container") + logger.warning("An exception has occured while trying to perform pre onboarding inspector container on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Pre_Onboarding_Inspector_Check_Failed_Fault_Type, summary="Error occured while performing the pre onboarding inspector container") return consts.Diagnostic_Check_Incomplete -def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert): - job_name = "connect-precheck-diagnoser-job" +def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert): + job_name = "pre-onboarding-inspector-job" # Setting the log output as Empty - diagnoser_container_log = "" + preonboarding_inspector_container_log = "" - cmd_helm_delete = [helm_client_location, "uninstall", "connect-precheck-diagnoser"] + cmd_helm_delete = [helm_client_location, "uninstall", "pre-onboarding-inspector"] if kube_config: cmd_helm_delete.extend(["--kubeconfig", kube_config]) if kube_context: @@ -103,9 +102,9 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ # To handle the user keyboard Interrupt try: - # Executing the diagnoser_job.yaml + # Executing the pre onboarding inspector job yaml config.load_kube_config(kube_config, kube_context) - # Attempting deletion of precheck diagnoser resources to handle the scenario if any stale resources are present + # Attempting deletion of pre onboarding inspector resources to handle the scenario if any stale resources are present response_kubectl_delete_helm = Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) output_kubectl_delete_helm, error_kubectl_delete_helm = response_kubectl_delete_helm.communicate() # If any error occured while execution of delete command @@ -124,22 +123,22 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ exception_occured_counter = 1 # If any exception occured we will print the exception and return if exception_occured_counter == 1: - logger.warning("An error occured while installing the connect precheck helm release in the cluster. Exception:") - telemetry.set_exception(exception=error_kubectl_delete_helm.decode("ascii"), fault_type=consts.Precheck_Diagnoser_Failed_Fault_Type, summary="Error while executing connect precheck Diagnoser Job") + logger.warning("An error occured while installing the pre onboarding inspector helm release in the cluster. Exception:") + telemetry.set_exception(exception=error_kubectl_delete_helm.decode("ascii"), fault_type=consts.Pre_Onboarding_Inspector_Failed_Fault_Type, summary="Error while executing pre onboarding inspector Job") return try: - chart_path = get_chart_path(consts.Connect_Precheck_Job_Registry_Path, kube_config, kube_context, helm_client_location) + chart_path = azext_utils.get_chart_path(consts.Pre_Onboarding_Inspector_Job_Registry_Path, kube_config, kube_context, helm_client_location, True) helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location) # To handle the Exception that occured except Exception as e: - logger.warning("An error occured while installing helm release of connect precheck diagnoser in the cluster. Exception:") + logger.warning("An error occured while installing helm release of pre onboarding inspector in the cluster. Exception:") logger.warning(str(e)) - telemetry.set_exception(exception=e, fault_type=consts.Precheck_Diagnoser_Helm_Release_Failed_Fault_Type, summary="Error while installing Precheck Diagnoser helm release") + telemetry.set_exception(exception=e, fault_type=consts.Pre_Onboarding_Inspector_Helm_Release_Failed_Fault_Type, summary="Error while installing pre onboarding inspector helm release") # Deleting all the stale resources that got created Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return - # Watching for diagnoser container to reach in completed stage + # Watching for pre onboarding inspector container to reach in completed stage w = watch.Watch() is_job_complete = False is_job_scheduled = False @@ -147,10 +146,10 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ for event in w.stream(batchv1_api_instance.list_namespaced_job, namespace='default', label_selector="", timeout_seconds=60): try: # Checking if job get scheduled or not - if event["object"].metadata.name == "connect-precheck-diagnoser-job": + if event["object"].metadata.name == "pre-onboarding-inspector-job": is_job_scheduled = True # Checking if job reached completed stage or not - if event["object"].metadata.name == "connect-precheck-diagnoser-job" and event["object"].status.conditions[0].type == "Complete": + if event["object"].metadata.name == "pre-onboarding-inspector-job" and event["object"].status.conditions[0].type == "Complete": is_job_complete = True w.stop() except Exception as e: @@ -159,15 +158,15 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ continue if (is_job_scheduled is False): - logger.warning("Unable to schedule the cluster precheck diagnoser job in the kubernetes cluster. The possible reasons can be presence of a security policy or security context constraint (SCC) or it may happen becuase of lack of ResourceQuota.\n") + logger.warning("Unable to schedule the pre onboarding inspector job in the kubernetes cluster. The possible reasons can be presence of a security policy or security context constraint (SCC) or it may happen becuase of lack of ResourceQuota.\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return elif (is_job_scheduled is True and is_job_complete is False): - logger.warning("Unable to finish the cluster precheck diagnoser job in the kubernetes cluster. The possible reasons can be presence of lack of Resources on the cluster.\n") + logger.warning("Unable to finish the pre onboarding inspector job in the kubernetes cluster. The possible reasons can be presence of lack of Resources on the cluster.\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return else: - # Fetching the Diagnoser Container logs + # Fetching the pre onboarding inspector Container logs all_pods = corev1_api_instance.list_namespaced_pod('default') # Traversing through all agents for each_pod in all_pods.items: @@ -175,48 +174,49 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, absolute_ pod_name = each_pod.metadata.name if(pod_name.startswith(job_name)): # Creating a text file with the name of the container and adding that containers logs in it - diagnoser_container_log = corev1_api_instance.read_namespaced_pod_log(name=pod_name, container="connect-precheck-diagnoser-container", namespace='default') - # Clearing all the resources after fetching the diagnoser container logs + preonboarding_inspector_container_log = corev1_api_instance.read_namespaced_pod_log(name=pod_name, container="pre-onboarding-inspector-container", namespace='default') + # Clearing all the resources after fetching the pre onboarding inspector container logs Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) # To handle any exception that may occur during the execution except Exception as e: - logger.warning("An exception has occured while trying to execute the connect precheck diagnoser in the cluster. Exception: {}".format(str(e)) + "\n") + logger.warning("An exception has occured while trying to execute the pre onboarding inspector in the cluster. Exception: {}".format(str(e)) + "\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) - telemetry.set_exception(exception=e, fault_type=consts.Precheck_Diagnoser_Failed_Fault_Type, summary="Error while executing Precheck Diagnoser Job") + telemetry.set_exception(exception=e, fault_type=consts.Pre_Onboarding_Inspector_Failed_Fault_Type, summary="Error while executing Pre onboarding inspector Job") return - return diagnoser_container_log - + return preonboarding_inspector_container_log -def check_cluster_DNS(dns_check_log): - try: - if consts.DNS_Check_Result_String not in dns_check_log: - return consts.Diagnostic_Check_Incomplete - formatted_dns_log = dns_check_log.replace('\t', '') - # Validating if DNS is working or not and displaying proper result - if("NXDOMAIN" in formatted_dns_log or "connection timed out" in formatted_dns_log): - logger.warning("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") - return consts.Diagnostic_Check_Failed - else: - return consts.Diagnostic_Check_Passed - # For handling storage or OS exception that may occur during the execution - except OSError as e: - logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_user_fault() - telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing Precheck - cluster DNS") - - # To handle any exception that may occur during the execution - except Exception as e: - logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_user_fault() - telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing Precheck - cluster DNS") +def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="60"): + cmd_helm_install = [helm_client_location, "upgrade", "--install", "pre-onboarding-inspector", chart_path] + # To set some other helm parameters through file + if https_proxy: + cmd_helm_install.extend(["--set", "global.httpsProxy={}".format(https_proxy)]) + if http_proxy: + cmd_helm_install.extend(["--set", "global.httpProxy={}".format(http_proxy)]) + if no_proxy: + cmd_helm_install.extend(["--set", "global.noProxy={}".format(no_proxy)]) + if proxy_cert: + cmd_helm_install.extend(["--set-file", "global.proxyCert={}".format(proxy_cert)]) - return consts.Diagnostic_Check_Incomplete + if kube_config: + cmd_helm_install.extend(["--kubeconfig", kube_config]) + if kube_context: + cmd_helm_install.extend(["--kube-context", kube_context]) + # Change --timeout format for helm client to understand + onboarding_timeout = onboarding_timeout + "s" + cmd_helm_install.extend(["--wait", "--timeout", "{}".format(onboarding_timeout)]) -def check_cluster_outbound_connectivity(outbound_connectivity_check_log): + response_helm_install = Popen(cmd_helm_install, stdout=PIPE, stderr=PIPE) + _, error_helm_install = response_helm_install.communicate() + if response_helm_install.returncode != 0: + if ('forbidden' in error_helm_install.decode("ascii") or 'timed out waiting for the condition' in error_helm_install.decode("ascii")): + telemetry.set_user_fault() + telemetry.set_exception(exception=error_helm_install.decode("ascii"), fault_type=consts.Pre_Onboarding_Inspector_Install_HelmRelease_Fault_Type, + summary='Unable to install pre onboarding inspector helm release') + raise CLIInternalError("Unable to install pre onboarding inspector helm release: " + error_helm_install.decode("ascii")) try: outbound_connectivity_response = outbound_connectivity_check_log[-1:-4:-1] outbound_connectivity_response = outbound_connectivity_response[::-1] diff --git a/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py b/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py index 24b18d383dd..1fed211f02d 100644 --- a/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py +++ b/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py @@ -16,6 +16,7 @@ from knack.log import get_logger from azure.cli.core import telemetry import azext_connectedk8s._constants as consts +import azext_connectedk8s._utils as azext_utils logger = get_logger(__name__) # pylint: disable=unused-argument, too-many-locals, too-many-branches, too-many-statements, line-too-long @@ -493,8 +494,8 @@ def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, filepat counter_container_logs = 0 elif counter_container_logs == 0: dns_check_log += " " + outputs - dns_check, storage_space_available = check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available) - outbound_connectivity_check, storage_space_available = check_cluster_outbound_connectivity(diagnoser_container_log_list[-1], filepath_with_timestamp, storage_space_available) + dns_check, storage_space_available = azext_utils.check_cluster_DNS(dns_check_log, False, filepath_with_timestamp, storage_space_available) + outbound_connectivity_check, storage_space_available = azext_utils.check_cluster_outbound_connectivity(diagnoser_container_log_list[-1], False, filepath_with_timestamp, storage_space_available) else: return consts.Diagnostic_Check_Incomplete, storage_space_available @@ -737,93 +738,6 @@ def executing_diagnoser_job(corev1_api_instance, batchv1_api_instance, filepath_ return diagnoser_container_log -def check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available): - - global diagnoser_output - try: - if consts.DNS_Check_Result_String not in dns_check_log: - return consts.Diagnostic_Check_Incomplete, storage_space_available - formatted_dns_log = dns_check_log.replace('\t', '') - # Validating if DNS is working or not and displaying proper result - if("NXDOMAIN" in formatted_dns_log or "connection timed out" in formatted_dns_log): - logger.warning("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") - diagnoser_output.append("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") - if storage_space_available: - dns_check_path = os.path.join(filepath_with_timestamp, consts.DNS_Check) - with open(dns_check_path, 'w+') as dns: - dns.write(formatted_dns_log + "\nWe found an issue with the DNS resolution on your cluster.") - return consts.Diagnostic_Check_Failed, storage_space_available - else: - if storage_space_available: - dns_check_path = os.path.join(filepath_with_timestamp, consts.DNS_Check) - with open(dns_check_path, 'w+') as dns: - dns.write(formatted_dns_log + "\nCluster DNS check passed successfully.") - return consts.Diagnostic_Check_Passed, storage_space_available - - # For handling storage or OS exception that may occur during the execution - except OSError as e: - if "[Errno 28]" in str(e): - storage_space_available = False - telemetry.set_exception(exception=e, fault_type=consts.No_Storage_Space_Available_Fault_Type, summary="No space left on device") - shutil.rmtree(filepath_with_timestamp, ignore_errors=False, onerror=None) - else: - logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing cluster DNS check") - diagnoser_output.append("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - - # To handle any exception that may occur during the execution - except Exception as e: - logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing cluster DNS check") - diagnoser_output.append("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - - return consts.Diagnostic_Check_Incomplete, storage_space_available - - -def check_cluster_outbound_connectivity(outbound_connectivity_check_log, filepath_with_timestamp, storage_space_available): - - global diagnoser_output - try: - outbound_connectivity_response = outbound_connectivity_check_log[-1:-4:-1] - outbound_connectivity_response = outbound_connectivity_response[::-1] - if consts.Outbound_Connectivity_Check_Result_String not in outbound_connectivity_check_log: - return consts.Diagnostic_Check_Incomplete, storage_space_available - # Validating if outbound connectiivty is working or not and displaying proper result - if(outbound_connectivity_response != "000"): - if storage_space_available: - outbound_connectivity_check_path = os.path.join(filepath_with_timestamp, consts.Outbound_Network_Connectivity_Check) - with open(outbound_connectivity_check_path, 'w+') as outbound: - outbound.write("Response code " + outbound_connectivity_response + "\nOutbound network connectivity check passed successfully.") - return consts.Diagnostic_Check_Passed, storage_space_available - else: - logger.warning("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") - diagnoser_output.append("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") - if storage_space_available: - outbound_connectivity_check_path = os.path.join(filepath_with_timestamp, consts.Outbound_Network_Connectivity_Check) - with open(outbound_connectivity_check_path, 'w+') as outbound: - outbound.write("Response code " + outbound_connectivity_response + "\nWe found an issue with Outbound network connectivity from the cluster.") - return consts.Diagnostic_Check_Failed, storage_space_available - - # For handling storage or OS exception that may occur during the execution - except OSError as e: - if "[Errno 28]" in str(e): - storage_space_available = False - telemetry.set_exception(exception=e, fault_type=consts.No_Storage_Space_Available_Fault_Type, summary="No space left on device") - shutil.rmtree(filepath_with_timestamp, ignore_errors=False, onerror=None) - else: - logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing outbound connectivity check in the cluster") - diagnoser_output.append("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - - # To handle any exception that may occur during the execution - except Exception as e: - logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing outbound connectivity check in the cluster") - diagnoser_output.append("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - - return consts.Diagnostic_Check_Incomplete, storage_space_available - - def check_msi_certificate_presence(corev1_api_instance): global diagnoser_output diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index 2a988f348cf..f566418a013 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -69,27 +69,42 @@ def validate_location(cmd, location): break -def get_chart_path(registry_path, kube_config, kube_context, helm_client_location): +def get_chart_path(registry_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks=False): # Pulling helm chart from registry os.environ['HELM_EXPERIMENTAL_OCI'] = '1' - pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location) + pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks) # Exporting helm chart after cleanup - chart_export_path = os.path.join(os.path.expanduser('~'), '.azure', 'AzureArcCharts') + if for_preonboarding_checks: + chart_export_path = os.path.join(os.path.expanduser('~'), '.azure', 'ConnectPrecheckCharts') + else: + chart_export_path = os.path.join(os.path.expanduser('~'), '.azure', 'AzureArcCharts') try: if os.path.isdir(chart_export_path): shutil.rmtree(chart_export_path) except: - logger.warning("Unable to cleanup the azure-arc helm charts already present on the machine. In case of failure, please cleanup the directory '%s' and try again.", chart_export_path) - export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location) + if for_preonboarding_checks: + logger.warning("Unable to cleanup the connect-precheck helm charts already present on the machine. In case of failure, please cleanup the directory '%s' and try again.", chart_export_path) + else: + logger.warning("Unable to cleanup the azure-arc helm charts already present on the machine. In case of failure, please cleanup the directory '%s' and try again.", chart_export_path) + export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks) # Returning helm chart path - helm_chart_path = os.path.join(chart_export_path, 'azure-arc-k8sagents') - chart_path = os.getenv('HELMCHART') if os.getenv('HELMCHART') else helm_chart_path + if for_preonboarding_checks: + # helm_chart_path = os.path.join(chart_export_path, 'connect-precheck-diagnoser') + helm_chart_path = os.path.join(chart_export_path, 'pre-onboarding-inspector') + else: + helm_chart_path = os.path.join(chart_export_path, 'azure-arc-k8sagents') + + if for_preonboarding_checks: + chart_path = helm_chart_path + else: + chart_path = os.getenv('HELMCHART') if os.getenv('HELMCHART') else helm_chart_path + return chart_path -def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location): +def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks=False): cmd_helm_chart_pull = [helm_client_location, "chart", "pull", registry_path] if kube_config: cmd_helm_chart_pull.extend(["--kubeconfig", kube_config]) @@ -98,12 +113,17 @@ def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_locati response_helm_chart_pull = subprocess.Popen(cmd_helm_chart_pull, stdout=PIPE, stderr=PIPE) _, error_helm_chart_pull = response_helm_chart_pull.communicate() if response_helm_chart_pull.returncode != 0: - telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pull_HelmChart_Fault_Type, - summary='Unable to pull helm chart from the registry') - raise CLIInternalError("Unable to pull helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) + if for_preonboarding_checks: + telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pre_Onboarding_Inspector_Pull_HelmChart_Fault_Type, + summary='Unable to pull pre-onboarding-inspector helm chart from the registry') + raise CLIInternalError("Unable to pull pre-onboarding-inspector helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) + else: + telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pull_HelmChart_Fault_Type, + summary='Unable to pull helm chart from the registry') + raise CLIInternalError("Unable to pull helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) -def export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location): +def export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks=False): cmd_helm_chart_export = [helm_client_location, "chart", "export", registry_path, "--destination", chart_export_path] if kube_config: cmd_helm_chart_export.extend(["--kubeconfig", kube_config]) @@ -112,9 +132,152 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex response_helm_chart_export = subprocess.Popen(cmd_helm_chart_export, stdout=PIPE, stderr=PIPE) _, error_helm_chart_export = response_helm_chart_export.communicate() if response_helm_chart_export.returncode != 0: - telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Export_HelmChart_Fault_Type, - summary='Unable to export helm chart from the registry') - raise CLIInternalError("Unable to export helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) + if for_preonboarding_checks: + telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Pre_Onboarding_Inspector_Export_HelmChart_Fault_Type, + summary='Unable to export pre-onboarding-inspector helm chart from the registry') + raise CLIInternalError("Unable to export pre-onboarding-inspector helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) + else: + telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Export_HelmChart_Fault_Type, + summary='Unable to export helm chart from the registry') + raise CLIInternalError("Unable to export helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) + + +def check_cluster_DNS(dns_check_log, for_preonboarding_checks=False, filepath_with_timestamp=None, storage_space_available=False): + if for_preonboarding_checks: + try: + if consts.DNS_Check_Result_String not in dns_check_log: + return consts.Diagnostic_Check_Incomplete + formatted_dns_log = dns_check_log.replace('\t', '') + # Validating if DNS is working or not and displaying proper result + if("NXDOMAIN" in formatted_dns_log or "connection timed out" in formatted_dns_log): + logger.warning("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") + return consts.Diagnostic_Check_Failed + else: + return consts.Diagnostic_Check_Passed + + # For handling storage or OS exception that may occur during the execution + except OSError as e: + logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_user_fault() + telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing Precheck - cluster DNS") + + # To handle any exception that may occur during the execution + except Exception as e: + logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_user_fault() + telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing Precheck - cluster DNS") + + return consts.Diagnostic_Check_Incomplete + else: + global diagnoser_output + try: + if consts.DNS_Check_Result_String not in dns_check_log: + return consts.Diagnostic_Check_Incomplete, storage_space_available + formatted_dns_log = dns_check_log.replace('\t', '') + # Validating if DNS is working or not and displaying proper result + if("NXDOMAIN" in formatted_dns_log or "connection timed out" in formatted_dns_log): + logger.warning("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") + diagnoser_output.append("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") + if storage_space_available: + dns_check_path = os.path.join(filepath_with_timestamp, consts.DNS_Check) + with open(dns_check_path, 'w+') as dns: + dns.write(formatted_dns_log + "\nWe found an issue with the DNS resolution on your cluster.") + return consts.Diagnostic_Check_Failed, storage_space_available + else: + if storage_space_available: + dns_check_path = os.path.join(filepath_with_timestamp, consts.DNS_Check) + with open(dns_check_path, 'w+') as dns: + dns.write(formatted_dns_log + "\nCluster DNS check passed successfully.") + return consts.Diagnostic_Check_Passed, storage_space_available + + # For handling storage or OS exception that may occur during the execution + except OSError as e: + if "[Errno 28]" in str(e): + storage_space_available = False + telemetry.set_exception(exception=e, fault_type=consts.No_Storage_Space_Available_Fault_Type, summary="No space left on device") + shutil.rmtree(filepath_with_timestamp, ignore_errors=False, onerror=None) + else: + logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing cluster DNS check") + diagnoser_output.append("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") + + # To handle any exception that may occur during the execution + except Exception as e: + logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing cluster DNS check") + diagnoser_output.append("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") + + return consts.Diagnostic_Check_Incomplete, storage_space_available + + +def check_cluster_outbound_connectivity(outbound_connectivity_check_log, for_preonboarding_checks=False, filepath_with_timestamp=None, storage_space_available=False): + if for_preonboarding_checks: + try: + outbound_connectivity_response = outbound_connectivity_check_log[-1:-4:-1] + outbound_connectivity_response = outbound_connectivity_response[::-1] + if consts.Outbound_Connectivity_Check_Result_String not in outbound_connectivity_check_log: + return consts.Diagnostic_Check_Incomplete + # Validating if outbound connectiivty is working or not and displaying proper result + if(outbound_connectivity_response != "000"): + return consts.Diagnostic_Check_Passed + else: + logger.warning("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") + return consts.Diagnostic_Check_Failed + + # For handling storage or OS exception that may occur during the execution + except OSError as e: + logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_user_fault() + telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing Precheck - outbound connectivity in the cluster") + + # To handle any exception that may occur during the execution + except Exception as e: + logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_user_fault() + telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing Precheck - outbound connectivity in the cluster") + + return consts.Diagnostic_Check_Incomplete + else: + global diagnoser_output + try: + outbound_connectivity_response = outbound_connectivity_check_log[-1:-4:-1] + outbound_connectivity_response = outbound_connectivity_response[::-1] + if consts.Outbound_Connectivity_Check_Result_String not in outbound_connectivity_check_log: + return consts.Diagnostic_Check_Incomplete, storage_space_available + # Validating if outbound connectiivty is working or not and displaying proper result + if(outbound_connectivity_response != "000"): + if storage_space_available: + outbound_connectivity_check_path = os.path.join(filepath_with_timestamp, consts.Outbound_Network_Connectivity_Check) + with open(outbound_connectivity_check_path, 'w+') as outbound: + outbound.write("Response code " + outbound_connectivity_response + "\nOutbound network connectivity check passed successfully.") + return consts.Diagnostic_Check_Passed, storage_space_available + else: + logger.warning("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") + diagnoser_output.append("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") + if storage_space_available: + outbound_connectivity_check_path = os.path.join(filepath_with_timestamp, consts.Outbound_Network_Connectivity_Check) + with open(outbound_connectivity_check_path, 'w+') as outbound: + outbound.write("Response code " + outbound_connectivity_response + "\nWe found an issue with Outbound network connectivity from the cluster.") + return consts.Diagnostic_Check_Failed, storage_space_available + + # For handling storage or OS exception that may occur during the execution + except OSError as e: + if "[Errno 28]" in str(e): + storage_space_available = False + telemetry.set_exception(exception=e, fault_type=consts.No_Storage_Space_Available_Fault_Type, summary="No space left on device") + shutil.rmtree(filepath_with_timestamp, ignore_errors=False, onerror=None) + else: + logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing outbound connectivity check in the cluster") + diagnoser_output.append("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") + + # To handle any exception that may occur during the execution + except Exception as e: + logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing outbound connectivity check in the cluster") + diagnoser_output.append("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") + + return consts.Diagnostic_Check_Incomplete, storage_space_available def add_helm_repo(kube_config, kube_context, helm_client_location): diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 2d6a2afab5e..6b1972ccd5e 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -138,6 +138,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat api_instance = kube_client.CoreV1Api() node_api_response = utils.validate_node_api_response(api_instance, None) + # Pre onboarding checks try: absolute_path = os.path.abspath(os.path.dirname(__file__)) kubectl_client_location = install_kubectl_client() @@ -146,8 +147,8 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat diagnostic_checks = "Failed" batchv1_api_instance = kube_client.BatchV1Api() corev1_api_instance = kube_client.CoreV1Api() - # Performing diagnoser container check - diagnostic_checks = precheckutils.check_diagnoser_container(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert) + # Performing pre onboarding inspector container check + diagnostic_checks = precheckutils.check_preonboarding_inspector_container(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert) # If all the checks passed then display no error found all_checks_passed = True # for checks in diagnostic_checks: From ad4ed210deb467a3281fdda2ef4187b7ba01c826 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Mon, 9 Jan 2023 18:02:14 +0530 Subject: [PATCH 19/62] modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py modified: src/connectedk8s/azext_connectedk8s/_utils.py --- .../azext_connectedk8s/_precheckutils.py | 105 ------------------ src/connectedk8s/azext_connectedk8s/_utils.py | 4 +- 2 files changed, 2 insertions(+), 107 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index e3fe14bd078..b77f238a364 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -217,108 +217,3 @@ def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_ce telemetry.set_exception(exception=error_helm_install.decode("ascii"), fault_type=consts.Pre_Onboarding_Inspector_Install_HelmRelease_Fault_Type, summary='Unable to install pre onboarding inspector helm release') raise CLIInternalError("Unable to install pre onboarding inspector helm release: " + error_helm_install.decode("ascii")) - try: - outbound_connectivity_response = outbound_connectivity_check_log[-1:-4:-1] - outbound_connectivity_response = outbound_connectivity_response[::-1] - if consts.Outbound_Connectivity_Check_Result_String not in outbound_connectivity_check_log: - return consts.Diagnostic_Check_Incomplete - # Validating if outbound connectiivty is working or not and displaying proper result - if(outbound_connectivity_response != "000"): - return consts.Diagnostic_Check_Passed - else: - logger.warning("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") - return consts.Diagnostic_Check_Failed - - # For handling storage or OS exception that may occur during the execution - except OSError as e: - logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_user_fault() - telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing Precheck - outbound connectivity in the cluster") - - # To handle any exception that may occur during the execution - except Exception as e: - logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_user_fault() - telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing Precheck - outbound connectivity in the cluster") - - return consts.Diagnostic_Check_Incomplete - - -def get_chart_path(registry_path, kube_config, kube_context, helm_client_location): - - # Pulling helm chart from registry - os.environ['HELM_EXPERIMENTAL_OCI'] = '1' - pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location) - - # Exporting helm chart after cleanup - chart_export_path = os.path.join(os.path.expanduser('~'), '.azure', 'ConnectPrecheckCharts') - try: - if os.path.isdir(chart_export_path): - shutil.rmtree(chart_export_path) - except: - logger.warning("Unable to cleanup the connect-precheck helm charts already present on the machine. In case of failure, please cleanup the directory '%s' and try again.", chart_export_path) - export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location) - - # Returning helm chart path - helm_chart_path = os.path.join(chart_export_path, 'connect-precheck-diagnoser') - chart_path = os.getenv('HELMCHART') if os.getenv('HELMCHART') else helm_chart_path - return chart_path - - -def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location): - cmd_helm_chart_pull = [helm_client_location, "chart", "pull", registry_path] - if kube_config: - cmd_helm_chart_pull.extend(["--kubeconfig", kube_config]) - if kube_context: - cmd_helm_chart_pull.extend(["--kube-context", kube_context]) - response_helm_chart_pull = subprocess.Popen(cmd_helm_chart_pull, stdout=PIPE, stderr=PIPE) - _, error_helm_chart_pull = response_helm_chart_pull.communicate() - if response_helm_chart_pull.returncode != 0: - telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Precheck_Diagnoser_Pull_HelmChart_Fault_Type, - summary='Unable to pull connect precheck helm chart from the registry') - raise CLIInternalError("Unable to pull connect precheck helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) - - -def export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location): - cmd_helm_chart_export = [helm_client_location, "chart", "export", registry_path, "--destination", chart_export_path] - if kube_config: - cmd_helm_chart_export.extend(["--kubeconfig", kube_config]) - if kube_context: - cmd_helm_chart_export.extend(["--kube-context", kube_context]) - response_helm_chart_export = subprocess.Popen(cmd_helm_chart_export, stdout=PIPE, stderr=PIPE) - _, error_helm_chart_export = response_helm_chart_export.communicate() - if response_helm_chart_export.returncode != 0: - telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Precheck_Diagnoser_Export_HelmChart_Fault_Type, - summary='Unable to export connect precheck helm chart from the registry') - raise CLIInternalError("Unable to export connect precheck helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) - - -def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="60"): - cmd_helm_install = [helm_client_location, "upgrade", "--install", "connect-precheck-diagnoser", chart_path] - # To set some other helm parameters through file - if https_proxy: - cmd_helm_install.extend(["--set", "global.httpsProxy={}".format(https_proxy)]) - if http_proxy: - cmd_helm_install.extend(["--set", "global.httpProxy={}".format(http_proxy)]) - if no_proxy: - cmd_helm_install.extend(["--set", "global.noProxy={}".format(no_proxy)]) - if proxy_cert: - cmd_helm_install.extend(["--set-file", "global.proxyCert={}".format(proxy_cert)]) - - if kube_config: - cmd_helm_install.extend(["--kubeconfig", kube_config]) - if kube_context: - cmd_helm_install.extend(["--kube-context", kube_context]) - - # Change --timeout format for helm client to understand - onboarding_timeout = onboarding_timeout + "s" - cmd_helm_install.extend(["--wait", "--timeout", "{}".format(onboarding_timeout)]) - - response_helm_install = Popen(cmd_helm_install, stdout=PIPE, stderr=PIPE) - _, error_helm_install = response_helm_install.communicate() - if response_helm_install.returncode != 0: - if ('forbidden' in error_helm_install.decode("ascii") or 'timed out waiting for the condition' in error_helm_install.decode("ascii")): - telemetry.set_user_fault() - telemetry.set_exception(exception=error_helm_install.decode("ascii"), fault_type=consts.Precheck_Diagnoser_Install_HelmRelease_Fault_Type, - summary='Unable to install connect precheck helm release') - raise CLIInternalError("Unable to install connect precheck helm release: " + error_helm_install.decode("ascii")) diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index f566418a013..5d527df5dc0 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -115,7 +115,7 @@ def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_locati if response_helm_chart_pull.returncode != 0: if for_preonboarding_checks: telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pre_Onboarding_Inspector_Pull_HelmChart_Fault_Type, - summary='Unable to pull pre-onboarding-inspector helm chart from the registry') + summary='Unable to pull pre-onboarding-inspector helm chart from the registry') raise CLIInternalError("Unable to pull pre-onboarding-inspector helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) else: telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pull_HelmChart_Fault_Type, @@ -134,7 +134,7 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex if response_helm_chart_export.returncode != 0: if for_preonboarding_checks: telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Pre_Onboarding_Inspector_Export_HelmChart_Fault_Type, - summary='Unable to export pre-onboarding-inspector helm chart from the registry') + summary='Unable to export pre-onboarding-inspector helm chart from the registry') raise CLIInternalError("Unable to export pre-onboarding-inspector helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) else: telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Export_HelmChart_Fault_Type, From a290d2241f00b229e4f0e7dfd337852fc80c6def Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Wed, 11 Jan 2023 15:51:14 +0530 Subject: [PATCH 20/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py --- src/connectedk8s/azext_connectedk8s/_constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 40096268a98..93d42182852 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -177,7 +177,7 @@ Outbound_Network_Connectivity_Check = "outbound_network_connectivity_check.txt" Events_of_Incomplete_Diagnoser_Job = "diagnoser_failure_events.txt" # Connect Precheck Diagnoser constants -Pre_Onboarding_Inspector_Job_Registry_Path = "connectprecheck.azurecr.io/helm/connect-precheck-diagnoser:0.1.0" +Pre_Onboarding_Inspector_Job_Registry_Path = "connectprecheck.azurecr.io/helm/pre-onboarding-inspector:0.1.0" Pre_Onboarding_Inspector_Check_Failed_Fault_Type = "Error occured while running the pre onboarding inspector" Pre_Onboarding_Inspector_Helm_Release_Failed_Fault_Type = "Error while installing pre onboarding inspector helm release" Pre_Onboarding_Inspector_Failed_Fault_Type = "Error while executing pre onboarding inspector Job" From 500a8ac5420c4b27476891245d45fa0d6c7bcfb2 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Fri, 13 Jan 2023 15:16:11 +0530 Subject: [PATCH 21/62] modified: src/connectedk8s/azext_connectedk8s/_utils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- src/connectedk8s/azext_connectedk8s/_utils.py | 122 +++++++++++++----- src/connectedk8s/azext_connectedk8s/custom.py | 4 +- 2 files changed, 89 insertions(+), 37 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index 5d527df5dc0..30bed755416 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -69,42 +69,33 @@ def validate_location(cmd, location): break -def get_chart_path(registry_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks=False): +def get_chart_path(registry_path, kube_config, kube_context, helm_client_location, chart_path_name='AzureArcCharts'): # Pulling helm chart from registry os.environ['HELM_EXPERIMENTAL_OCI'] = '1' - pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks) + pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, chart_path_name) # Exporting helm chart after cleanup - if for_preonboarding_checks: - chart_export_path = os.path.join(os.path.expanduser('~'), '.azure', 'ConnectPrecheckCharts') - else: - chart_export_path = os.path.join(os.path.expanduser('~'), '.azure', 'AzureArcCharts') + chart_export_path = os.path.join(os.path.expanduser('~'), '.azure', chart_path_name) try: if os.path.isdir(chart_export_path): shutil.rmtree(chart_export_path) except: - if for_preonboarding_checks: - logger.warning("Unable to cleanup the connect-precheck helm charts already present on the machine. In case of failure, please cleanup the directory '%s' and try again.", chart_export_path) - else: - logger.warning("Unable to cleanup the azure-arc helm charts already present on the machine. In case of failure, please cleanup the directory '%s' and try again.", chart_export_path) - export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks) + logger.warning("Unable to cleanup the {} already present on the machine. In case of failure, please cleanup the directory '{}' and try again.".format(chart_path_name, chart_export_path)) + + export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, chart_path_name) # Returning helm chart path - if for_preonboarding_checks: - # helm_chart_path = os.path.join(chart_export_path, 'connect-precheck-diagnoser') + if chart_path_name == 'ConnectPrecheckCharts': helm_chart_path = os.path.join(chart_export_path, 'pre-onboarding-inspector') - else: - helm_chart_path = os.path.join(chart_export_path, 'azure-arc-k8sagents') - - if for_preonboarding_checks: chart_path = helm_chart_path else: + helm_chart_path = os.path.join(chart_export_path, 'azure-arc-k8sagents') chart_path = os.getenv('HELMCHART') if os.getenv('HELMCHART') else helm_chart_path return chart_path -def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks=False): +def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, chart_path_name='azure-arc'): cmd_helm_chart_pull = [helm_client_location, "chart", "pull", registry_path] if kube_config: cmd_helm_chart_pull.extend(["--kubeconfig", kube_config]) @@ -113,17 +104,12 @@ def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_locati response_helm_chart_pull = subprocess.Popen(cmd_helm_chart_pull, stdout=PIPE, stderr=PIPE) _, error_helm_chart_pull = response_helm_chart_pull.communicate() if response_helm_chart_pull.returncode != 0: - if for_preonboarding_checks: - telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pre_Onboarding_Inspector_Pull_HelmChart_Fault_Type, - summary='Unable to pull pre-onboarding-inspector helm chart from the registry') - raise CLIInternalError("Unable to pull pre-onboarding-inspector helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) - else: - telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pull_HelmChart_Fault_Type, - summary='Unable to pull helm chart from the registry') - raise CLIInternalError("Unable to pull helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) + telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pull_HelmChart_Fault_Type, + summary="Unable to pull {} helm charts from the registry".format(chart_path_name)) + raise CLIInternalError("Unable to pull {} helm chart from the registry '{}': ".format(chart_path_name, registry_path) + error_helm_chart_pull.decode("ascii")) -def export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks=False): +def export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, chart_path_name='azure-arc'): cmd_helm_chart_export = [helm_client_location, "chart", "export", registry_path, "--destination", chart_export_path] if kube_config: cmd_helm_chart_export.extend(["--kubeconfig", kube_config]) @@ -132,14 +118,82 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex response_helm_chart_export = subprocess.Popen(cmd_helm_chart_export, stdout=PIPE, stderr=PIPE) _, error_helm_chart_export = response_helm_chart_export.communicate() if response_helm_chart_export.returncode != 0: - if for_preonboarding_checks: - telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Pre_Onboarding_Inspector_Export_HelmChart_Fault_Type, - summary='Unable to export pre-onboarding-inspector helm chart from the registry') - raise CLIInternalError("Unable to export pre-onboarding-inspector helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) - else: telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Export_HelmChart_Fault_Type, - summary='Unable to export helm chart from the registry') - raise CLIInternalError("Unable to export helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) + summary='Unable to export {} helm chart from the registry'.format(chart_path_name)) + raise CLIInternalError("Unable to export {} helm chart from the registry '{}': ".format(chart_path_name, registry_path) + error_helm_chart_export.decode("ascii")) + + +# def get_chart_path(registry_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks=False): +# # Pulling helm chart from registry +# os.environ['HELM_EXPERIMENTAL_OCI'] = '1' +# pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks) + +# # Exporting helm chart after cleanup +# if for_preonboarding_checks: +# chart_export_path = os.path.join(os.path.expanduser('~'), '.azure', 'ConnectPrecheckCharts') +# else: +# chart_export_path = os.path.join(os.path.expanduser('~'), '.azure', 'AzureArcCharts') +# try: +# if os.path.isdir(chart_export_path): +# shutil.rmtree(chart_export_path) +# except: +# if for_preonboarding_checks: +# logger.warning("Unable to cleanup the connect-precheck helm charts already present on the machine. In case of failure, please cleanup the directory '%s' and try again.", chart_export_path) +# else: +# logger.warning("Unable to cleanup the azure-arc helm charts already present on the machine. In case of failure, please cleanup the directory '%s' and try again.", chart_export_path) +# export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks) + +# # Returning helm chart path +# if for_preonboarding_checks: +# # helm_chart_path = os.path.join(chart_export_path, 'connect-precheck-diagnoser') +# helm_chart_path = os.path.join(chart_export_path, 'pre-onboarding-inspector') +# else: +# helm_chart_path = os.path.join(chart_export_path, 'azure-arc-k8sagents') + +# if for_preonboarding_checks: +# chart_path = helm_chart_path +# else: +# chart_path = os.getenv('HELMCHART') if os.getenv('HELMCHART') else helm_chart_path + +# return chart_path + + +# def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks=False): +# cmd_helm_chart_pull = [helm_client_location, "chart", "pull", registry_path] +# if kube_config: +# cmd_helm_chart_pull.extend(["--kubeconfig", kube_config]) +# if kube_context: +# cmd_helm_chart_pull.extend(["--kube-context", kube_context]) +# response_helm_chart_pull = subprocess.Popen(cmd_helm_chart_pull, stdout=PIPE, stderr=PIPE) +# _, error_helm_chart_pull = response_helm_chart_pull.communicate() +# if response_helm_chart_pull.returncode != 0: +# if for_preonboarding_checks: +# telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pre_Onboarding_Inspector_Pull_HelmChart_Fault_Type, +# summary='Unable to pull pre-onboarding-inspector helm chart from the registry') +# raise CLIInternalError("Unable to pull pre-onboarding-inspector helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) +# else: +# telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pull_HelmChart_Fault_Type, +# summary='Unable to pull helm chart from the registry') +# raise CLIInternalError("Unable to pull helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) + + +# def export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks=False): +# cmd_helm_chart_export = [helm_client_location, "chart", "export", registry_path, "--destination", chart_export_path] +# if kube_config: +# cmd_helm_chart_export.extend(["--kubeconfig", kube_config]) +# if kube_context: +# cmd_helm_chart_export.extend(["--kube-context", kube_context]) +# response_helm_chart_export = subprocess.Popen(cmd_helm_chart_export, stdout=PIPE, stderr=PIPE) +# _, error_helm_chart_export = response_helm_chart_export.communicate() +# if response_helm_chart_export.returncode != 0: +# if for_preonboarding_checks: +# telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Pre_Onboarding_Inspector_Export_HelmChart_Fault_Type, +# summary='Unable to export pre-onboarding-inspector helm chart from the registry') +# raise CLIInternalError("Unable to export pre-onboarding-inspector helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) +# else: +# telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Export_HelmChart_Fault_Type, +# summary='Unable to export helm chart from the registry') +# raise CLIInternalError("Unable to export helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) def check_cluster_DNS(dns_check_log, for_preonboarding_checks=False, filepath_with_timestamp=None, storage_space_available=False): diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 6b1972ccd5e..268833aad77 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -143,12 +143,10 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat absolute_path = os.path.abspath(os.path.dirname(__file__)) kubectl_client_location = install_kubectl_client() helm_client_location = install_helm_client() - release_namespace = get_release_namespace(kube_config, kube_context, helm_client_location) diagnostic_checks = "Failed" batchv1_api_instance = kube_client.BatchV1Api() - corev1_api_instance = kube_client.CoreV1Api() # Performing pre onboarding inspector container check - diagnostic_checks = precheckutils.check_preonboarding_inspector_container(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert) + diagnostic_checks = precheckutils.check_preonboarding_inspector_container(api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert) # If all the checks passed then display no error found all_checks_passed = True # for checks in diagnostic_checks: From e1f21a98244fe89be53fcfc0497c469535e1cacc Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Wed, 18 Jan 2023 12:15:46 +0530 Subject: [PATCH 22/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py modified: src/connectedk8s/azext_connectedk8s/_utils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- .../azext_connectedk8s/_constants.py | 2 + .../azext_connectedk8s/_precheckutils.py | 8 +- src/connectedk8s/azext_connectedk8s/_utils.py | 203 +++++++----------- src/connectedk8s/azext_connectedk8s/custom.py | 4 +- 4 files changed, 90 insertions(+), 127 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 93d42182852..f683112f233 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -135,6 +135,8 @@ Diagnoser_Container_Check_Failed_Fault_Type = "Error occured while performing the diagnoser container checks" Cluster_DNS_Check_Fault_Type = "Error occured while performing cluster DNS check" Outbound_Connectivity_Check_Fault_Type = "Error occured while performing outbound connectivity check in the cluster" +Outbound_Connectivity_Failed_Fault_Type = "Failed outbound network connectivity from the cluster" +DNS_Failed_Fault_Type = "DNS not working in the cluster" MSI_Cert_Check_Fault_Type = "Error occurred while trying to perform MSI ceritificate presence check" Cluster_Security_Policy_Check_Fault_Type = "Error occured while performing cluster security policy check" KAP_Cert_Check_Fault_Type = "Error occurred while trying to perform KAP ceritificate presence check" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index b77f238a364..aeddf05fea1 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -45,13 +45,13 @@ # pylint: disable -def check_preonboarding_inspector_container(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert): +def check_preonboarding_inspector_container(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert): try: # Setting DNS and Outbound Check as working dns_check = "Starting" outbound_connectivity_check = "Starting" # Executing the pre onboarding inspector job and fetching the logs obtained - preonboarding_inspector_container_log = executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert) + preonboarding_inspector_container_log = executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert) # If preonboarding_inspector_container_log is not empty then only we will check for the results if(preonboarding_inspector_container_log is not None and preonboarding_inspector_container_log != ""): preonboarding_inspector_container_log_list = preonboarding_inspector_container_log.split("\n") @@ -89,7 +89,7 @@ def check_preonboarding_inspector_container(corev1_api_instance, batchv1_api_ins return consts.Diagnostic_Check_Incomplete -def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert): +def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert): job_name = "pre-onboarding-inspector-job" # Setting the log output as Empty preonboarding_inspector_container_log = "" @@ -127,7 +127,7 @@ def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_insta telemetry.set_exception(exception=error_kubectl_delete_helm.decode("ascii"), fault_type=consts.Pre_Onboarding_Inspector_Failed_Fault_Type, summary="Error while executing pre onboarding inspector Job") return try: - chart_path = azext_utils.get_chart_path(consts.Pre_Onboarding_Inspector_Job_Registry_Path, kube_config, kube_context, helm_client_location, True) + chart_path = azext_utils.get_chart_path(consts.Pre_Onboarding_Inspector_Job_Registry_Path, kube_config, kube_context, helm_client_location, 'ConnectPrecheckCharts') helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location) # To handle the Exception that occured diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index 5d527df5dc0..fa7b2af7a61 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -69,42 +69,33 @@ def validate_location(cmd, location): break -def get_chart_path(registry_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks=False): +def get_chart_path(registry_path, kube_config, kube_context, helm_client_location, chart_path_name='AzureArcCharts'): # Pulling helm chart from registry os.environ['HELM_EXPERIMENTAL_OCI'] = '1' - pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks) + pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, chart_path_name) # Exporting helm chart after cleanup - if for_preonboarding_checks: - chart_export_path = os.path.join(os.path.expanduser('~'), '.azure', 'ConnectPrecheckCharts') - else: - chart_export_path = os.path.join(os.path.expanduser('~'), '.azure', 'AzureArcCharts') + chart_export_path = os.path.join(os.path.expanduser('~'), '.azure', chart_path_name) try: if os.path.isdir(chart_export_path): shutil.rmtree(chart_export_path) except: - if for_preonboarding_checks: - logger.warning("Unable to cleanup the connect-precheck helm charts already present on the machine. In case of failure, please cleanup the directory '%s' and try again.", chart_export_path) - else: - logger.warning("Unable to cleanup the azure-arc helm charts already present on the machine. In case of failure, please cleanup the directory '%s' and try again.", chart_export_path) - export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks) + logger.warning("Unable to cleanup the {} already present on the machine. In case of failure, please cleanup the directory '{}' and try again.".format(chart_path_name, chart_export_path)) + + export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, chart_path_name) # Returning helm chart path - if for_preonboarding_checks: - # helm_chart_path = os.path.join(chart_export_path, 'connect-precheck-diagnoser') + if chart_path_name == 'ConnectPrecheckCharts': helm_chart_path = os.path.join(chart_export_path, 'pre-onboarding-inspector') - else: - helm_chart_path = os.path.join(chart_export_path, 'azure-arc-k8sagents') - - if for_preonboarding_checks: chart_path = helm_chart_path else: + helm_chart_path = os.path.join(chart_export_path, 'azure-arc-k8sagents') chart_path = os.getenv('HELMCHART') if os.getenv('HELMCHART') else helm_chart_path return chart_path -def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks=False): +def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, chart_path_name='azure-arc'): cmd_helm_chart_pull = [helm_client_location, "chart", "pull", registry_path] if kube_config: cmd_helm_chart_pull.extend(["--kubeconfig", kube_config]) @@ -113,17 +104,12 @@ def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_locati response_helm_chart_pull = subprocess.Popen(cmd_helm_chart_pull, stdout=PIPE, stderr=PIPE) _, error_helm_chart_pull = response_helm_chart_pull.communicate() if response_helm_chart_pull.returncode != 0: - if for_preonboarding_checks: - telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pre_Onboarding_Inspector_Pull_HelmChart_Fault_Type, - summary='Unable to pull pre-onboarding-inspector helm chart from the registry') - raise CLIInternalError("Unable to pull pre-onboarding-inspector helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) - else: - telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pull_HelmChart_Fault_Type, - summary='Unable to pull helm chart from the registry') - raise CLIInternalError("Unable to pull helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) + telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pull_HelmChart_Fault_Type, + summary="Unable to pull {} helm charts from the registry".format(chart_path_name)) + raise CLIInternalError("Unable to pull {} helm chart from the registry '{}': ".format(chart_path_name, registry_path) + error_helm_chart_pull.decode("ascii")) -def export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, for_preonboarding_checks=False): +def export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, chart_path_name='azure-arc'): cmd_helm_chart_export = [helm_client_location, "chart", "export", registry_path, "--destination", chart_export_path] if kube_config: cmd_helm_chart_export.extend(["--kubeconfig", kube_config]) @@ -132,63 +118,45 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex response_helm_chart_export = subprocess.Popen(cmd_helm_chart_export, stdout=PIPE, stderr=PIPE) _, error_helm_chart_export = response_helm_chart_export.communicate() if response_helm_chart_export.returncode != 0: - if for_preonboarding_checks: - telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Pre_Onboarding_Inspector_Export_HelmChart_Fault_Type, - summary='Unable to export pre-onboarding-inspector helm chart from the registry') - raise CLIInternalError("Unable to export pre-onboarding-inspector helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) - else: telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Export_HelmChart_Fault_Type, - summary='Unable to export helm chart from the registry') - raise CLIInternalError("Unable to export helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) + summary='Unable to export {} helm chart from the registry'.format(chart_path_name)) + raise CLIInternalError("Unable to export {} helm chart from the registry '{}': ".format(chart_path_name, registry_path) + error_helm_chart_export.decode("ascii")) def check_cluster_DNS(dns_check_log, for_preonboarding_checks=False, filepath_with_timestamp=None, storage_space_available=False): - if for_preonboarding_checks: - try: - if consts.DNS_Check_Result_String not in dns_check_log: - return consts.Diagnostic_Check_Incomplete - formatted_dns_log = dns_check_log.replace('\t', '') - # Validating if DNS is working or not and displaying proper result - if("NXDOMAIN" in formatted_dns_log or "connection timed out" in formatted_dns_log): - logger.warning("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") - return consts.Diagnostic_Check_Failed - else: - return consts.Diagnostic_Check_Passed - - # For handling storage or OS exception that may occur during the execution - except OSError as e: - logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_user_fault() - telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing Precheck - cluster DNS") - - # To handle any exception that may occur during the execution - except Exception as e: - logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_user_fault() - telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing Precheck - cluster DNS") - - return consts.Diagnostic_Check_Incomplete - else: - global diagnoser_output + if for_preonboarding_checks == False: + global diagnoser_output try: if consts.DNS_Check_Result_String not in dns_check_log: - return consts.Diagnostic_Check_Incomplete, storage_space_available + if for_preonboarding_checks: + return consts.Diagnostic_Check_Incomplete + else: + return consts.Diagnostic_Check_Incomplete, storage_space_available formatted_dns_log = dns_check_log.replace('\t', '') # Validating if DNS is working or not and displaying proper result if("NXDOMAIN" in formatted_dns_log or "connection timed out" in formatted_dns_log): logger.warning("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") - diagnoser_output.append("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") - if storage_space_available: - dns_check_path = os.path.join(filepath_with_timestamp, consts.DNS_Check) - with open(dns_check_path, 'w+') as dns: - dns.write(formatted_dns_log + "\nWe found an issue with the DNS resolution on your cluster.") - return consts.Diagnostic_Check_Failed, storage_space_available + if for_preonboarding_checks: + telemetry.set_user_fault() + telemetry.set_exception(exception="DNS not working in the cluster", fault_type=consts.DNS_Failed_Fault_Type, + summary="DNS not working in the cluster") + return consts.Diagnostic_Check_Failed + else: + diagnoser_output.append("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") + if storage_space_available: + dns_check_path = os.path.join(filepath_with_timestamp, consts.DNS_Check) + with open(dns_check_path, 'w+') as dns: + dns.write(formatted_dns_log + "\nWe found an issue with the DNS resolution on your cluster.") + return consts.Diagnostic_Check_Failed, storage_space_available else: - if storage_space_available: - dns_check_path = os.path.join(filepath_with_timestamp, consts.DNS_Check) - with open(dns_check_path, 'w+') as dns: - dns.write(formatted_dns_log + "\nCluster DNS check passed successfully.") - return consts.Diagnostic_Check_Passed, storage_space_available + if for_preonboarding_checks: + return consts.Diagnostic_Check_Passed + else: + if storage_space_available: + dns_check_path = os.path.join(filepath_with_timestamp, consts.DNS_Check) + with open(dns_check_path, 'w+') as dns: + dns.write(formatted_dns_log + "\nCluster DNS check passed successfully.") + return consts.Diagnostic_Check_Passed, storage_space_available # For handling storage or OS exception that may occur during the execution except OSError as e: @@ -198,67 +166,56 @@ def check_cluster_DNS(dns_check_log, for_preonboarding_checks=False, filepath_wi shutil.rmtree(filepath_with_timestamp, ignore_errors=False, onerror=None) else: logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing cluster DNS check") diagnoser_output.append("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing cluster DNS check") # To handle any exception that may occur during the execution except Exception as e: logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") + if for_preonboarding_checks: + telemetry.set_user_fault() + else: + diagnoser_output.append("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing cluster DNS check") - diagnoser_output.append("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - return consts.Diagnostic_Check_Incomplete, storage_space_available + return consts.Diagnostic_Check def check_cluster_outbound_connectivity(outbound_connectivity_check_log, for_preonboarding_checks=False, filepath_with_timestamp=None, storage_space_available=False): - if for_preonboarding_checks: - try: - outbound_connectivity_response = outbound_connectivity_check_log[-1:-4:-1] - outbound_connectivity_response = outbound_connectivity_response[::-1] - if consts.Outbound_Connectivity_Check_Result_String not in outbound_connectivity_check_log: - return consts.Diagnostic_Check_Incomplete - # Validating if outbound connectiivty is working or not and displaying proper result - if(outbound_connectivity_response != "000"): - return consts.Diagnostic_Check_Passed - else: - logger.warning("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") - return consts.Diagnostic_Check_Failed - - # For handling storage or OS exception that may occur during the execution - except OSError as e: - logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_user_fault() - telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing Precheck - outbound connectivity in the cluster") - - # To handle any exception that may occur during the execution - except Exception as e: - logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_user_fault() - telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing Precheck - outbound connectivity in the cluster") - - return consts.Diagnostic_Check_Incomplete - else: - global diagnoser_output + if for_preonboarding_checks == False: + global diagnoser_output try: outbound_connectivity_response = outbound_connectivity_check_log[-1:-4:-1] outbound_connectivity_response = outbound_connectivity_response[::-1] if consts.Outbound_Connectivity_Check_Result_String not in outbound_connectivity_check_log: - return consts.Diagnostic_Check_Incomplete, storage_space_available + if for_preonboarding_checks: + return consts.Diagnostic_Check_Incomplete + else: + return consts.Diagnostic_Check_Incomplete, storage_space_available # Validating if outbound connectiivty is working or not and displaying proper result if(outbound_connectivity_response != "000"): - if storage_space_available: - outbound_connectivity_check_path = os.path.join(filepath_with_timestamp, consts.Outbound_Network_Connectivity_Check) - with open(outbound_connectivity_check_path, 'w+') as outbound: - outbound.write("Response code " + outbound_connectivity_response + "\nOutbound network connectivity check passed successfully.") - return consts.Diagnostic_Check_Passed, storage_space_available + if for_preonboarding_checks: + return consts.Diagnostic_Check_Passed + else: + if storage_space_available: + outbound_connectivity_check_path = os.path.join(filepath_with_timestamp, consts.Outbound_Network_Connectivity_Check) + with open(outbound_connectivity_check_path, 'w+') as outbound: + outbound.write("Response code " + outbound_connectivity_response + "\nOutbound network connectivity check passed successfully.") + return consts.Diagnostic_Check_Passed, storage_space_available else: logger.warning("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") - diagnoser_output.append("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") - if storage_space_available: - outbound_connectivity_check_path = os.path.join(filepath_with_timestamp, consts.Outbound_Network_Connectivity_Check) - with open(outbound_connectivity_check_path, 'w+') as outbound: - outbound.write("Response code " + outbound_connectivity_response + "\nWe found an issue with Outbound network connectivity from the cluster.") - return consts.Diagnostic_Check_Failed, storage_space_available + if for_preonboarding_checks: + telemetry.set_user_fault() + telemetry.set_exception(exception="Failed outbound network connectivity from the cluster", fault_type=consts.Outbound_Connectivity_Failed_Fault_Type, + summary="Failed outbound network connectivity from the cluster") + return consts.Diagnostic_Check_Failed + else: + diagnoser_output.append("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") + if storage_space_available: + outbound_connectivity_check_path = os.path.join(filepath_with_timestamp, consts.Outbound_Network_Connectivity_Check) + with open(outbound_connectivity_check_path, 'w+') as outbound: + outbound.write("Response code " + outbound_connectivity_response + "\nWe found an issue with Outbound network connectivity from the cluster.") + return consts.Diagnostic_Check_Failed, storage_space_available # For handling storage or OS exception that may occur during the execution except OSError as e: @@ -268,16 +225,22 @@ def check_cluster_outbound_connectivity(outbound_connectivity_check_log, for_pre shutil.rmtree(filepath_with_timestamp, ignore_errors=False, onerror=None) else: logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") + if for_preonboarding_checks == False: + diagnoser_output.append("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing outbound connectivity check in the cluster") - diagnoser_output.append("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") # To handle any exception that may occur during the execution except Exception as e: logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") + if for_preonboarding_checks == False: + diagnoser_output.append("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") + else: + telemetry.set_user_fault() telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing outbound connectivity check in the cluster") - diagnoser_output.append("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - - return consts.Diagnostic_Check_Incomplete, storage_space_available + if for_preonboarding_checks: + return consts.Diagnostic_Check_Incomplete + else: + return consts.Diagnostic_Check_Incomplete, storage_space_available def add_helm_repo(kube_config, kube_context, helm_client_location): diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 6b1972ccd5e..268833aad77 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -143,12 +143,10 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat absolute_path = os.path.abspath(os.path.dirname(__file__)) kubectl_client_location = install_kubectl_client() helm_client_location = install_helm_client() - release_namespace = get_release_namespace(kube_config, kube_context, helm_client_location) diagnostic_checks = "Failed" batchv1_api_instance = kube_client.BatchV1Api() - corev1_api_instance = kube_client.CoreV1Api() # Performing pre onboarding inspector container check - diagnostic_checks = precheckutils.check_preonboarding_inspector_container(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, release_namespace, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert) + diagnostic_checks = precheckutils.check_preonboarding_inspector_container(api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert) # If all the checks passed then display no error found all_checks_passed = True # for checks in diagnostic_checks: From 37dc6acd625328ff305cedda0953c12fa182e4cb Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Wed, 18 Jan 2023 13:20:42 +0530 Subject: [PATCH 23/62] Your branch is ahead of 'origin/dns-and-outbound-connect-prechecks' by 2 commits. (use "git push" to publish your local commits) From 0cc38b42de061a50f3f8ed5a4df480b5643f1a01 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Wed, 18 Jan 2023 13:33:38 +0530 Subject: [PATCH 24/62] modified: src/connectedk8s/HISTORY.rst modified: src/connectedk8s/setup.py --- src/connectedk8s/HISTORY.rst | 4 +--- src/connectedk8s/setup.py | 4 ---- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/connectedk8s/HISTORY.rst b/src/connectedk8s/HISTORY.rst index 947e8963575..737befb3482 100644 --- a/src/connectedk8s/HISTORY.rst +++ b/src/connectedk8s/HISTORY.rst @@ -2,17 +2,15 @@ Release History =============== -<<<<<<< HEAD 1.3.8 ++++++ * Added DNS and outbound connectivity prechecks in connect command -======= + 1.3.7 ++++++ * Install new helm release in azure-arc-release NS ->>>>>>> 943e8af2032cd7e040d6d95d5e70167c1754f58a 1.3.6 ++++++ diff --git a/src/connectedk8s/setup.py b/src/connectedk8s/setup.py index bdc15e4a23e..d074e94dbb4 100644 --- a/src/connectedk8s/setup.py +++ b/src/connectedk8s/setup.py @@ -17,11 +17,7 @@ # TODO: Confirm this is the right version number you want and it matches your # HISTORY.rst entry. -<<<<<<< HEAD VERSION = '1.3.8' -======= -VERSION = '1.3.7' ->>>>>>> 943e8af2032cd7e040d6d95d5e70167c1754f58a # The full list of classifiers is available at # https://pypi.python.org/pypi?%3Aaction=list_classifiers From 6810ee158360501527b448b477b179e9d1569487 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Wed, 18 Jan 2023 13:50:05 +0530 Subject: [PATCH 25/62] modified: src/connectedk8s/azext_connectedk8s/_utils.py --- src/connectedk8s/azext_connectedk8s/_utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index c0e74958d2c..575e89fea73 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -124,7 +124,7 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex def check_cluster_DNS(dns_check_log, for_preonboarding_checks=False, filepath_with_timestamp=None, storage_space_available=False): - if for_preonboarding_checks == False: + if for_preonboarding_checks is False: global diagnoser_output try: if consts.DNS_Check_Result_String not in dns_check_log: @@ -174,7 +174,7 @@ def check_cluster_DNS(dns_check_log, for_preonboarding_checks=False, filepath_wi logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") if for_preonboarding_checks: telemetry.set_user_fault() - else: + else: diagnoser_output.append("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing cluster DNS check") @@ -182,7 +182,7 @@ def check_cluster_DNS(dns_check_log, for_preonboarding_checks=False, filepath_wi def check_cluster_outbound_connectivity(outbound_connectivity_check_log, for_preonboarding_checks=False, filepath_with_timestamp=None, storage_space_available=False): - if for_preonboarding_checks == False: + if for_preonboarding_checks is False: global diagnoser_output try: outbound_connectivity_response = outbound_connectivity_check_log[-1:-4:-1] @@ -225,21 +225,21 @@ def check_cluster_outbound_connectivity(outbound_connectivity_check_log, for_pre shutil.rmtree(filepath_with_timestamp, ignore_errors=False, onerror=None) else: logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - if for_preonboarding_checks == False: + if for_preonboarding_checks is False: diagnoser_output.append("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing outbound connectivity check in the cluster") # To handle any exception that may occur during the execution except Exception as e: logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - if for_preonboarding_checks == False: + if for_preonboarding_checks is False: diagnoser_output.append("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") else: telemetry.set_user_fault() telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing outbound connectivity check in the cluster") if for_preonboarding_checks: - return consts.Diagnostic_Check_Incomplete - else: + return consts.Diagnostic_Check_Incomplete + else: return consts.Diagnostic_Check_Incomplete, storage_space_available From 4ef28a6fa8c92c7950eed8725a887020a16651e2 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Wed, 18 Jan 2023 15:31:20 +0530 Subject: [PATCH 26/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py modified: src/connectedk8s/azext_connectedk8s/_utils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- .../azext_connectedk8s/_constants.py | 6 ++-- .../azext_connectedk8s/_precheckutils.py | 10 +++--- src/connectedk8s/azext_connectedk8s/_utils.py | 34 +++---------------- src/connectedk8s/azext_connectedk8s/custom.py | 1 + 4 files changed, 14 insertions(+), 37 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index a73f760cf95..3cde8968311 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -181,9 +181,9 @@ Events_of_Incomplete_Diagnoser_Job = "diagnoser_failure_events.txt" # Connect Precheck Diagnoser constants Pre_Onboarding_Inspector_Job_Registry_Path = "connectprecheck.azurecr.io/helm/pre-onboarding-inspector:0.1.0" -Pre_Onboarding_Inspector_Check_Failed_Fault_Type = "Error occured while running the pre onboarding inspector" -Pre_Onboarding_Inspector_Helm_Release_Failed_Fault_Type = "Error while installing pre onboarding inspector helm release" -Pre_Onboarding_Inspector_Failed_Fault_Type = "Error while executing pre onboarding inspector Job" +Pre_Onboarding_Inspector_Check_Execution_Failed_Fault_Type = "Error occured while running the pre onboarding inspector" +Pre_Onboarding_Inspector_Helm_Install_Failed_Fault_Type = "Error while installing pre onboarding inspector helm release" +Pre_Onboarding_Inspector_Failed_Fault_Type = "Error while executing pre onboarding inspector in the cluster" Pre_Onboarding_Inspector_Pull_HelmChart_Fault_Type = 'pre-onboarding-inspector-helm-chart-pull-error' Pre_Onboarding_Inspector_Export_HelmChart_Fault_Type = 'pre-onboarding-inspector-helm-chart-export-error' Pre_Onboarding_Inspector_Install_HelmRelease_Fault_Type = 'pre-onboarding-inspector-helm-release-install-error' diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index aeddf05fea1..58994312c2c 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -84,7 +84,7 @@ def check_preonboarding_inspector_container(corev1_api_instance, batchv1_api_ins # To handle any exception that may occur during the execution except Exception as e: logger.warning("An exception has occured while trying to perform pre onboarding inspector container on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Pre_Onboarding_Inspector_Check_Failed_Fault_Type, summary="Error occured while performing the pre onboarding inspector container") + telemetry.set_exception(exception=e, fault_type=consts.Pre_Onboarding_Inspector_Check_Execution_Failed_Fault_Type, summary="Error occured while performing the pre onboarding inspector container") return consts.Diagnostic_Check_Incomplete @@ -129,12 +129,12 @@ def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_insta try: chart_path = azext_utils.get_chart_path(consts.Pre_Onboarding_Inspector_Job_Registry_Path, kube_config, kube_context, helm_client_location, 'ConnectPrecheckCharts') - helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location) + helm_install_release_preonboarding_inspector(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location) # To handle the Exception that occured except Exception as e: logger.warning("An error occured while installing helm release of pre onboarding inspector in the cluster. Exception:") logger.warning(str(e)) - telemetry.set_exception(exception=e, fault_type=consts.Pre_Onboarding_Inspector_Helm_Release_Failed_Fault_Type, summary="Error while installing pre onboarding inspector helm release") + telemetry.set_exception(exception=e, fault_type=consts.Pre_Onboarding_Inspector_Helm_Install_Failed_Fault_Type, summary="Error while installing pre onboarding inspector helm release") # Deleting all the stale resources that got created Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return @@ -162,7 +162,7 @@ def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_insta Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return elif (is_job_scheduled is True and is_job_complete is False): - logger.warning("Unable to finish the pre onboarding inspector job in the kubernetes cluster. The possible reasons can be presence of lack of Resources on the cluster.\n") + logger.warning("Unable to finish the pre onboarding inspector job in the kubernetes cluster. The possible reasons can be resource constraints on the cluster.\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return else: @@ -188,7 +188,7 @@ def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_insta return preonboarding_inspector_container_log -def helm_install_release(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="60"): +def helm_install_release_preonboarding_inspector(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="60"): cmd_helm_install = [helm_client_location, "upgrade", "--install", "pre-onboarding-inspector", chart_path] # To set some other helm parameters through file if https_proxy: diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index 575e89fea73..082af871bd6 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -158,27 +158,17 @@ def check_cluster_DNS(dns_check_log, for_preonboarding_checks=False, filepath_wi dns.write(formatted_dns_log + "\nCluster DNS check passed successfully.") return consts.Diagnostic_Check_Passed, storage_space_available - # For handling storage or OS exception that may occur during the execution - except OSError as e: - if "[Errno 28]" in str(e): - storage_space_available = False - telemetry.set_exception(exception=e, fault_type=consts.No_Storage_Space_Available_Fault_Type, summary="No space left on device") - shutil.rmtree(filepath_with_timestamp, ignore_errors=False, onerror=None) - else: - logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - diagnoser_output.append("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing cluster DNS check") - # To handle any exception that may occur during the execution except Exception as e: logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - if for_preonboarding_checks: - telemetry.set_user_fault() - else: + if for_preonboarding_checks is False: diagnoser_output.append("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing cluster DNS check") - return consts.Diagnostic_Check + if for_preonboarding_checks: + return consts.Diagnostic_Check_Incomplete + else: + return consts.Diagnostic_Check_Incomplete, storage_space_available def check_cluster_outbound_connectivity(outbound_connectivity_check_log, for_preonboarding_checks=False, filepath_with_timestamp=None, storage_space_available=False): @@ -217,25 +207,11 @@ def check_cluster_outbound_connectivity(outbound_connectivity_check_log, for_pre outbound.write("Response code " + outbound_connectivity_response + "\nWe found an issue with Outbound network connectivity from the cluster.") return consts.Diagnostic_Check_Failed, storage_space_available - # For handling storage or OS exception that may occur during the execution - except OSError as e: - if "[Errno 28]" in str(e): - storage_space_available = False - telemetry.set_exception(exception=e, fault_type=consts.No_Storage_Space_Available_Fault_Type, summary="No space left on device") - shutil.rmtree(filepath_with_timestamp, ignore_errors=False, onerror=None) - else: - logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - if for_preonboarding_checks is False: - diagnoser_output.append("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing outbound connectivity check in the cluster") - # To handle any exception that may occur during the execution except Exception as e: logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") if for_preonboarding_checks is False: diagnoser_output.append("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - else: - telemetry.set_user_fault() telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing outbound connectivity check in the cluster") if for_preonboarding_checks: return consts.Diagnostic_Check_Incomplete diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 6e12992e6e4..2bad7e26fdf 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -161,6 +161,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat raise ManualInterrupt('Process terminated externally.') if all_checks_passed is False: + logger.warning("One or more cluster diagnostics check failed.") return required_node_exists = check_linux_amd64_node(node_api_response) From 9b932c06d9e439d2271c880fea8b12a7b474540b Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Wed, 18 Jan 2023 15:33:14 +0530 Subject: [PATCH 27/62] modified: src/connectedk8s/setup.py --- src/connectedk8s/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectedk8s/setup.py b/src/connectedk8s/setup.py index d074e94dbb4..ef5b845dcfe 100644 --- a/src/connectedk8s/setup.py +++ b/src/connectedk8s/setup.py @@ -17,7 +17,7 @@ # TODO: Confirm this is the right version number you want and it matches your # HISTORY.rst entry. -VERSION = '1.3.8' +VERSION = '1.3.9' # The full list of classifiers is available at # https://pypi.python.org/pypi?%3Aaction=list_classifiers From c1cc9a4001c3267892ea4154f79fd29e99a89461 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Wed, 18 Jan 2023 15:33:36 +0530 Subject: [PATCH 28/62] modified: src/connectedk8s/HISTORY.rst --- src/connectedk8s/HISTORY.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectedk8s/HISTORY.rst b/src/connectedk8s/HISTORY.rst index ead818ca03d..fbec032acd8 100644 --- a/src/connectedk8s/HISTORY.rst +++ b/src/connectedk8s/HISTORY.rst @@ -5,7 +5,7 @@ Release History 1.3.9 ++++++ -* * Added DNS and outbound connectivity prechecks in connect command +* Added DNS and outbound connectivity prechecks in connect command 1.3.8 ++++++ From 48b64c611db4859204a2a3f6585b9b16662cb401 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Wed, 18 Jan 2023 16:14:28 +0530 Subject: [PATCH 29/62] modified: src/connectedk8s/azext_connectedk8s/_utils.py --- src/connectedk8s/azext_connectedk8s/_utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index 082af871bd6..644289a1374 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -104,6 +104,8 @@ def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_locati response_helm_chart_pull = subprocess.Popen(cmd_helm_chart_pull, stdout=PIPE, stderr=PIPE) _, error_helm_chart_pull = response_helm_chart_pull.communicate() if response_helm_chart_pull.returncode != 0: + if chart_path_name is 'AzureArcCharts': + chart_path_name = 'azure-arc' telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pull_HelmChart_Fault_Type, summary="Unable to pull {} helm charts from the registry".format(chart_path_name)) raise CLIInternalError("Unable to pull {} helm chart from the registry '{}': ".format(chart_path_name, registry_path) + error_helm_chart_pull.decode("ascii")) @@ -118,9 +120,11 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex response_helm_chart_export = subprocess.Popen(cmd_helm_chart_export, stdout=PIPE, stderr=PIPE) _, error_helm_chart_export = response_helm_chart_export.communicate() if response_helm_chart_export.returncode != 0: - telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Export_HelmChart_Fault_Type, - summary='Unable to export {} helm chart from the registry'.format(chart_path_name)) - raise CLIInternalError("Unable to export {} helm chart from the registry '{}': ".format(chart_path_name, registry_path) + error_helm_chart_export.decode("ascii")) + if chart_path_name is 'AzureArcCharts': + chart_path_name = 'azure-arc' + telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Export_HelmChart_Fault_Type, + summary='Unable to export {} helm chart from the registry'.format(chart_path_name)) + raise CLIInternalError("Unable to export {} helm chart from the registry '{}': ".format(chart_path_name, registry_path) + error_helm_chart_export.decode("ascii")) def check_cluster_DNS(dns_check_log, for_preonboarding_checks=False, filepath_with_timestamp=None, storage_space_available=False): From 5300ed65943b2931112cc7c856e36069df9f789d Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Wed, 18 Jan 2023 17:04:43 +0530 Subject: [PATCH 30/62] modified: src/connectedk8s/azext_connectedk8s/_utils.py --- src/connectedk8s/azext_connectedk8s/_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index 644289a1374..185f897b627 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -104,7 +104,7 @@ def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_locati response_helm_chart_pull = subprocess.Popen(cmd_helm_chart_pull, stdout=PIPE, stderr=PIPE) _, error_helm_chart_pull = response_helm_chart_pull.communicate() if response_helm_chart_pull.returncode != 0: - if chart_path_name is 'AzureArcCharts': + if chart_path_name == 'AzureArcCharts': chart_path_name = 'azure-arc' telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pull_HelmChart_Fault_Type, summary="Unable to pull {} helm charts from the registry".format(chart_path_name)) @@ -120,7 +120,7 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex response_helm_chart_export = subprocess.Popen(cmd_helm_chart_export, stdout=PIPE, stderr=PIPE) _, error_helm_chart_export = response_helm_chart_export.communicate() if response_helm_chart_export.returncode != 0: - if chart_path_name is 'AzureArcCharts': + if chart_path_name == 'AzureArcCharts': chart_path_name = 'azure-arc' telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Export_HelmChart_Fault_Type, summary='Unable to export {} helm chart from the registry'.format(chart_path_name)) From cd86de8d19a673d7405cef32b1d7b9036cdd5ca2 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Thu, 19 Jan 2023 16:47:24 +0530 Subject: [PATCH 31/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/azext_connectedk8s/_utils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- .../azext_connectedk8s/_constants.py | 6 ++--- src/connectedk8s/azext_connectedk8s/_utils.py | 26 ++++++++++--------- src/connectedk8s/azext_connectedk8s/custom.py | 3 +-- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index dd3af9f0020..c6674770cdc 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -135,8 +135,8 @@ Diagnoser_Container_Check_Failed_Fault_Type = "Error occured while performing the diagnoser container checks" Cluster_DNS_Check_Fault_Type = "Error occured while performing cluster DNS check" Outbound_Connectivity_Check_Fault_Type = "Error occured while performing outbound connectivity check in the cluster" -Outbound_Connectivity_Failed_Fault_Type = "Failed outbound network connectivity from the cluster" -DNS_Failed_Fault_Type = "DNS not working in the cluster" +Outbound_Connectivity_Failed_Fault_Type = "Outbound network connectivity failed in onboarding pre-checks" +DNS_Failed_Fault_Type = "DNS resolution failed in onboarding pre-checks" MSI_Cert_Check_Fault_Type = "Error occurred while trying to perform MSI ceritificate presence check" Cluster_Security_Policy_Check_Fault_Type = "Error occured while performing cluster security policy check" KAP_Cert_Check_Fault_Type = "Error occurred while trying to perform KAP ceritificate presence check" @@ -179,7 +179,7 @@ Outbound_Network_Connectivity_Check = "outbound_network_connectivity_check.txt" Events_of_Incomplete_Diagnoser_Job = "diagnoser_failure_events.txt" # Connect Precheck Diagnoser constants -Pre_Onboarding_Inspector_Job_Registry_Path = "connectprecheck.azurecr.io/helm/pre-onboarding-inspector:0.1.0" +Pre_Onboarding_Inspector_Job_Registry_Path = "arck8sdiagnoser.azurecr.io/public/pre-onboarding-inspector:0.1.0" Pre_Onboarding_Inspector_Check_Execution_Failed_Fault_Type = "Error occured while running the pre onboarding inspector" Pre_Onboarding_Inspector_Helm_Install_Failed_Fault_Type = "Error while installing pre onboarding inspector helm release" Pre_Onboarding_Inspector_Failed_Fault_Type = "Error while executing pre onboarding inspector in the cluster" diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index 185f897b627..51b9bbc1d76 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -72,7 +72,10 @@ def validate_location(cmd, location): def get_chart_path(registry_path, kube_config, kube_context, helm_client_location, chart_path_name='AzureArcCharts'): # Pulling helm chart from registry os.environ['HELM_EXPERIMENTAL_OCI'] = '1' - pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, chart_path_name) + if chart_path_name == 'ConnectPrecheckCharts': + pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, chart_path_name) + else: + pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, 'azure-arc') # Exporting helm chart after cleanup chart_export_path = os.path.join(os.path.expanduser('~'), '.azure', chart_path_name) @@ -82,7 +85,10 @@ def get_chart_path(registry_path, kube_config, kube_context, helm_client_locatio except: logger.warning("Unable to cleanup the {} already present on the machine. In case of failure, please cleanup the directory '{}' and try again.".format(chart_path_name, chart_export_path)) - export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, chart_path_name) + if chart_path_name == 'ConnectPrecheckCharts': + export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, chart_path_name) + else: + export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, 'azure-arc') # Returning helm chart path if chart_path_name == 'ConnectPrecheckCharts': @@ -95,7 +101,7 @@ def get_chart_path(registry_path, kube_config, kube_context, helm_client_locatio return chart_path -def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, chart_path_name='azure-arc'): +def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, chart_name='azure-arc'): cmd_helm_chart_pull = [helm_client_location, "chart", "pull", registry_path] if kube_config: cmd_helm_chart_pull.extend(["--kubeconfig", kube_config]) @@ -104,14 +110,12 @@ def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_locati response_helm_chart_pull = subprocess.Popen(cmd_helm_chart_pull, stdout=PIPE, stderr=PIPE) _, error_helm_chart_pull = response_helm_chart_pull.communicate() if response_helm_chart_pull.returncode != 0: - if chart_path_name == 'AzureArcCharts': - chart_path_name = 'azure-arc' telemetry.set_exception(exception=error_helm_chart_pull.decode("ascii"), fault_type=consts.Pull_HelmChart_Fault_Type, - summary="Unable to pull {} helm charts from the registry".format(chart_path_name)) - raise CLIInternalError("Unable to pull {} helm chart from the registry '{}': ".format(chart_path_name, registry_path) + error_helm_chart_pull.decode("ascii")) + summary="Unable to pull {} helm charts from the registry".format(chart_name)) + raise CLIInternalError("Unable to pull {} helm chart from the registry '{}': ".format(chart_name, registry_path) + error_helm_chart_pull.decode("ascii")) -def export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, chart_path_name='azure-arc'): +def export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, chart_name='azure-arc'): cmd_helm_chart_export = [helm_client_location, "chart", "export", registry_path, "--destination", chart_export_path] if kube_config: cmd_helm_chart_export.extend(["--kubeconfig", kube_config]) @@ -120,11 +124,9 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex response_helm_chart_export = subprocess.Popen(cmd_helm_chart_export, stdout=PIPE, stderr=PIPE) _, error_helm_chart_export = response_helm_chart_export.communicate() if response_helm_chart_export.returncode != 0: - if chart_path_name == 'AzureArcCharts': - chart_path_name = 'azure-arc' telemetry.set_exception(exception=error_helm_chart_export.decode("ascii"), fault_type=consts.Export_HelmChart_Fault_Type, - summary='Unable to export {} helm chart from the registry'.format(chart_path_name)) - raise CLIInternalError("Unable to export {} helm chart from the registry '{}': ".format(chart_path_name, registry_path) + error_helm_chart_export.decode("ascii")) + summary='Unable to export {} helm chart from the registry'.format(chart_name)) + raise CLIInternalError("Unable to export {} helm chart from the registry '{}': ".format(chart_name, registry_path) + error_helm_chart_export.decode("ascii")) def check_cluster_DNS(dns_check_log, for_preonboarding_checks=False, filepath_with_timestamp=None, storage_space_available=False): diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 63033a7c232..eecebef2e47 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -161,8 +161,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat raise ManualInterrupt('Process terminated externally.') if all_checks_passed is False: - logger.warning("One or more cluster diagnostics check failed.") - return + raise CLIError("One or more cluster diagnostic checks failed.Please refer to the troubleshooting docs for the diagnostic checks and try onboarding again.") required_node_exists = check_linux_amd64_node(node_api_response) if not required_node_exists: From 9f43bc44bb3b7934d0fc6acce245cd38c069b39a Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Thu, 19 Jan 2023 16:56:25 +0530 Subject: [PATCH 32/62] modified: src/connectedk8s/azext_connectedk8s/custom.py --- src/connectedk8s/azext_connectedk8s/custom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index eecebef2e47..f835c2326a5 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -161,7 +161,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat raise ManualInterrupt('Process terminated externally.') if all_checks_passed is False: - raise CLIError("One or more cluster diagnostic checks failed.Please refer to the troubleshooting docs for the diagnostic checks and try onboarding again.") + raise CLIError("One or more cluster diagnostic checks failed and hence the cluster cannot be onboarded. Please refer to the troubleshooting docs for the diagnostic checks and try onboarding again.") required_node_exists = check_linux_amd64_node(node_api_response) if not required_node_exists: From ad27d1e4c4edde5d6fcbc31a4047b91fc155f198 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Thu, 19 Jan 2023 17:00:00 +0530 Subject: [PATCH 33/62] modified: src/connectedk8s/azext_connectedk8s/custom.py --- src/connectedk8s/azext_connectedk8s/custom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index f835c2326a5..747ecd8930d 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -161,7 +161,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat raise ManualInterrupt('Process terminated externally.') if all_checks_passed is False: - raise CLIError("One or more cluster diagnostic checks failed and hence the cluster cannot be onboarded. Please refer to the troubleshooting docs for the diagnostic checks and try onboarding again.") + raise UnclassifiedUserFault("One or more cluster diagnostic checks failed and hence the cluster cannot be onboarded. Please refer to the troubleshooting docs for the diagnostic checks and try onboarding again.") required_node_exists = check_linux_amd64_node(node_api_response) if not required_node_exists: From 7097f17b80b77ea7dcc1df242a317488c8e6873d Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Thu, 19 Jan 2023 22:33:42 +0530 Subject: [PATCH 34/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py modified: src/connectedk8s/azext_connectedk8s/_utils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- src/connectedk8s/azext_connectedk8s/_constants.py | 5 +++-- src/connectedk8s/azext_connectedk8s/_precheckutils.py | 6 +++++- src/connectedk8s/azext_connectedk8s/_utils.py | 4 +--- src/connectedk8s/azext_connectedk8s/custom.py | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index c6674770cdc..bc475c0458e 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -180,12 +180,13 @@ Events_of_Incomplete_Diagnoser_Job = "diagnoser_failure_events.txt" # Connect Precheck Diagnoser constants Pre_Onboarding_Inspector_Job_Registry_Path = "arck8sdiagnoser.azurecr.io/public/pre-onboarding-inspector:0.1.0" -Pre_Onboarding_Inspector_Check_Execution_Failed_Fault_Type = "Error occured while running the pre onboarding inspector" Pre_Onboarding_Inspector_Helm_Install_Failed_Fault_Type = "Error while installing pre onboarding inspector helm release" -Pre_Onboarding_Inspector_Failed_Fault_Type = "Error while executing pre onboarding inspector in the cluster" +Pre_Onboarding_Inspector_Failed_Fault_Type = "Error occured while running pre onboarding inspector" Pre_Onboarding_Inspector_Pull_HelmChart_Fault_Type = 'pre-onboarding-inspector-helm-chart-pull-error' Pre_Onboarding_Inspector_Export_HelmChart_Fault_Type = 'pre-onboarding-inspector-helm-chart-export-error' Pre_Onboarding_Inspector_Install_HelmRelease_Fault_Type = 'pre-onboarding-inspector-helm-release-install-error' +Pre_Onboarding_Inspector_Job_Not_Scheduled = 'Unable to schedule pre-onboarding-inspector job' +Pre_Onboarding_Inspector_Job_Not_Complete = 'Unable to complete pre-onboarding-inspector job after scheduling' # Diagnostic Results Name Outbound_Connectivity_Check_Result_String = "Outbound Network Connectivity Result:" DNS_Check_Result_String = "DNS Result:" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 58994312c2c..a45a0e75b9b 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -84,7 +84,7 @@ def check_preonboarding_inspector_container(corev1_api_instance, batchv1_api_ins # To handle any exception that may occur during the execution except Exception as e: logger.warning("An exception has occured while trying to perform pre onboarding inspector container on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Pre_Onboarding_Inspector_Check_Execution_Failed_Fault_Type, summary="Error occured while performing the pre onboarding inspector container") + telemetry.set_exception(exception=e, fault_type=consts.Pre_Onboarding_Inspector_Failed_Fault_Type, summary="Error occured while executing the pre onboarding inspector container") return consts.Diagnostic_Check_Incomplete @@ -158,10 +158,14 @@ def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_insta continue if (is_job_scheduled is False): + telemetry.set_exception(exception="Couldn't schedule pre onboarding inspector job in the cluster", fault_type=consts.Pre_Onboarding_Inspector_Job_Not_Scheduled, + summary="Couldn't schedule pre onboarding inspector job in the cluster") logger.warning("Unable to schedule the pre onboarding inspector job in the kubernetes cluster. The possible reasons can be presence of a security policy or security context constraint (SCC) or it may happen becuase of lack of ResourceQuota.\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return elif (is_job_scheduled is True and is_job_complete is False): + telemetry.set_exception(exception="Couldn't complete pre onboarding inspector job after scheduling in the cluster", fault_type=consts.Pre_Onboarding_Inspector_Job_Not_Scheduled, + summary="Couldn't complete pre onboarding inspector job after scheduling in the cluster") logger.warning("Unable to finish the pre onboarding inspector job in the kubernetes cluster. The possible reasons can be resource constraints on the cluster.\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index 51b9bbc1d76..9aacb78ca02 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -88,7 +88,7 @@ def get_chart_path(registry_path, kube_config, kube_context, helm_client_locatio if chart_path_name == 'ConnectPrecheckCharts': export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, chart_path_name) else: - export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, 'azure-arc') + export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, 'azure-arc') # Returning helm chart path if chart_path_name == 'ConnectPrecheckCharts': @@ -143,7 +143,6 @@ def check_cluster_DNS(dns_check_log, for_preonboarding_checks=False, filepath_wi if("NXDOMAIN" in formatted_dns_log or "connection timed out" in formatted_dns_log): logger.warning("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") if for_preonboarding_checks: - telemetry.set_user_fault() telemetry.set_exception(exception="DNS not working in the cluster", fault_type=consts.DNS_Failed_Fault_Type, summary="DNS not working in the cluster") return consts.Diagnostic_Check_Failed @@ -201,7 +200,6 @@ def check_cluster_outbound_connectivity(outbound_connectivity_check_log, for_pre else: logger.warning("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") if for_preonboarding_checks: - telemetry.set_user_fault() telemetry.set_exception(exception="Failed outbound network connectivity from the cluster", fault_type=consts.Outbound_Connectivity_Failed_Fault_Type, summary="Failed outbound network connectivity from the cluster") return consts.Diagnostic_Check_Failed diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 747ecd8930d..70cc9922a5f 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -161,7 +161,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat raise ManualInterrupt('Process terminated externally.') if all_checks_passed is False: - raise UnclassifiedUserFault("One or more cluster diagnostic checks failed and hence the cluster cannot be onboarded. Please refer to the troubleshooting docs for the diagnostic checks and try onboarding again.") + raise ValidationError("One or more cluster diagnostic checks failed and hence the cluster cannot be onboarded. Please refer to the troubleshooting docs for the diagnostic checks and try onboarding again.") required_node_exists = check_linux_amd64_node(node_api_response) if not required_node_exists: From f345579dc6defce5d0c02ce7a13682f323fc6aea Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Thu, 19 Jan 2023 23:58:03 +0530 Subject: [PATCH 35/62] modified: src/connectedk8s/azext_connectedk8s/custom.py --- src/connectedk8s/azext_connectedk8s/custom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 70cc9922a5f..0b3667a4ec3 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -161,7 +161,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat raise ManualInterrupt('Process terminated externally.') if all_checks_passed is False: - raise ValidationError("One or more cluster diagnostic checks failed and hence the cluster cannot be onboarded. Please refer to the troubleshooting docs for the diagnostic checks and try onboarding again.") + raise ValidationError("One or more diagnostic checks failed and hence the cluster cannot be onboarded. Please refer to the troubleshooting docs for the diagnostic checks and try onboarding again.") required_node_exists = check_linux_amd64_node(node_api_response) if not required_node_exists: From 1904597572905493793ae0104b90fcae244ddec3 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Fri, 20 Jan 2023 00:09:33 +0530 Subject: [PATCH 36/62] modified: src/connectedk8s/azext_connectedk8s/custom.py --- src/connectedk8s/azext_connectedk8s/custom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 0b3667a4ec3..70cc9922a5f 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -161,7 +161,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat raise ManualInterrupt('Process terminated externally.') if all_checks_passed is False: - raise ValidationError("One or more diagnostic checks failed and hence the cluster cannot be onboarded. Please refer to the troubleshooting docs for the diagnostic checks and try onboarding again.") + raise ValidationError("One or more cluster diagnostic checks failed and hence the cluster cannot be onboarded. Please refer to the troubleshooting docs for the diagnostic checks and try onboarding again.") required_node_exists = check_linux_amd64_node(node_api_response) if not required_node_exists: From e8c9e8d4214776f03910e0762e457b1c5d866829 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Fri, 20 Jan 2023 00:20:37 +0530 Subject: [PATCH 37/62] modified: src/connectedk8s/azext_connectedk8s/custom.py --- src/connectedk8s/azext_connectedk8s/custom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 70cc9922a5f..0b3667a4ec3 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -161,7 +161,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat raise ManualInterrupt('Process terminated externally.') if all_checks_passed is False: - raise ValidationError("One or more cluster diagnostic checks failed and hence the cluster cannot be onboarded. Please refer to the troubleshooting docs for the diagnostic checks and try onboarding again.") + raise ValidationError("One or more diagnostic checks failed and hence the cluster cannot be onboarded. Please refer to the troubleshooting docs for the diagnostic checks and try onboarding again.") required_node_exists = check_linux_amd64_node(node_api_response) if not required_node_exists: From 24a6b0412191466123f88fce70d3434787152017 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Fri, 20 Jan 2023 11:28:24 +0530 Subject: [PATCH 38/62] modified: src/connectedk8s/azext_connectedk8s/custom.py --- src/connectedk8s/azext_connectedk8s/custom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 0b3667a4ec3..70cc9922a5f 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -161,7 +161,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat raise ManualInterrupt('Process terminated externally.') if all_checks_passed is False: - raise ValidationError("One or more diagnostic checks failed and hence the cluster cannot be onboarded. Please refer to the troubleshooting docs for the diagnostic checks and try onboarding again.") + raise ValidationError("One or more cluster diagnostic checks failed and hence the cluster cannot be onboarded. Please refer to the troubleshooting docs for the diagnostic checks and try onboarding again.") required_node_exists = check_linux_amd64_node(node_api_response) if not required_node_exists: From eb7a3378b55efe0b52451047a97d3d565f1b8e58 Mon Sep 17 00:00:00 2001 From: Siri Teja Reddy Kasireddy Date: Fri, 20 Jan 2023 16:24:34 +0530 Subject: [PATCH 39/62] pass location param to precheckcharts --- src/connectedk8s/azext_connectedk8s/_precheckutils.py | 11 ++++++----- src/connectedk8s/azext_connectedk8s/custom.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index a45a0e75b9b..af98b000d3e 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -45,13 +45,13 @@ # pylint: disable -def check_preonboarding_inspector_container(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert): +def check_preonboarding_inspector_container(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert): try: # Setting DNS and Outbound Check as working dns_check = "Starting" outbound_connectivity_check = "Starting" # Executing the pre onboarding inspector job and fetching the logs obtained - preonboarding_inspector_container_log = executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert) + preonboarding_inspector_container_log = executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert) # If preonboarding_inspector_container_log is not empty then only we will check for the results if(preonboarding_inspector_container_log is not None and preonboarding_inspector_container_log != ""): preonboarding_inspector_container_log_list = preonboarding_inspector_container_log.split("\n") @@ -89,7 +89,7 @@ def check_preonboarding_inspector_container(corev1_api_instance, batchv1_api_ins return consts.Diagnostic_Check_Incomplete -def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert): +def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert): job_name = "pre-onboarding-inspector-job" # Setting the log output as Empty preonboarding_inspector_container_log = "" @@ -129,7 +129,7 @@ def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_insta try: chart_path = azext_utils.get_chart_path(consts.Pre_Onboarding_Inspector_Job_Registry_Path, kube_config, kube_context, helm_client_location, 'ConnectPrecheckCharts') - helm_install_release_preonboarding_inspector(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location) + helm_install_release_preonboarding_inspector(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location) # To handle the Exception that occured except Exception as e: logger.warning("An error occured while installing helm release of pre onboarding inspector in the cluster. Exception:") @@ -192,9 +192,10 @@ def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_insta return preonboarding_inspector_container_log -def helm_install_release_preonboarding_inspector(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="60"): +def helm_install_release_preonboarding_inspector(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="60"): cmd_helm_install = [helm_client_location, "upgrade", "--install", "pre-onboarding-inspector", chart_path] # To set some other helm parameters through file + cmd_helm_install.extend(["--set", "global,location={}".format(location)]) if https_proxy: cmd_helm_install.extend(["--set", "global.httpsProxy={}".format(https_proxy)]) if http_proxy: diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 70cc9922a5f..b003218ef79 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -146,7 +146,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat diagnostic_checks = "Failed" batchv1_api_instance = kube_client.BatchV1Api() # Performing pre onboarding inspector container check - diagnostic_checks = precheckutils.check_preonboarding_inspector_container(api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, http_proxy, https_proxy, no_proxy, proxy_cert) + diagnostic_checks = precheckutils.check_preonboarding_inspector_container(api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert) # If all the checks passed then display no error found all_checks_passed = True # for checks in diagnostic_checks: From 36d6bda238c8cdfa452c28c5c4e3f37393278eca Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Fri, 20 Jan 2023 16:44:17 +0530 Subject: [PATCH 40/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- .../azext_connectedk8s/_constants.py | 14 ++++++-------- .../azext_connectedk8s/_precheckutils.py | 16 ++++++++-------- src/connectedk8s/azext_connectedk8s/custom.py | 2 +- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index bc475c0458e..5450695011e 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -179,14 +179,12 @@ Outbound_Network_Connectivity_Check = "outbound_network_connectivity_check.txt" Events_of_Incomplete_Diagnoser_Job = "diagnoser_failure_events.txt" # Connect Precheck Diagnoser constants -Pre_Onboarding_Inspector_Job_Registry_Path = "arck8sdiagnoser.azurecr.io/public/pre-onboarding-inspector:0.1.0" -Pre_Onboarding_Inspector_Helm_Install_Failed_Fault_Type = "Error while installing pre onboarding inspector helm release" -Pre_Onboarding_Inspector_Failed_Fault_Type = "Error occured while running pre onboarding inspector" -Pre_Onboarding_Inspector_Pull_HelmChart_Fault_Type = 'pre-onboarding-inspector-helm-chart-pull-error' -Pre_Onboarding_Inspector_Export_HelmChart_Fault_Type = 'pre-onboarding-inspector-helm-chart-export-error' -Pre_Onboarding_Inspector_Install_HelmRelease_Fault_Type = 'pre-onboarding-inspector-helm-release-install-error' -Pre_Onboarding_Inspector_Job_Not_Scheduled = 'Unable to schedule pre-onboarding-inspector job' -Pre_Onboarding_Inspector_Job_Not_Complete = 'Unable to complete pre-onboarding-inspector job after scheduling' +Cluster_Diagnostic_Checks_Job_Registry_Path = "arck8sdiagnoser.azurecr.io/public/pre-onboarding-inspector:0.1.0" +Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type = "Error while installing pre onboarding inspector helm release" +Cluster_Diagnostic_Checks_Failed_Fault_Type = "Error occured while running pre onboarding inspector" +Cluster_Diagnostic_Checks_Install_HelmRelease_Fault_Type = 'pre-onboarding-inspector-helm-release-install-error' +Cluster_Diagnostic_Checks_Job_Not_Scheduled = 'Unable to schedule pre-onboarding-inspector job' +Cluster_Diagnostic_Checks_Job_Not_Complete = 'Unable to complete pre-onboarding-inspector job after scheduling' # Diagnostic Results Name Outbound_Connectivity_Check_Result_String = "Outbound Network Connectivity Result:" DNS_Check_Result_String = "DNS Result:" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index a45a0e75b9b..e7af31ef9cb 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -84,7 +84,7 @@ def check_preonboarding_inspector_container(corev1_api_instance, batchv1_api_ins # To handle any exception that may occur during the execution except Exception as e: logger.warning("An exception has occured while trying to perform pre onboarding inspector container on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Pre_Onboarding_Inspector_Failed_Fault_Type, summary="Error occured while executing the pre onboarding inspector container") + telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Failed_Fault_Type, summary="Error occured while executing the pre onboarding inspector container") return consts.Diagnostic_Check_Incomplete @@ -124,17 +124,17 @@ def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_insta # If any exception occured we will print the exception and return if exception_occured_counter == 1: logger.warning("An error occured while installing the pre onboarding inspector helm release in the cluster. Exception:") - telemetry.set_exception(exception=error_kubectl_delete_helm.decode("ascii"), fault_type=consts.Pre_Onboarding_Inspector_Failed_Fault_Type, summary="Error while executing pre onboarding inspector Job") + telemetry.set_exception(exception=error_kubectl_delete_helm.decode("ascii"), fault_type=consts.Cluster_Diagnostic_Checks_Failed_Fault_Type, summary="Error while executing pre onboarding inspector Job") return try: - chart_path = azext_utils.get_chart_path(consts.Pre_Onboarding_Inspector_Job_Registry_Path, kube_config, kube_context, helm_client_location, 'ConnectPrecheckCharts') + chart_path = azext_utils.get_chart_path(consts.Cluster_Diagnostic_Checks_Job_Registry_Path, kube_config, kube_context, helm_client_location, 'ConnectPrecheckCharts') helm_install_release_preonboarding_inspector(chart_path, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location) # To handle the Exception that occured except Exception as e: logger.warning("An error occured while installing helm release of pre onboarding inspector in the cluster. Exception:") logger.warning(str(e)) - telemetry.set_exception(exception=e, fault_type=consts.Pre_Onboarding_Inspector_Helm_Install_Failed_Fault_Type, summary="Error while installing pre onboarding inspector helm release") + telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type, summary="Error while installing pre onboarding inspector helm release") # Deleting all the stale resources that got created Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return @@ -158,13 +158,13 @@ def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_insta continue if (is_job_scheduled is False): - telemetry.set_exception(exception="Couldn't schedule pre onboarding inspector job in the cluster", fault_type=consts.Pre_Onboarding_Inspector_Job_Not_Scheduled, + telemetry.set_exception(exception="Couldn't schedule pre onboarding inspector job in the cluster", fault_type=consts.Cluster_Diagnostic_Checks_Job_Not_Scheduled, summary="Couldn't schedule pre onboarding inspector job in the cluster") logger.warning("Unable to schedule the pre onboarding inspector job in the kubernetes cluster. The possible reasons can be presence of a security policy or security context constraint (SCC) or it may happen becuase of lack of ResourceQuota.\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return elif (is_job_scheduled is True and is_job_complete is False): - telemetry.set_exception(exception="Couldn't complete pre onboarding inspector job after scheduling in the cluster", fault_type=consts.Pre_Onboarding_Inspector_Job_Not_Scheduled, + telemetry.set_exception(exception="Couldn't complete pre onboarding inspector job after scheduling in the cluster", fault_type=consts.Cluster_Diagnostic_Checks_Job_Not_Complete, summary="Couldn't complete pre onboarding inspector job after scheduling in the cluster") logger.warning("Unable to finish the pre onboarding inspector job in the kubernetes cluster. The possible reasons can be resource constraints on the cluster.\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) @@ -186,7 +186,7 @@ def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_insta except Exception as e: logger.warning("An exception has occured while trying to execute the pre onboarding inspector in the cluster. Exception: {}".format(str(e)) + "\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) - telemetry.set_exception(exception=e, fault_type=consts.Pre_Onboarding_Inspector_Failed_Fault_Type, summary="Error while executing Pre onboarding inspector Job") + telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Failed_Fault_Type, summary="Error while executing Pre onboarding inspector Job") return return preonboarding_inspector_container_log @@ -218,6 +218,6 @@ def helm_install_release_preonboarding_inspector(chart_path, http_proxy, https_p if response_helm_install.returncode != 0: if ('forbidden' in error_helm_install.decode("ascii") or 'timed out waiting for the condition' in error_helm_install.decode("ascii")): telemetry.set_user_fault() - telemetry.set_exception(exception=error_helm_install.decode("ascii"), fault_type=consts.Pre_Onboarding_Inspector_Install_HelmRelease_Fault_Type, + telemetry.set_exception(exception=error_helm_install.decode("ascii"), fault_type=consts.Cluster_Diagnostic_Checks_Install_HelmRelease_Fault_Type, summary='Unable to install pre onboarding inspector helm release') raise CLIInternalError("Unable to install pre onboarding inspector helm release: " + error_helm_install.decode("ascii")) diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 70cc9922a5f..51d929503cc 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -161,7 +161,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat raise ManualInterrupt('Process terminated externally.') if all_checks_passed is False: - raise ValidationError("One or more cluster diagnostic checks failed and hence the cluster cannot be onboarded. Please refer to the troubleshooting docs for the diagnostic checks and try onboarding again.") + raise ValidationError("One or more cluster diagnostic checks failed and hence the cluster cannot be onboarded. Please resolve them and try onboarding again.") required_node_exists = check_linux_amd64_node(node_api_response) if not required_node_exists: From cae9612bcfeb46d5862c9f4b0042fea2fb7336af Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Sun, 22 Jan 2023 14:17:01 +0530 Subject: [PATCH 41/62] modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py --- src/connectedk8s/azext_connectedk8s/_precheckutils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 0c0cfe7fb32..7c4b67829da 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -195,7 +195,7 @@ def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_insta def helm_install_release_preonboarding_inspector(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="60"): cmd_helm_install = [helm_client_location, "upgrade", "--install", "pre-onboarding-inspector", chart_path] # To set some other helm parameters through file - cmd_helm_install.extend(["--set", "global,location={}".format(location)]) + cmd_helm_install.extend(["--set", "global.location={}".format(location)]) if https_proxy: cmd_helm_install.extend(["--set", "global.httpsProxy={}".format(https_proxy)]) if http_proxy: From bb02d2bf6453f4e490b6f6f3b541cf93e9bdfc83 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Mon, 23 Jan 2023 11:09:55 +0530 Subject: [PATCH 42/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py modified: src/connectedk8s/azext_connectedk8s/_utils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- .../azext_connectedk8s/_constants.py | 10 +-- .../azext_connectedk8s/_precheckutils.py | 88 +++++++++---------- src/connectedk8s/azext_connectedk8s/_utils.py | 36 ++++---- src/connectedk8s/azext_connectedk8s/custom.py | 4 +- 4 files changed, 69 insertions(+), 69 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 5450695011e..5fdb338c778 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -180,11 +180,11 @@ Events_of_Incomplete_Diagnoser_Job = "diagnoser_failure_events.txt" # Connect Precheck Diagnoser constants Cluster_Diagnostic_Checks_Job_Registry_Path = "arck8sdiagnoser.azurecr.io/public/pre-onboarding-inspector:0.1.0" -Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type = "Error while installing pre onboarding inspector helm release" -Cluster_Diagnostic_Checks_Failed_Fault_Type = "Error occured while running pre onboarding inspector" -Cluster_Diagnostic_Checks_Install_HelmRelease_Fault_Type = 'pre-onboarding-inspector-helm-release-install-error' -Cluster_Diagnostic_Checks_Job_Not_Scheduled = 'Unable to schedule pre-onboarding-inspector job' -Cluster_Diagnostic_Checks_Job_Not_Complete = 'Unable to complete pre-onboarding-inspector job after scheduling' +Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type = "Error while installing cluster diagnostic checks helm release" +Cluster_Diagnostic_Checks_Failed_Fault_Type = "Error occured while running cluster diagnostic checks" +Cluster_Diagnostic_Checks_Install_HelmRelease_Fault_Type = 'cluster-diagnostic-checks-helm-release-install-error' +Cluster_Diagnostic_Checks_Job_Not_Scheduled = 'Unable to schedule cluster-diagnostic-checks job' +Cluster_Diagnostic_Checks_Job_Not_Complete = 'Unable to complete cluster-diagnostic-checks job after scheduling' # Diagnostic Results Name Outbound_Connectivity_Check_Result_String = "Outbound Network Connectivity Result:" DNS_Check_Result_String = "DNS Result:" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 7c4b67829da..585d865269c 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -45,21 +45,21 @@ # pylint: disable -def check_preonboarding_inspector_container(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert): +def cluster_diagnostic_checks_container(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert): try: # Setting DNS and Outbound Check as working dns_check = "Starting" outbound_connectivity_check = "Starting" - # Executing the pre onboarding inspector job and fetching the logs obtained - preonboarding_inspector_container_log = executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert) - # If preonboarding_inspector_container_log is not empty then only we will check for the results - if(preonboarding_inspector_container_log is not None and preonboarding_inspector_container_log != ""): - preonboarding_inspector_container_log_list = preonboarding_inspector_container_log.split("\n") - preonboarding_inspector_container_log_list.pop(-1) + # Executing the cluster_diagnostic_checks job and fetching the logs obtained + cluster_diagnostic_checks_container_log = executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert) + # If cluster_diagnostic_checks_container_log is not empty then only we will check for the results + if(cluster_diagnostic_checks_container_log is not None and cluster_diagnostic_checks_container_log != ""): + cluster_diagnostic_checks_container_log_list = cluster_diagnostic_checks_container_log.split("\n") + cluster_diagnostic_checks_container_log_list.pop(-1) dns_check_log = "" counter_container_logs = 1 - # For retrieving only preonboarding inspector logs from the inspector output - for outputs in preonboarding_inspector_container_log_list: + # For retrieving only cluster_diagnostic_checks logs from the output + for outputs in cluster_diagnostic_checks_container_log_list: if consts.Outbound_Connectivity_Check_Result_String in outputs: counter_container_logs = 1 elif consts.DNS_Check_Result_String in outputs: @@ -68,11 +68,11 @@ def check_preonboarding_inspector_container(corev1_api_instance, batchv1_api_ins elif counter_container_logs == 0: dns_check_log += " " + outputs dns_check = azext_utils.check_cluster_DNS(dns_check_log, True) - outbound_connectivity_check = azext_utils.check_cluster_outbound_connectivity(preonboarding_inspector_container_log_list[-1], True) + outbound_connectivity_check = azext_utils.check_cluster_outbound_connectivity(cluster_diagnostic_checks_container_log_list[-1], True) else: return consts.Diagnostic_Check_Incomplete - # If both the check passed then we will return pre onboarding inspector checks Passed + # If both the check passed then we will return cluster diagnostic checks Passed if(dns_check == consts.Diagnostic_Check_Passed and outbound_connectivity_check == consts.Diagnostic_Check_Passed): return consts.Diagnostic_Check_Passed # If any of the check remain Incomplete than we will return Incomplete @@ -83,18 +83,18 @@ def check_preonboarding_inspector_container(corev1_api_instance, batchv1_api_ins # To handle any exception that may occur during the execution except Exception as e: - logger.warning("An exception has occured while trying to perform pre onboarding inspector container on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Failed_Fault_Type, summary="Error occured while executing the pre onboarding inspector container") + logger.warning("An exception has occured while trying to perform cluster diagnostic checks container on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Failed_Fault_Type, summary="Error occured while executing the cluster diagnostic checks container") return consts.Diagnostic_Check_Incomplete -def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert): - job_name = "pre-onboarding-inspector-job" +def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert): + job_name = "cluster-diagnostic-checks-job" # Setting the log output as Empty - preonboarding_inspector_container_log = "" + cluster_diagnostic_checks_container_log = "" - cmd_helm_delete = [helm_client_location, "uninstall", "pre-onboarding-inspector"] + cmd_helm_delete = [helm_client_location, "uninstall", "cluster-diagnostic-checks"] if kube_config: cmd_helm_delete.extend(["--kubeconfig", kube_config]) if kube_context: @@ -102,9 +102,9 @@ def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_insta # To handle the user keyboard Interrupt try: - # Executing the pre onboarding inspector job yaml + # Executing the cluster diagnostic checks job yaml config.load_kube_config(kube_config, kube_context) - # Attempting deletion of pre onboarding inspector resources to handle the scenario if any stale resources are present + # Attempting deletion of cluster diagnostic checks resources to handle the scenario if any stale resources are present response_kubectl_delete_helm = Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) output_kubectl_delete_helm, error_kubectl_delete_helm = response_kubectl_delete_helm.communicate() # If any error occured while execution of delete command @@ -123,22 +123,22 @@ def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_insta exception_occured_counter = 1 # If any exception occured we will print the exception and return if exception_occured_counter == 1: - logger.warning("An error occured while installing the pre onboarding inspector helm release in the cluster. Exception:") - telemetry.set_exception(exception=error_kubectl_delete_helm.decode("ascii"), fault_type=consts.Cluster_Diagnostic_Checks_Failed_Fault_Type, summary="Error while executing pre onboarding inspector Job") + logger.warning("An error occured while installing the cluster diagnostic checks helm release in the cluster. Exception:") + telemetry.set_exception(exception=error_kubectl_delete_helm.decode("ascii"), fault_type=consts.Cluster_Diagnostic_Checks_Failed_Fault_Type, summary="Error while executing cluster diagnostic checks Job") return try: - chart_path = azext_utils.get_chart_path(consts.Cluster_Diagnostic_Checks_Job_Registry_Path, kube_config, kube_context, helm_client_location, 'ConnectPrecheckCharts') + chart_path = azext_utils.get_chart_path(consts.Cluster_Diagnostic_Checks_Job_Registry_Path, kube_config, kube_context, helm_client_location, 'cluster_diagnostic_checks') - helm_install_release_preonboarding_inspector(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location) + helm_install_release_cluster_diagnostic_checks(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location) # To handle the Exception that occured except Exception as e: - logger.warning("An error occured while installing helm release of pre onboarding inspector in the cluster. Exception:") + logger.warning("An error occured while installing helm release of cluster diagnostic checks in the cluster. Exception:") logger.warning(str(e)) - telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type, summary="Error while installing pre onboarding inspector helm release") + telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type, summary="Error while installing cluster diagnostic checks helm release") # Deleting all the stale resources that got created Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return - # Watching for pre onboarding inspector container to reach in completed stage + # Watching for cluster diagnostic checks container to reach in completed stage w = watch.Watch() is_job_complete = False is_job_scheduled = False @@ -146,10 +146,10 @@ def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_insta for event in w.stream(batchv1_api_instance.list_namespaced_job, namespace='default', label_selector="", timeout_seconds=60): try: # Checking if job get scheduled or not - if event["object"].metadata.name == "pre-onboarding-inspector-job": + if event["object"].metadata.name == "cluster-diagnostic-checks-job": is_job_scheduled = True # Checking if job reached completed stage or not - if event["object"].metadata.name == "pre-onboarding-inspector-job" and event["object"].status.conditions[0].type == "Complete": + if event["object"].metadata.name == "cluster-diagnostic-checks-job" and event["object"].status.conditions[0].type == "Complete": is_job_complete = True w.stop() except Exception as e: @@ -158,19 +158,19 @@ def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_insta continue if (is_job_scheduled is False): - telemetry.set_exception(exception="Couldn't schedule pre onboarding inspector job in the cluster", fault_type=consts.Cluster_Diagnostic_Checks_Job_Not_Scheduled, - summary="Couldn't schedule pre onboarding inspector job in the cluster") - logger.warning("Unable to schedule the pre onboarding inspector job in the kubernetes cluster. The possible reasons can be presence of a security policy or security context constraint (SCC) or it may happen becuase of lack of ResourceQuota.\n") + telemetry.set_exception(exception="Couldn't schedule cluster diagnostic checks job in the cluster", fault_type=consts.Cluster_Diagnostic_Checks_Job_Not_Scheduled, + summary="Couldn't schedule cluster diagnostic checks job in the cluster") + logger.warning("Unable to schedule the cluster diagnostic checks job in the kubernetes cluster. The possible reasons can be presence of a security policy or security context constraint (SCC) or it may happen becuase of lack of ResourceQuota.\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return elif (is_job_scheduled is True and is_job_complete is False): - telemetry.set_exception(exception="Couldn't complete pre onboarding inspector job after scheduling in the cluster", fault_type=consts.Cluster_Diagnostic_Checks_Job_Not_Complete, - summary="Couldn't complete pre onboarding inspector job after scheduling in the cluster") - logger.warning("Unable to finish the pre onboarding inspector job in the kubernetes cluster. The possible reasons can be resource constraints on the cluster.\n") + telemetry.set_exception(exception="Couldn't complete cluster diagnostic checks job after scheduling in the cluster", fault_type=consts.Cluster_Diagnostic_Checks_Job_Not_Complete, + summary="Couldn't complete cluster diagnostic checks job after scheduling in the cluster") + logger.warning("Unable to finish the cluster diagnostic checks job in the kubernetes cluster. The possible reasons can be resource constraints on the cluster.\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return else: - # Fetching the pre onboarding inspector Container logs + # Fetching the cluster diagnostic checks Container logs all_pods = corev1_api_instance.list_namespaced_pod('default') # Traversing through all agents for each_pod in all_pods.items: @@ -178,22 +178,22 @@ def executing_preonboarding_inspector_job(corev1_api_instance, batchv1_api_insta pod_name = each_pod.metadata.name if(pod_name.startswith(job_name)): # Creating a text file with the name of the container and adding that containers logs in it - preonboarding_inspector_container_log = corev1_api_instance.read_namespaced_pod_log(name=pod_name, container="pre-onboarding-inspector-container", namespace='default') - # Clearing all the resources after fetching the pre onboarding inspector container logs + cluster_diagnostic_checks_container_log = corev1_api_instance.read_namespaced_pod_log(name=pod_name, container="cluster-diagnostic-checks-container", namespace='default') + # Clearing all the resources after fetching the cluster diagnostic checks container logs Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) # To handle any exception that may occur during the execution except Exception as e: - logger.warning("An exception has occured while trying to execute the pre onboarding inspector in the cluster. Exception: {}".format(str(e)) + "\n") + logger.warning("An exception has occured while trying to execute the cluster diagnostic checks in the cluster. Exception: {}".format(str(e)) + "\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) - telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Failed_Fault_Type, summary="Error while executing Pre onboarding inspector Job") + telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Failed_Fault_Type, summary="Error while executing cluster diagnostic checks Job") return - return preonboarding_inspector_container_log + return cluster_diagnostic_checks_container_log -def helm_install_release_preonboarding_inspector(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="60"): - cmd_helm_install = [helm_client_location, "upgrade", "--install", "pre-onboarding-inspector", chart_path] +def helm_install_release_cluster_diagnostic_checks(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="60"): + cmd_helm_install = [helm_client_location, "upgrade", "--install", "cluster-diagnostic-checks", chart_path] # To set some other helm parameters through file cmd_helm_install.extend(["--set", "global.location={}".format(location)]) if https_proxy: @@ -220,5 +220,5 @@ def helm_install_release_preonboarding_inspector(chart_path, location, http_prox if ('forbidden' in error_helm_install.decode("ascii") or 'timed out waiting for the condition' in error_helm_install.decode("ascii")): telemetry.set_user_fault() telemetry.set_exception(exception=error_helm_install.decode("ascii"), fault_type=consts.Cluster_Diagnostic_Checks_Install_HelmRelease_Fault_Type, - summary='Unable to install pre onboarding inspector helm release') - raise CLIInternalError("Unable to install pre onboarding inspector helm release: " + error_helm_install.decode("ascii")) + summary='Unable to install cluster diagnostic checks helm release') + raise CLIInternalError("Unable to install cluster diagnostic checks helm release: " + error_helm_install.decode("ascii")) diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index 9aacb78ca02..c63eba46d52 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -72,7 +72,7 @@ def validate_location(cmd, location): def get_chart_path(registry_path, kube_config, kube_context, helm_client_location, chart_path_name='AzureArcCharts'): # Pulling helm chart from registry os.environ['HELM_EXPERIMENTAL_OCI'] = '1' - if chart_path_name == 'ConnectPrecheckCharts': + if chart_path_name == 'cluster_diagnostic_checks': pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, chart_path_name) else: pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, 'azure-arc') @@ -85,14 +85,14 @@ def get_chart_path(registry_path, kube_config, kube_context, helm_client_locatio except: logger.warning("Unable to cleanup the {} already present on the machine. In case of failure, please cleanup the directory '{}' and try again.".format(chart_path_name, chart_export_path)) - if chart_path_name == 'ConnectPrecheckCharts': + if chart_path_name == 'cluster_diagnostic_checks': export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, chart_path_name) else: export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, 'azure-arc') # Returning helm chart path - if chart_path_name == 'ConnectPrecheckCharts': - helm_chart_path = os.path.join(chart_export_path, 'pre-onboarding-inspector') + if chart_path_name == 'cluster_diagnostic_checks': + helm_chart_path = os.path.join(chart_export_path, 'cluster-diagnostic-checks') chart_path = helm_chart_path else: helm_chart_path = os.path.join(chart_export_path, 'azure-arc-k8sagents') @@ -129,12 +129,12 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex raise CLIInternalError("Unable to export {} helm chart from the registry '{}': ".format(chart_name, registry_path) + error_helm_chart_export.decode("ascii")) -def check_cluster_DNS(dns_check_log, for_preonboarding_checks=False, filepath_with_timestamp=None, storage_space_available=False): - if for_preonboarding_checks is False: +def check_cluster_DNS(dns_check_log, for_cluster_diagnostics_checks=False, filepath_with_timestamp=None, storage_space_available=False): + if for_cluster_diagnostics_checks is False: global diagnoser_output try: if consts.DNS_Check_Result_String not in dns_check_log: - if for_preonboarding_checks: + if for_cluster_diagnostics_checks: return consts.Diagnostic_Check_Incomplete else: return consts.Diagnostic_Check_Incomplete, storage_space_available @@ -142,7 +142,7 @@ def check_cluster_DNS(dns_check_log, for_preonboarding_checks=False, filepath_wi # Validating if DNS is working or not and displaying proper result if("NXDOMAIN" in formatted_dns_log or "connection timed out" in formatted_dns_log): logger.warning("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") - if for_preonboarding_checks: + if for_cluster_diagnostics_checks: telemetry.set_exception(exception="DNS not working in the cluster", fault_type=consts.DNS_Failed_Fault_Type, summary="DNS not working in the cluster") return consts.Diagnostic_Check_Failed @@ -154,7 +154,7 @@ def check_cluster_DNS(dns_check_log, for_preonboarding_checks=False, filepath_wi dns.write(formatted_dns_log + "\nWe found an issue with the DNS resolution on your cluster.") return consts.Diagnostic_Check_Failed, storage_space_available else: - if for_preonboarding_checks: + if for_cluster_diagnostics_checks: return consts.Diagnostic_Check_Passed else: if storage_space_available: @@ -166,30 +166,30 @@ def check_cluster_DNS(dns_check_log, for_preonboarding_checks=False, filepath_wi # To handle any exception that may occur during the execution except Exception as e: logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - if for_preonboarding_checks is False: + if for_cluster_diagnostics_checks is False: diagnoser_output.append("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing cluster DNS check") - if for_preonboarding_checks: + if for_cluster_diagnostics_checks: return consts.Diagnostic_Check_Incomplete else: return consts.Diagnostic_Check_Incomplete, storage_space_available -def check_cluster_outbound_connectivity(outbound_connectivity_check_log, for_preonboarding_checks=False, filepath_with_timestamp=None, storage_space_available=False): - if for_preonboarding_checks is False: +def check_cluster_outbound_connectivity(outbound_connectivity_check_log, for_cluster_diagnostics_checks=False, filepath_with_timestamp=None, storage_space_available=False): + if for_cluster_diagnostics_checks is False: global diagnoser_output try: outbound_connectivity_response = outbound_connectivity_check_log[-1:-4:-1] outbound_connectivity_response = outbound_connectivity_response[::-1] if consts.Outbound_Connectivity_Check_Result_String not in outbound_connectivity_check_log: - if for_preonboarding_checks: + if for_cluster_diagnostics_checks: return consts.Diagnostic_Check_Incomplete else: return consts.Diagnostic_Check_Incomplete, storage_space_available # Validating if outbound connectiivty is working or not and displaying proper result if(outbound_connectivity_response != "000"): - if for_preonboarding_checks: + if for_cluster_diagnostics_checks: return consts.Diagnostic_Check_Passed else: if storage_space_available: @@ -199,7 +199,7 @@ def check_cluster_outbound_connectivity(outbound_connectivity_check_log, for_pre return consts.Diagnostic_Check_Passed, storage_space_available else: logger.warning("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") - if for_preonboarding_checks: + if for_cluster_diagnostics_checks: telemetry.set_exception(exception="Failed outbound network connectivity from the cluster", fault_type=consts.Outbound_Connectivity_Failed_Fault_Type, summary="Failed outbound network connectivity from the cluster") return consts.Diagnostic_Check_Failed @@ -214,10 +214,10 @@ def check_cluster_outbound_connectivity(outbound_connectivity_check_log, for_pre # To handle any exception that may occur during the execution except Exception as e: logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - if for_preonboarding_checks is False: + if for_cluster_diagnostics_checks is False: diagnoser_output.append("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing outbound connectivity check in the cluster") - if for_preonboarding_checks: + if for_cluster_diagnostics_checks: return consts.Diagnostic_Check_Incomplete else: return consts.Diagnostic_Check_Incomplete, storage_space_available diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 1900a21b8ee..cd35c80c15c 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -145,8 +145,8 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat helm_client_location = install_helm_client() diagnostic_checks = "Failed" batchv1_api_instance = kube_client.BatchV1Api() - # Performing pre onboarding inspector container check - diagnostic_checks = precheckutils.check_preonboarding_inspector_container(api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert) + # Performing cluster-diagnostic-checks + diagnostic_checks = precheckutils.cluster_diagnostic_checks_container(api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert) # If all the checks passed then display no error found all_checks_passed = True # for checks in diagnostic_checks: From fa8846a04846495048f8e3a520497636e6d55c88 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Mon, 23 Jan 2023 11:42:37 +0530 Subject: [PATCH 43/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py --- src/connectedk8s/azext_connectedk8s/_constants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 5fdb338c778..319723f7611 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -135,8 +135,8 @@ Diagnoser_Container_Check_Failed_Fault_Type = "Error occured while performing the diagnoser container checks" Cluster_DNS_Check_Fault_Type = "Error occured while performing cluster DNS check" Outbound_Connectivity_Check_Fault_Type = "Error occured while performing outbound connectivity check in the cluster" -Outbound_Connectivity_Failed_Fault_Type = "Outbound network connectivity failed in onboarding pre-checks" -DNS_Failed_Fault_Type = "DNS resolution failed in onboarding pre-checks" +Outbound_Connectivity_Failed_Fault_Type = "Outbound network connectivity failed in cluster diagnostic checks" +DNS_Failed_Fault_Type = "DNS resolution failed in cluster diagnostic checks" MSI_Cert_Check_Fault_Type = "Error occurred while trying to perform MSI ceritificate presence check" Cluster_Security_Policy_Check_Fault_Type = "Error occured while performing cluster security policy check" KAP_Cert_Check_Fault_Type = "Error occurred while trying to perform KAP ceritificate presence check" From f4717dea6499b5c711fb6955f150d8818b35d018 Mon Sep 17 00:00:00 2001 From: Siri Teja Reddy Kasireddy Date: Mon, 23 Jan 2023 12:38:02 +0530 Subject: [PATCH 44/62] some naming changes --- .../azext_connectedk8s/_precheckutils.py | 6 +++--- src/connectedk8s/azext_connectedk8s/custom.py | 14 ++++---------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 585d865269c..181a626d835 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -45,13 +45,13 @@ # pylint: disable -def cluster_diagnostic_checks_container(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert): +def fetch_diagnostic_checks_results(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert): try: # Setting DNS and Outbound Check as working dns_check = "Starting" outbound_connectivity_check = "Starting" # Executing the cluster_diagnostic_checks job and fetching the logs obtained - cluster_diagnostic_checks_container_log = executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert) + cluster_diagnostic_checks_container_log = executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert) # If cluster_diagnostic_checks_container_log is not empty then only we will check for the results if(cluster_diagnostic_checks_container_log is not None and cluster_diagnostic_checks_container_log != ""): cluster_diagnostic_checks_container_log_list = cluster_diagnostic_checks_container_log.split("\n") @@ -89,7 +89,7 @@ def cluster_diagnostic_checks_container(corev1_api_instance, batchv1_api_instanc return consts.Diagnostic_Check_Incomplete -def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert): +def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert): job_name = "cluster-diagnostic-checks-job" # Setting the log output as Empty cluster_diagnostic_checks_container_log = "" diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index cd35c80c15c..08a7bdca88d 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -140,28 +140,22 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat # Pre onboarding checks try: - absolute_path = os.path.abspath(os.path.dirname(__file__)) kubectl_client_location = install_kubectl_client() helm_client_location = install_helm_client() diagnostic_checks = "Failed" batchv1_api_instance = kube_client.BatchV1Api() # Performing cluster-diagnostic-checks - diagnostic_checks = precheckutils.cluster_diagnostic_checks_container(api_instance, batchv1_api_instance, absolute_path, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert) - # If all the checks passed then display no error found - all_checks_passed = True - # for checks in diagnostic_checks: - if diagnostic_checks != consts.Diagnostic_Check_Passed: - all_checks_passed = False + diagnostic_checks = precheckutils.fetch_diagnostic_checks_results(api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert) except Exception as e: - logger.warning("Exception occured : {}".format(str(e))) + logger.warning("An exception has occured while trying to execute pre-onboarding diagnostic checks : {}".format(str(e))) # Handling the user manual interrupt except KeyboardInterrupt: raise ManualInterrupt('Process terminated externally.') - if all_checks_passed is False: - raise ValidationError("One or more cluster diagnostic checks failed and hence the cluster cannot be onboarded. Please resolve them and try onboarding again.") + if diagnostic_checks != consts.Diagnostic_Check_Passed: + raise ValidationError("One or more pre-onboarding diagnostic checks failed and hence not proceeding with cluster onboarding. Please resolve them and try onboarding again.") required_node_exists = check_linux_amd64_node(node_api_response) if not required_node_exists: From df6e66c5997fe9f85e264bd8e2ab66d5bdb2450e Mon Sep 17 00:00:00 2001 From: Siri Teja Reddy Kasireddy Date: Mon, 23 Jan 2023 12:48:36 +0530 Subject: [PATCH 45/62] fix clusterDNS and network check in utils --- .../azext_connectedk8s/_precheckutils.py | 4 +- src/connectedk8s/azext_connectedk8s/_utils.py | 163 +++++++++--------- 2 files changed, 80 insertions(+), 87 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 181a626d835..aa0cd21198c 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -67,8 +67,8 @@ def fetch_diagnostic_checks_results(corev1_api_instance, batchv1_api_instance, h counter_container_logs = 0 elif counter_container_logs == 0: dns_check_log += " " + outputs - dns_check = azext_utils.check_cluster_DNS(dns_check_log, True) - outbound_connectivity_check = azext_utils.check_cluster_outbound_connectivity(cluster_diagnostic_checks_container_log_list[-1], True) + dns_check, _ = azext_utils.check_cluster_DNS(dns_check_log, True) + outbound_connectivity_check, _ = azext_utils.check_cluster_outbound_connectivity(cluster_diagnostic_checks_container_log_list[-1], True) else: return consts.Diagnostic_Check_Incomplete diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index c63eba46d52..79a41d811a3 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -129,98 +129,91 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex raise CLIInternalError("Unable to export {} helm chart from the registry '{}': ".format(chart_name, registry_path) + error_helm_chart_export.decode("ascii")) -def check_cluster_DNS(dns_check_log, for_cluster_diagnostics_checks=False, filepath_with_timestamp=None, storage_space_available=False): - if for_cluster_diagnostics_checks is False: - global diagnoser_output - try: - if consts.DNS_Check_Result_String not in dns_check_log: - if for_cluster_diagnostics_checks: - return consts.Diagnostic_Check_Incomplete - else: - return consts.Diagnostic_Check_Incomplete, storage_space_available - formatted_dns_log = dns_check_log.replace('\t', '') - # Validating if DNS is working or not and displaying proper result - if("NXDOMAIN" in formatted_dns_log or "connection timed out" in formatted_dns_log): - logger.warning("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") - if for_cluster_diagnostics_checks: - telemetry.set_exception(exception="DNS not working in the cluster", fault_type=consts.DNS_Failed_Fault_Type, - summary="DNS not working in the cluster") - return consts.Diagnostic_Check_Failed - else: - diagnoser_output.append("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") - if storage_space_available: - dns_check_path = os.path.join(filepath_with_timestamp, consts.DNS_Check) - with open(dns_check_path, 'w+') as dns: - dns.write(formatted_dns_log + "\nWe found an issue with the DNS resolution on your cluster.") - return consts.Diagnostic_Check_Failed, storage_space_available - else: - if for_cluster_diagnostics_checks: - return consts.Diagnostic_Check_Passed - else: - if storage_space_available: - dns_check_path = os.path.join(filepath_with_timestamp, consts.DNS_Check) - with open(dns_check_path, 'w+') as dns: - dns.write(formatted_dns_log + "\nCluster DNS check passed successfully.") - return consts.Diagnostic_Check_Passed, storage_space_available - - # To handle any exception that may occur during the execution - except Exception as e: +def check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available): + + global diagnoser_output + try: + if consts.DNS_Check_Result_String not in dns_check_log: + return consts.Diagnostic_Check_Incomplete, storage_space_available + formatted_dns_log = dns_check_log.replace('\t', '') + # Validating if DNS is working or not and displaying proper result + if("NXDOMAIN" in formatted_dns_log or "connection timed out" in formatted_dns_log): + logger.warning("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") + diagnoser_output.append("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") + if storage_space_available: + dns_check_path = os.path.join(filepath_with_timestamp, consts.DNS_Check) + with open(dns_check_path, 'w+') as dns: + dns.write(formatted_dns_log + "\nWe found an issue with the DNS resolution on your cluster.") + return consts.Diagnostic_Check_Failed, storage_space_available + else: + if storage_space_available: + dns_check_path = os.path.join(filepath_with_timestamp, consts.DNS_Check) + with open(dns_check_path, 'w+') as dns: + dns.write(formatted_dns_log + "\nCluster DNS check passed successfully.") + return consts.Diagnostic_Check_Passed, storage_space_available + + # For handling storage or OS exception that may occur during the execution + except OSError as e: + if "[Errno 28]" in str(e): + storage_space_available = False + telemetry.set_exception(exception=e, fault_type=consts.No_Storage_Space_Available_Fault_Type, summary="No space left on device") + shutil.rmtree(filepath_with_timestamp, ignore_errors=False, onerror=None) + else: logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - if for_cluster_diagnostics_checks is False: - diagnoser_output.append("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing cluster DNS check") + diagnoser_output.append("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") - if for_cluster_diagnostics_checks: - return consts.Diagnostic_Check_Incomplete - else: - return consts.Diagnostic_Check_Incomplete, storage_space_available + # To handle any exception that may occur during the execution + except Exception as e: + logger.warning("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Cluster_DNS_Check_Fault_Type, summary="Error occured while performing cluster DNS check") + diagnoser_output.append("An exception has occured while performing the DNS check on the cluster. Exception: {}".format(str(e)) + "\n") + return consts.Diagnostic_Check_Incomplete, storage_space_available -def check_cluster_outbound_connectivity(outbound_connectivity_check_log, for_cluster_diagnostics_checks=False, filepath_with_timestamp=None, storage_space_available=False): - if for_cluster_diagnostics_checks is False: - global diagnoser_output - try: - outbound_connectivity_response = outbound_connectivity_check_log[-1:-4:-1] - outbound_connectivity_response = outbound_connectivity_response[::-1] - if consts.Outbound_Connectivity_Check_Result_String not in outbound_connectivity_check_log: - if for_cluster_diagnostics_checks: - return consts.Diagnostic_Check_Incomplete - else: - return consts.Diagnostic_Check_Incomplete, storage_space_available - # Validating if outbound connectiivty is working or not and displaying proper result - if(outbound_connectivity_response != "000"): - if for_cluster_diagnostics_checks: - return consts.Diagnostic_Check_Passed - else: - if storage_space_available: - outbound_connectivity_check_path = os.path.join(filepath_with_timestamp, consts.Outbound_Network_Connectivity_Check) - with open(outbound_connectivity_check_path, 'w+') as outbound: - outbound.write("Response code " + outbound_connectivity_response + "\nOutbound network connectivity check passed successfully.") - return consts.Diagnostic_Check_Passed, storage_space_available - else: - logger.warning("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") - if for_cluster_diagnostics_checks: - telemetry.set_exception(exception="Failed outbound network connectivity from the cluster", fault_type=consts.Outbound_Connectivity_Failed_Fault_Type, - summary="Failed outbound network connectivity from the cluster") - return consts.Diagnostic_Check_Failed - else: - diagnoser_output.append("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") - if storage_space_available: - outbound_connectivity_check_path = os.path.join(filepath_with_timestamp, consts.Outbound_Network_Connectivity_Check) - with open(outbound_connectivity_check_path, 'w+') as outbound: - outbound.write("Response code " + outbound_connectivity_response + "\nWe found an issue with Outbound network connectivity from the cluster.") - return consts.Diagnostic_Check_Failed, storage_space_available - - # To handle any exception that may occur during the execution - except Exception as e: + +def check_cluster_outbound_connectivity(outbound_connectivity_check_log, filepath_with_timestamp, storage_space_available): + + global diagnoser_output + try: + outbound_connectivity_response = outbound_connectivity_check_log[-1:-4:-1] + outbound_connectivity_response = outbound_connectivity_response[::-1] + if consts.Outbound_Connectivity_Check_Result_String not in outbound_connectivity_check_log: + return consts.Diagnostic_Check_Incomplete, storage_space_available + # Validating if outbound connectiivty is working or not and displaying proper result + if(outbound_connectivity_response != "000"): + if storage_space_available: + outbound_connectivity_check_path = os.path.join(filepath_with_timestamp, consts.Outbound_Network_Connectivity_Check) + with open(outbound_connectivity_check_path, 'w+') as outbound: + outbound.write("Response code " + outbound_connectivity_response + "\nOutbound network connectivity check passed successfully.") + return consts.Diagnostic_Check_Passed, storage_space_available + else: + logger.warning("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") + diagnoser_output.append("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") + if storage_space_available: + outbound_connectivity_check_path = os.path.join(filepath_with_timestamp, consts.Outbound_Network_Connectivity_Check) + with open(outbound_connectivity_check_path, 'w+') as outbound: + outbound.write("Response code " + outbound_connectivity_response + "\nWe found an issue with Outbound network connectivity from the cluster.") + return consts.Diagnostic_Check_Failed, storage_space_available + + # For handling storage or OS exception that may occur during the execution + except OSError as e: + if "[Errno 28]" in str(e): + storage_space_available = False + telemetry.set_exception(exception=e, fault_type=consts.No_Storage_Space_Available_Fault_Type, summary="No space left on device") + shutil.rmtree(filepath_with_timestamp, ignore_errors=False, onerror=None) + else: logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") - if for_cluster_diagnostics_checks is False: - diagnoser_output.append("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing outbound connectivity check in the cluster") - if for_cluster_diagnostics_checks: - return consts.Diagnostic_Check_Incomplete - else: - return consts.Diagnostic_Check_Incomplete, storage_space_available + diagnoser_output.append("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") + + # To handle any exception that may occur during the execution + except Exception as e: + logger.warning("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Outbound_Connectivity_Check_Fault_Type, summary="Error occured while performing outbound connectivity check in the cluster") + diagnoser_output.append("An exception has occured while performing the outbound connectivity check on the cluster. Exception: {}".format(str(e)) + "\n") + + return consts.Diagnostic_Check_Incomplete, storage_space_available def add_helm_repo(kube_config, kube_context, helm_client_location): From 6d2b31f3d446af1dad848ad4f735546449e1c569 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Mon, 23 Jan 2023 15:15:18 +0530 Subject: [PATCH 46/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py modified: src/connectedk8s/azext_connectedk8s/_troubleshootutils.py modified: src/connectedk8s/azext_connectedk8s/_utils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- .../azext_connectedk8s/_constants.py | 5 +-- .../azext_connectedk8s/_precheckutils.py | 31 +++++++------------ .../azext_connectedk8s/_troubleshootutils.py | 4 +-- src/connectedk8s/azext_connectedk8s/_utils.py | 12 +++---- src/connectedk8s/azext_connectedk8s/custom.py | 3 ++ 5 files changed, 26 insertions(+), 29 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 319723f7611..1e40107e8b1 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -181,10 +181,11 @@ # Connect Precheck Diagnoser constants Cluster_Diagnostic_Checks_Job_Registry_Path = "arck8sdiagnoser.azurecr.io/public/pre-onboarding-inspector:0.1.0" Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type = "Error while installing cluster diagnostic checks helm release" -Cluster_Diagnostic_Checks_Failed_Fault_Type = "Error occured while running cluster diagnostic checks" -Cluster_Diagnostic_Checks_Install_HelmRelease_Fault_Type = 'cluster-diagnostic-checks-helm-release-install-error' +Cluster_Diagnostic_Checks_Execution_Failed_Fault_Type = "Error occured while executing cluster diagnostic checks" +Cluster_Diagnostic_Checks_Release_Cleanup_Failed = "Error occured while cleaning up the cluster diagnostic checks helm release" Cluster_Diagnostic_Checks_Job_Not_Scheduled = 'Unable to schedule cluster-diagnostic-checks job' Cluster_Diagnostic_Checks_Job_Not_Complete = 'Unable to complete cluster-diagnostic-checks job after scheduling' +Pre_Onboarding_Diagnostic_Checks_Execution_Failed= 'Exception occured while trying to execute pre-onboarding diagnostic checks' # Diagnostic Results Name Outbound_Connectivity_Check_Result_String = "Outbound Network Connectivity Result:" DNS_Check_Result_String = "DNS Result:" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index aa0cd21198c..938298bee3a 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -67,8 +67,8 @@ def fetch_diagnostic_checks_results(corev1_api_instance, batchv1_api_instance, h counter_container_logs = 0 elif counter_container_logs == 0: dns_check_log += " " + outputs - dns_check, _ = azext_utils.check_cluster_DNS(dns_check_log, True) - outbound_connectivity_check, _ = azext_utils.check_cluster_outbound_connectivity(cluster_diagnostic_checks_container_log_list[-1], True) + dns_check, _ = azext_utils.check_cluster_DNS(dns_check_log, False) + outbound_connectivity_check, _ = azext_utils.check_cluster_outbound_connectivity(cluster_diagnostic_checks_container_log_list[-1], False) else: return consts.Diagnostic_Check_Incomplete @@ -83,8 +83,8 @@ def fetch_diagnostic_checks_results(corev1_api_instance, batchv1_api_instance, h # To handle any exception that may occur during the execution except Exception as e: - logger.warning("An exception has occured while trying to perform cluster diagnostic checks container on the cluster. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Failed_Fault_Type, summary="Error occured while executing the cluster diagnostic checks container") + logger.warning("An exception has occured while trying to execute cluster diagnostic checks container on the cluster. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Execution_Failed_Fault_Type, summary="Error occured while executing the cluster diagnostic checks container") return consts.Diagnostic_Check_Incomplete @@ -124,20 +124,13 @@ def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_ins # If any exception occured we will print the exception and return if exception_occured_counter == 1: logger.warning("An error occured while installing the cluster diagnostic checks helm release in the cluster. Exception:") - telemetry.set_exception(exception=error_kubectl_delete_helm.decode("ascii"), fault_type=consts.Cluster_Diagnostic_Checks_Failed_Fault_Type, summary="Error while executing cluster diagnostic checks Job") + telemetry.set_exception(exception=error_kubectl_delete_helm.decode("ascii"), fault_type=consts.Cluster_Diagnostic_Checks_Release_Cleanup_Failed, summary="Error while executing cluster diagnostic checks Job") return - try: - chart_path = azext_utils.get_chart_path(consts.Cluster_Diagnostic_Checks_Job_Registry_Path, kube_config, kube_context, helm_client_location, 'cluster_diagnostic_checks') - - helm_install_release_cluster_diagnostic_checks(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location) - # To handle the Exception that occured - except Exception as e: - logger.warning("An error occured while installing helm release of cluster diagnostic checks in the cluster. Exception:") - logger.warning(str(e)) - telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type, summary="Error while installing cluster diagnostic checks helm release") - # Deleting all the stale resources that got created - Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) - return + + chart_path = azext_utils.get_chart_path(consts.Cluster_Diagnostic_Checks_Job_Registry_Path, kube_config, kube_context, helm_client_location, 'cluster_diagnostic_checks') + + helm_install_release_cluster_diagnostic_checks(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location) + # Watching for cluster diagnostic checks container to reach in completed stage w = watch.Watch() is_job_complete = False @@ -186,7 +179,7 @@ def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_ins except Exception as e: logger.warning("An exception has occured while trying to execute the cluster diagnostic checks in the cluster. Exception: {}".format(str(e)) + "\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) - telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Failed_Fault_Type, summary="Error while executing cluster diagnostic checks Job") + telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Execution_Failed_Fault_Type, summary="Error while executing cluster diagnostic checks Job") return return cluster_diagnostic_checks_container_log @@ -219,6 +212,6 @@ def helm_install_release_cluster_diagnostic_checks(chart_path, location, http_pr if response_helm_install.returncode != 0: if ('forbidden' in error_helm_install.decode("ascii") or 'timed out waiting for the condition' in error_helm_install.decode("ascii")): telemetry.set_user_fault() - telemetry.set_exception(exception=error_helm_install.decode("ascii"), fault_type=consts.Cluster_Diagnostic_Checks_Install_HelmRelease_Fault_Type, + telemetry.set_exception(exception=error_helm_install.decode("ascii"), fault_type=consts.Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type, summary='Unable to install cluster diagnostic checks helm release') raise CLIInternalError("Unable to install cluster diagnostic checks helm release: " + error_helm_install.decode("ascii")) diff --git a/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py b/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py index 1fed211f02d..c5958cc1507 100644 --- a/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py +++ b/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py @@ -494,8 +494,8 @@ def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, filepat counter_container_logs = 0 elif counter_container_logs == 0: dns_check_log += " " + outputs - dns_check, storage_space_available = azext_utils.check_cluster_DNS(dns_check_log, False, filepath_with_timestamp, storage_space_available) - outbound_connectivity_check, storage_space_available = azext_utils.check_cluster_outbound_connectivity(diagnoser_container_log_list[-1], False, filepath_with_timestamp, storage_space_available) + dns_check, storage_space_available = azext_utils.check_cluster_DNS(dns_check_log, True, filepath_with_timestamp, storage_space_available) + outbound_connectivity_check, storage_space_available = azext_utils.check_cluster_outbound_connectivity(diagnoser_container_log_list[-1], True, filepath_with_timestamp, storage_space_available) else: return consts.Diagnostic_Check_Incomplete, storage_space_available diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index 79a41d811a3..f00a9d106f7 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -129,7 +129,7 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex raise CLIInternalError("Unable to export {} helm chart from the registry '{}': ".format(chart_name, registry_path) + error_helm_chart_export.decode("ascii")) -def check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available): +def check_cluster_DNS(dns_check_log, store_logs, filepath_with_timestamp=None, storage_space_available=False): global diagnoser_output try: @@ -140,13 +140,13 @@ def check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_avai if("NXDOMAIN" in formatted_dns_log or "connection timed out" in formatted_dns_log): logger.warning("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") diagnoser_output.append("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") - if storage_space_available: + if storage_space_available and store_logs : dns_check_path = os.path.join(filepath_with_timestamp, consts.DNS_Check) with open(dns_check_path, 'w+') as dns: dns.write(formatted_dns_log + "\nWe found an issue with the DNS resolution on your cluster.") return consts.Diagnostic_Check_Failed, storage_space_available else: - if storage_space_available: + if storage_space_available and store_logs : dns_check_path = os.path.join(filepath_with_timestamp, consts.DNS_Check) with open(dns_check_path, 'w+') as dns: dns.write(formatted_dns_log + "\nCluster DNS check passed successfully.") @@ -172,7 +172,7 @@ def check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_avai return consts.Diagnostic_Check_Incomplete, storage_space_available -def check_cluster_outbound_connectivity(outbound_connectivity_check_log, filepath_with_timestamp, storage_space_available): +def check_cluster_outbound_connectivity(outbound_connectivity_check_log, store_logs, filepath_with_timestamp=None, storage_space_available=False): global diagnoser_output try: @@ -182,7 +182,7 @@ def check_cluster_outbound_connectivity(outbound_connectivity_check_log, filepat return consts.Diagnostic_Check_Incomplete, storage_space_available # Validating if outbound connectiivty is working or not and displaying proper result if(outbound_connectivity_response != "000"): - if storage_space_available: + if storage_space_available and store_logs : outbound_connectivity_check_path = os.path.join(filepath_with_timestamp, consts.Outbound_Network_Connectivity_Check) with open(outbound_connectivity_check_path, 'w+') as outbound: outbound.write("Response code " + outbound_connectivity_response + "\nOutbound network connectivity check passed successfully.") @@ -190,7 +190,7 @@ def check_cluster_outbound_connectivity(outbound_connectivity_check_log, filepat else: logger.warning("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") diagnoser_output.append("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") - if storage_space_available: + if storage_space_available and store_logs : outbound_connectivity_check_path = os.path.join(filepath_with_timestamp, consts.Outbound_Network_Connectivity_Check) with open(outbound_connectivity_check_path, 'w+') as outbound: outbound.write("Response code " + outbound_connectivity_response + "\nWe found an issue with Outbound network connectivity from the cluster.") diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 08a7bdca88d..b782fb37e29 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -148,7 +148,10 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat diagnostic_checks = precheckutils.fetch_diagnostic_checks_results(api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert) except Exception as e: + telemetry.set_exception(exception="An exception has occured while trying to execute pre-onboarding diagnostic checks : {}".format(str(e)), + fault_type=consts.Pre_Onboarding_Diagnostic_Checks_Execution_Failed, summary="An exception has occured while trying to execute pre-onboarding diagnostic checks : {}".format(str(e))) logger.warning("An exception has occured while trying to execute pre-onboarding diagnostic checks : {}".format(str(e))) + return # Handling the user manual interrupt except KeyboardInterrupt: From 5ca78bb0607f9bf75eb00cf20905b8b0659267df Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Mon, 23 Jan 2023 15:59:08 +0530 Subject: [PATCH 47/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py --- src/connectedk8s/azext_connectedk8s/_constants.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 1e40107e8b1..2137e0e3505 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -179,7 +179,8 @@ Outbound_Network_Connectivity_Check = "outbound_network_connectivity_check.txt" Events_of_Incomplete_Diagnoser_Job = "diagnoser_failure_events.txt" # Connect Precheck Diagnoser constants -Cluster_Diagnostic_Checks_Job_Registry_Path = "arck8sdiagnoser.azurecr.io/public/pre-onboarding-inspector:0.1.0" +# Cluster_Diagnostic_Checks_Job_Registry_Path = "arck8sdiagnoser.azurecr.io/public/pre-onboarding-inspector:0.1.0" +Cluster_Diagnostic_Checks_Job_Registry_Path = "arck8sdiagnoser.azurecr.io/public/cluster-diagnostics-checks:0.1.0" Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type = "Error while installing cluster diagnostic checks helm release" Cluster_Diagnostic_Checks_Execution_Failed_Fault_Type = "Error occured while executing cluster diagnostic checks" Cluster_Diagnostic_Checks_Release_Cleanup_Failed = "Error occured while cleaning up the cluster diagnostic checks helm release" From d715923f2480e8b933c16ad7bbccd6ec6bc377c7 Mon Sep 17 00:00:00 2001 From: Siri Teja Reddy Kasireddy Date: Mon, 23 Jan 2023 16:09:50 +0530 Subject: [PATCH 48/62] install precheck chart in azure-arc-release --- src/connectedk8s/azext_connectedk8s/_precheckutils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 938298bee3a..7ac08e13362 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -186,7 +186,7 @@ def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_ins def helm_install_release_cluster_diagnostic_checks(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="60"): - cmd_helm_install = [helm_client_location, "upgrade", "--install", "cluster-diagnostic-checks", chart_path] + cmd_helm_install = [helm_client_location, "upgrade", "--install", "cluster-diagnostic-checks", chart_path, "--namespace", "{}".format(consts.Release_Install_Namespace), "--create-namespace", "--output", "json"] # To set some other helm parameters through file cmd_helm_install.extend(["--set", "global.location={}".format(location)]) if https_proxy: From 48c8a032ef8e492bbb4bdf46546bac5a238c6391 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Tue, 24 Jan 2023 14:46:23 +0530 Subject: [PATCH 49/62] modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py modified: src/connectedk8s/azext_connectedk8s/_troubleshootutils.py modified: src/connectedk8s/azext_connectedk8s/_utils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- .../azext_connectedk8s/_precheckutils.py | 37 +++-- .../azext_connectedk8s/_troubleshootutils.py | 133 ++++++------------ src/connectedk8s/azext_connectedk8s/_utils.py | 125 ++++++++++++++-- src/connectedk8s/azext_connectedk8s/custom.py | 40 +++++- 4 files changed, 220 insertions(+), 115 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 7ac08e13362..0be6a23183d 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -44,8 +44,13 @@ # pylint: disable=unused-argument, too-many-locals, too-many-branches, too-many-statements, line-too-long # pylint: disable +diagnoser_output = [] -def fetch_diagnostic_checks_results(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert): +def initialize_diagnoser_output(): + global diagnoser_output + +def fetch_diagnostic_checks_results(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, filepath_with_timestamp, storage_space_available): + global diagnoser_output try: # Setting DNS and Outbound Check as working dns_check = "Starting" @@ -67,26 +72,28 @@ def fetch_diagnostic_checks_results(corev1_api_instance, batchv1_api_instance, h counter_container_logs = 0 elif counter_container_logs == 0: dns_check_log += " " + outputs - dns_check, _ = azext_utils.check_cluster_DNS(dns_check_log, False) - outbound_connectivity_check, _ = azext_utils.check_cluster_outbound_connectivity(cluster_diagnostic_checks_container_log_list[-1], False) + # dns_check, storage_space_available = azext_utils.check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available, True) + # outbound_connectivity_check, storage_space_available = azext_utils.check_cluster_outbound_connectivity(cluster_diagnostic_checks_container_log_list[-1], filepath_with_timestamp, storage_space_available, True) + dns_check, storage_space_available = azext_utils.check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available, diagnoser_output) + outbound_connectivity_check, storage_space_available = azext_utils.check_cluster_outbound_connectivity(cluster_diagnostic_checks_container_log_list[-1], filepath_with_timestamp, storage_space_available, diagnoser_output) else: - return consts.Diagnostic_Check_Incomplete + return consts.Diagnostic_Check_Incomplete, storage_space_available # If both the check passed then we will return cluster diagnostic checks Passed if(dns_check == consts.Diagnostic_Check_Passed and outbound_connectivity_check == consts.Diagnostic_Check_Passed): - return consts.Diagnostic_Check_Passed + return consts.Diagnostic_Check_Passed, storage_space_available # If any of the check remain Incomplete than we will return Incomplete elif(dns_check == consts.Diagnostic_Check_Incomplete or outbound_connectivity_check == consts.Diagnostic_Check_Incomplete): - return consts.Diagnostic_Check_Incomplete + return consts.Diagnostic_Check_Incomplete, storage_space_available else: - return consts.Diagnostic_Check_Failed + return consts.Diagnostic_Check_Failed, storage_space_available # To handle any exception that may occur during the execution except Exception as e: logger.warning("An exception has occured while trying to execute cluster diagnostic checks container on the cluster. Exception: {}".format(str(e)) + "\n") telemetry.set_exception(exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Execution_Failed_Fault_Type, summary="Error occured while executing the cluster diagnostic checks container") - return consts.Diagnostic_Check_Incomplete + return consts.Diagnostic_Check_Incomplete, storage_space_available def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert): @@ -94,7 +101,7 @@ def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_ins # Setting the log output as Empty cluster_diagnostic_checks_container_log = "" - cmd_helm_delete = [helm_client_location, "uninstall", "cluster-diagnostic-checks"] + cmd_helm_delete = [helm_client_location, "uninstall", "cluster-diagnostic-checks", "-n", "azure-arc-release"] if kube_config: cmd_helm_delete.extend(["--kubeconfig", kube_config]) if kube_context: @@ -136,7 +143,7 @@ def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_ins is_job_complete = False is_job_scheduled = False # To watch for changes in the pods states till it reach completed state or exit if it takes more than 180 seconds - for event in w.stream(batchv1_api_instance.list_namespaced_job, namespace='default', label_selector="", timeout_seconds=60): + for event in w.stream(batchv1_api_instance.list_namespaced_job, namespace='azure-arc-release', label_selector="", timeout_seconds=60): try: # Checking if job get scheduled or not if event["object"].metadata.name == "cluster-diagnostic-checks-job": @@ -154,26 +161,26 @@ def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_ins telemetry.set_exception(exception="Couldn't schedule cluster diagnostic checks job in the cluster", fault_type=consts.Cluster_Diagnostic_Checks_Job_Not_Scheduled, summary="Couldn't schedule cluster diagnostic checks job in the cluster") logger.warning("Unable to schedule the cluster diagnostic checks job in the kubernetes cluster. The possible reasons can be presence of a security policy or security context constraint (SCC) or it may happen becuase of lack of ResourceQuota.\n") - Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) + # Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return elif (is_job_scheduled is True and is_job_complete is False): telemetry.set_exception(exception="Couldn't complete cluster diagnostic checks job after scheduling in the cluster", fault_type=consts.Cluster_Diagnostic_Checks_Job_Not_Complete, summary="Couldn't complete cluster diagnostic checks job after scheduling in the cluster") logger.warning("Unable to finish the cluster diagnostic checks job in the kubernetes cluster. The possible reasons can be resource constraints on the cluster.\n") - Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) + # Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return else: # Fetching the cluster diagnostic checks Container logs - all_pods = corev1_api_instance.list_namespaced_pod('default') + all_pods = corev1_api_instance.list_namespaced_pod('azure-arc-release') # Traversing through all agents for each_pod in all_pods.items: # Fetching the current Pod name and creating a folder with that name inside the timestamp folder pod_name = each_pod.metadata.name if(pod_name.startswith(job_name)): # Creating a text file with the name of the container and adding that containers logs in it - cluster_diagnostic_checks_container_log = corev1_api_instance.read_namespaced_pod_log(name=pod_name, container="cluster-diagnostic-checks-container", namespace='default') + cluster_diagnostic_checks_container_log = corev1_api_instance.read_namespaced_pod_log(name=pod_name, container="cluster-diagnostic-checks-container", namespace='azure-arc-release') # Clearing all the resources after fetching the cluster diagnostic checks container logs - Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) + # Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) # To handle any exception that may occur during the execution except Exception as e: diff --git a/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py b/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py index c5958cc1507..d4ebb9bee74 100644 --- a/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py +++ b/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py @@ -23,49 +23,6 @@ diagnoser_output = [] -def create_folder_diagnosticlogs(time_stamp): - - global diagnoser_output - try: - # Fetching path to user directory to create the arc diagnostic folder - home_dir = os.path.expanduser('~') - filepath = os.path.join(home_dir, '.azure', consts.Arc_Diagnostic_Logs) - # Creating Diagnostic folder and its subfolder with the given timestamp and cluster name to store all the logs - try: - os.mkdir(filepath) - except FileExistsError: - pass - filepath_with_timestamp = os.path.join(filepath, time_stamp) - try: - os.mkdir(filepath_with_timestamp) - except FileExistsError: - # Deleting the folder if present with the same timestamp to prevent overriding in the same folder and then creating it again - shutil.rmtree(filepath_with_timestamp, ignore_errors=True) - os.mkdir(filepath_with_timestamp) - pass - - return filepath_with_timestamp, True - - # For handling storage or OS exception that may occur during the execution - except OSError as e: - if "[Errno 28]" in str(e): - shutil.rmtree(filepath_with_timestamp, ignore_errors=False, onerror=None) - telemetry.set_exception(exception=e, fault_type=consts.No_Storage_Space_Available_Fault_Type, summary="No space left on device") - return "", False - else: - logger.warning("An exception has occured while creating the diagnostic logs folder in your local machine. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Diagnostics_Folder_Creation_Failed_Fault_Type, summary="Error while trying to create diagnostic logs folder") - diagnoser_output.append("An exception has occured while creating the diagnostic logs folder in your local machine. Exception: {}".format(str(e)) + "\n") - return "", False - - # To handle any exception that may occur during the execution - except Exception as e: - logger.warning("An exception has occured while creating the diagnostic logs folder in your local machine. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Diagnostics_Folder_Creation_Failed_Fault_Type, summary="Error while trying to create diagnostic logs folder") - diagnoser_output.append("An exception has occured while creating the diagnostic logs folder in your local machine. Exception: {}".format(str(e)) + "\n") - return "", False - - def fetch_kubectl_cluster_info(filepath_with_timestamp, storage_space_available, kubectl_client_location, kube_config, kube_context): global diagnoser_output @@ -494,8 +451,10 @@ def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, filepat counter_container_logs = 0 elif counter_container_logs == 0: dns_check_log += " " + outputs - dns_check, storage_space_available = azext_utils.check_cluster_DNS(dns_check_log, True, filepath_with_timestamp, storage_space_available) - outbound_connectivity_check, storage_space_available = azext_utils.check_cluster_outbound_connectivity(diagnoser_container_log_list[-1], True, filepath_with_timestamp, storage_space_available) + # dns_check, storage_space_available = azext_utils.check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available) + # outbound_connectivity_check, storage_space_available = azext_utils.check_cluster_outbound_connectivity(diagnoser_container_log_list[-1], filepath_with_timestamp, storage_space_available) + dns_check, storage_space_available = azext_utils.check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available, diagnoser_output) + outbound_connectivity_check, storage_space_available = azext_utils.check_cluster_outbound_connectivity(diagnoser_container_log_list[-1], filepath_with_timestamp, storage_space_available, diagnoser_output) else: return consts.Diagnostic_Check_Incomplete, storage_space_available @@ -913,45 +872,45 @@ def describe_non_ready_agent_log(filepath_with_timestamp, corev1_api_instance, a return storage_space_available -def fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, flag): - - # This function is used to store the output that is obtained throughout the Diagnoser process - global diagnoser_output - try: - # If storage space is available then only we store the output - if storage_space_available: - # Path to store the diagnoser results - cli_output_logger_path = os.path.join(filepath_with_timestamp, consts.Diagnoser_Results) - # If any results are obtained during the process than we will add it to the text file. - if len(diagnoser_output) > 0: - with open(cli_output_logger_path, 'w+') as cli_output_writer: - for output in diagnoser_output: - cli_output_writer.write(output + "\n") - # If flag is 0 that means that process was terminated using the Keyboard Interrupt so adding that also to the text file - if flag == 0: - cli_output_writer.write("Process terminated externally.\n") - - # If no issues was found during the whole troubleshoot execution - elif flag: - with open(cli_output_logger_path, 'w+') as cli_output_writer: - cli_output_writer.write("The diagnoser didn't find any issues on the cluster.\n") - # If process was terminated by user - else: - with open(cli_output_logger_path, 'w+') as cli_output_writer: - cli_output_writer.write("Process terminated externally.\n") - - return consts.Diagnostic_Check_Passed - - # For handling storage or OS exception that may occur during the execution - except OSError as e: - if "[Errno 28]" in str(e): - storage_space_available = False - telemetry.set_exception(exception=e, fault_type=consts.No_Storage_Space_Available_Fault_Type, summary="No space left on device") - shutil.rmtree(filepath_with_timestamp, ignore_errors=False, onerror=None) - - # To handle any exception that may occur during the execution - except Exception as e: - logger.warning("An exception has occured while trying to store the diagnoser results. Exception: {}".format(str(e)) + "\n") - telemetry.set_exception(exception=e, fault_type=consts.Diagnoser_Result_Fault_Type, summary="Error while storing the diagnoser results") - - return consts.Diagnostic_Check_Failed +# def fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, flag): + +# # This function is used to store the output that is obtained throughout the Diagnoser process +# global diagnoser_output +# try: +# # If storage space is available then only we store the output +# if storage_space_available: +# # Path to store the diagnoser results +# cli_output_logger_path = os.path.join(filepath_with_timestamp, consts.Diagnoser_Results) +# # If any results are obtained during the process than we will add it to the text file. +# if len(diagnoser_output) > 0: +# with open(cli_output_logger_path, 'w+') as cli_output_writer: +# for output in diagnoser_output: +# cli_output_writer.write(output + "\n") +# # If flag is 0 that means that process was terminated using the Keyboard Interrupt so adding that also to the text file +# if flag == 0: +# cli_output_writer.write("Process terminated externally.\n") + +# # If no issues was found during the whole troubleshoot execution +# elif flag: +# with open(cli_output_logger_path, 'w+') as cli_output_writer: +# cli_output_writer.write("The diagnoser didn't find any issues on the cluster.\n") +# # If process was terminated by user +# else: +# with open(cli_output_logger_path, 'w+') as cli_output_writer: +# cli_output_writer.write("Process terminated externally.\n") + +# return consts.Diagnostic_Check_Passed + +# # For handling storage or OS exception that may occur during the execution +# except OSError as e: +# if "[Errno 28]" in str(e): +# storage_space_available = False +# telemetry.set_exception(exception=e, fault_type=consts.No_Storage_Space_Available_Fault_Type, summary="No space left on device") +# shutil.rmtree(filepath_with_timestamp, ignore_errors=False, onerror=None) + +# # To handle any exception that may occur during the execution +# except Exception as e: +# logger.warning("An exception has occured while trying to store the diagnoser results. Exception: {}".format(str(e)) + "\n") +# telemetry.set_exception(exception=e, fault_type=consts.Diagnoser_Result_Fault_Type, summary="Error while storing the diagnoser results") + +# return consts.Diagnostic_Check_Failed diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index f00a9d106f7..da7f9f1fe90 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -25,6 +25,8 @@ from kubernetes.client.rest import ApiException from azext_connectedk8s._client_factory import _resource_client_factory, _resource_providers_client import azext_connectedk8s._constants as consts +import azext_connectedk8s._precheckutils as precheckutils +import azext_connectedk8s._troubleshootutils as troubleshootutils from kubernetes import client as kube_client from azure.cli.core import get_default_cli from azure.cli.core.azclierror import CLIInternalError, ClientRequestError, ArgumentUsageError, ManualInterrupt, AzureResponseError, AzureInternalError, ValidationError @@ -129,9 +131,14 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex raise CLIInternalError("Unable to export {} helm chart from the registry '{}': ".format(chart_name, registry_path) + error_helm_chart_export.decode("ascii")) -def check_cluster_DNS(dns_check_log, store_logs, filepath_with_timestamp=None, storage_space_available=False): +# def check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available, for_preonboarding_checks=False): +def check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available, diagnoser_output): - global diagnoser_output + # global diagnoser_output + # if for_preonboarding_checks: + # diagnoser_output = precheckutils.diagnoser_output + # else: + # diagnoser_output = troubleshootutils.diagnoser_output try: if consts.DNS_Check_Result_String not in dns_check_log: return consts.Diagnostic_Check_Incomplete, storage_space_available @@ -140,13 +147,13 @@ def check_cluster_DNS(dns_check_log, store_logs, filepath_with_timestamp=None, s if("NXDOMAIN" in formatted_dns_log or "connection timed out" in formatted_dns_log): logger.warning("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") diagnoser_output.append("Error: We found an issue with the DNS resolution on your cluster. For details about debugging DNS issues visit 'https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/'.\n") - if storage_space_available and store_logs : + if storage_space_available: dns_check_path = os.path.join(filepath_with_timestamp, consts.DNS_Check) with open(dns_check_path, 'w+') as dns: dns.write(formatted_dns_log + "\nWe found an issue with the DNS resolution on your cluster.") return consts.Diagnostic_Check_Failed, storage_space_available else: - if storage_space_available and store_logs : + if storage_space_available: dns_check_path = os.path.join(filepath_with_timestamp, consts.DNS_Check) with open(dns_check_path, 'w+') as dns: dns.write(formatted_dns_log + "\nCluster DNS check passed successfully.") @@ -172,9 +179,14 @@ def check_cluster_DNS(dns_check_log, store_logs, filepath_with_timestamp=None, s return consts.Diagnostic_Check_Incomplete, storage_space_available -def check_cluster_outbound_connectivity(outbound_connectivity_check_log, store_logs, filepath_with_timestamp=None, storage_space_available=False): +# def check_cluster_outbound_connectivity(outbound_connectivity_check_log, filepath_with_timestamp, storage_space_available, for_preonboarding_checks=False): +def check_cluster_outbound_connectivity(outbound_connectivity_check_log, filepath_with_timestamp, storage_space_available, diagnoser_output): - global diagnoser_output + # global diagnoser_output + # if for_preonboarding_checks: + # diagnoser_output = precheckutils.diagnoser_output + # else: + # diagnoser_output = troubleshootutils.diagnoser_output try: outbound_connectivity_response = outbound_connectivity_check_log[-1:-4:-1] outbound_connectivity_response = outbound_connectivity_response[::-1] @@ -182,7 +194,7 @@ def check_cluster_outbound_connectivity(outbound_connectivity_check_log, store_l return consts.Diagnostic_Check_Incomplete, storage_space_available # Validating if outbound connectiivty is working or not and displaying proper result if(outbound_connectivity_response != "000"): - if storage_space_available and store_logs : + if storage_space_available: outbound_connectivity_check_path = os.path.join(filepath_with_timestamp, consts.Outbound_Network_Connectivity_Check) with open(outbound_connectivity_check_path, 'w+') as outbound: outbound.write("Response code " + outbound_connectivity_response + "\nOutbound network connectivity check passed successfully.") @@ -190,7 +202,7 @@ def check_cluster_outbound_connectivity(outbound_connectivity_check_log, store_l else: logger.warning("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") diagnoser_output.append("Error: We found an issue with outbound network connectivity from the cluster.\nIf your cluster is behind an outbound proxy server, please ensure that you have passed proxy parameters during the onboarding of your cluster.\nFor more details visit 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#connect-using-an-outbound-proxy-server'.\nPlease ensure to meet the following network requirements 'https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster?tabs=azure-cli#meet-network-requirements' \n") - if storage_space_available and store_logs : + if storage_space_available: outbound_connectivity_check_path = os.path.join(filepath_with_timestamp, consts.Outbound_Network_Connectivity_Check) with open(outbound_connectivity_check_path, 'w+') as outbound: outbound.write("Response code " + outbound_connectivity_response + "\nWe found an issue with Outbound network connectivity from the cluster.") @@ -216,6 +228,103 @@ def check_cluster_outbound_connectivity(outbound_connectivity_check_log, store_l return consts.Diagnostic_Check_Incomplete, storage_space_available +def fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, flag, for_preonboarding_checks=False): +# def fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, flag, diagnoser_output): + + # This function is used to store the output that is obtained throughout the Diagnoser process + # global diagnoser_output + if for_preonboarding_checks: + diagnoser_output = precheckutils.diagnoser_output + else: + diagnoser_output = troubleshootutils.diagnoser_output + try: + # If storage space is available then only we store the output + if storage_space_available: + # Path to store the diagnoser results + cli_output_logger_path = os.path.join(filepath_with_timestamp, consts.Diagnoser_Results) + # If any results are obtained during the process than we will add it to the text file. + if len(diagnoser_output) > 0: + with open(cli_output_logger_path, 'w+') as cli_output_writer: + for output in diagnoser_output: + cli_output_writer.write(output + "\n") + # If flag is 0 that means that process was terminated using the Keyboard Interrupt so adding that also to the text file + if flag == 0: + cli_output_writer.write("Process terminated externally.\n") + + # If no issues was found during the whole troubleshoot execution + elif flag: + with open(cli_output_logger_path, 'w+') as cli_output_writer: + cli_output_writer.write("The diagnoser didn't find any issues on the cluster.\n") + # If process was terminated by user + else: + with open(cli_output_logger_path, 'w+') as cli_output_writer: + cli_output_writer.write("Process terminated externally.\n") + + return consts.Diagnostic_Check_Passed + + # For handling storage or OS exception that may occur during the execution + except OSError as e: + if "[Errno 28]" in str(e): + storage_space_available = False + telemetry.set_exception(exception=e, fault_type=consts.No_Storage_Space_Available_Fault_Type, summary="No space left on device") + shutil.rmtree(filepath_with_timestamp, ignore_errors=False, onerror=None) + + # To handle any exception that may occur during the execution + except Exception as e: + logger.warning("An exception has occured while trying to store the diagnoser results. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Diagnoser_Result_Fault_Type, summary="Error while storing the diagnoser results") + + return consts.Diagnostic_Check_Failed + + +def create_folder_diagnosticlogs(time_stamp, for_preonboarding_checks=False): +# def create_folder_diagnosticlogs(time_stamp, diagnoser_output): + + # global diagnoser_output + if for_preonboarding_checks: + diagnoser_output = precheckutils.diagnoser_output + else: + diagnoser_output = troubleshootutils.diagnoser_output + try: + # Fetching path to user directory to create the arc diagnostic folder + home_dir = os.path.expanduser('~') + filepath = os.path.join(home_dir, '.azure', consts.Arc_Diagnostic_Logs) + # Creating Diagnostic folder and its subfolder with the given timestamp and cluster name to store all the logs + try: + os.mkdir(filepath) + except FileExistsError: + pass + filepath_with_timestamp = os.path.join(filepath, time_stamp) + try: + os.mkdir(filepath_with_timestamp) + except FileExistsError: + # Deleting the folder if present with the same timestamp to prevent overriding in the same folder and then creating it again + shutil.rmtree(filepath_with_timestamp, ignore_errors=True) + os.mkdir(filepath_with_timestamp) + pass + + return filepath_with_timestamp, True + + # For handling storage or OS exception that may occur during the execution + except OSError as e: + if "[Errno 28]" in str(e): + shutil.rmtree(filepath_with_timestamp, ignore_errors=False, onerror=None) + telemetry.set_exception(exception=e, fault_type=consts.No_Storage_Space_Available_Fault_Type, summary="No space left on device") + return "", False + else: + logger.warning("An exception has occured while creating the diagnostic logs folder in your local machine. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Diagnostics_Folder_Creation_Failed_Fault_Type, summary="Error while trying to create diagnostic logs folder") + diagnoser_output.append("An exception has occured while creating the diagnostic logs folder in your local machine. Exception: {}".format(str(e)) + "\n") + return "", False + + # To handle any exception that may occur during the execution + except Exception as e: + logger.warning("An exception has occured while creating the diagnostic logs folder in your local machine. Exception: {}".format(str(e)) + "\n") + telemetry.set_exception(exception=e, fault_type=consts.Diagnostics_Folder_Creation_Failed_Fault_Type, summary="Error while trying to create diagnostic logs folder") + diagnoser_output.append("An exception has occured while creating the diagnostic logs folder in your local machine. Exception: {}".format(str(e)) + "\n") + return "", False + + def add_helm_repo(kube_config, kube_context, helm_client_location): repo_name = os.getenv('HELMREPONAME') repo_url = os.getenv('HELMREPOURL') diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index b782fb37e29..28ea7f95b77 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -144,8 +144,32 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat helm_client_location = install_helm_client() diagnostic_checks = "Failed" batchv1_api_instance = kube_client.BatchV1Api() + storage_space_available = True + precheckutils.initialize_diagnoser_output() + # diagnoser_output = precheckutils.initialize_diagnoser_output() + current_time = time.ctime(time.time()) + time_stamp = "" + for elements in current_time: + if(elements == ' '): + time_stamp += '-' + continue + elif(elements == ':'): + time_stamp += '.' + continue + time_stamp += elements + time_stamp = cluster_name + '-' + time_stamp + + # Generate the diagnostic folder in a given location + filepath_with_timestamp, diagnostic_folder_status = utils.create_folder_diagnosticlogs(time_stamp, True) + # filepath_with_timestamp, diagnostic_folder_status = utils.create_folder_diagnosticlogs(time_stamp, diagnoser_output) + + if(diagnostic_folder_status is not True): + storage_space_available = False + # Performing cluster-diagnostic-checks - diagnostic_checks = precheckutils.fetch_diagnostic_checks_results(api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert) + diagnostic_checks, storage_space_available = precheckutils.fetch_diagnostic_checks_results(api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, filepath_with_timestamp, storage_space_available) + Storing_Diagnoser_Results_Logs = utils.fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, 1, True) + # Storing_Diagnoser_Results_Logs = utils.fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, 1, diagnoser_output) except Exception as e: telemetry.set_exception(exception="An exception has occured while trying to execute pre-onboarding diagnostic checks : {}".format(str(e)), @@ -157,8 +181,14 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat except KeyboardInterrupt: raise ManualInterrupt('Process terminated externally.') + # If the checks didnt pass then stop the onboarding if diagnostic_checks != consts.Diagnostic_Check_Passed: - raise ValidationError("One or more pre-onboarding diagnostic checks failed and hence not proceeding with cluster onboarding. Please resolve them and try onboarding again.") + # all_checks_passed = False + if storage_space_available: + logger.warning("The diagnoser logs have been saved at this path:" + filepath_with_timestamp + " .\nThese logs can be attached while filing a support ticket for further assistance.\n") + raise ValidationError("One or more pre-onboarding diagnostic checks failed and hence not proceeding with cluster onboarding. Please resolve them and try onboarding again.") + else: + raise ValidationError("One or more pre-onboarding diagnostic checks failed and hence not proceeding with cluster onboarding. Please resolve them and try onboarding again.") required_node_exists = check_linux_amd64_node(node_api_response) if not required_node_exists: @@ -2258,7 +2288,7 @@ def troubleshoot(cmd, client, resource_group_name, cluster_name, kube_config=Non time_stamp += elements time_stamp = cluster_name + '-' + time_stamp # Generate the diagnostic folder in a given location - filepath_with_timestamp, diagnostic_folder_status = troubleshootutils.create_folder_diagnosticlogs(time_stamp) + filepath_with_timestamp, diagnostic_folder_status = utils.create_folder_diagnosticlogs(time_stamp) if(diagnostic_folder_status is not True): storage_space_available = False @@ -2338,7 +2368,7 @@ def troubleshoot(cmd, client, resource_group_name, cluster_name, kube_config=Non diagnostic_checks[consts.Diagnoser_Check], storage_space_available = troubleshootutils.check_diagnoser_container(corev1_api_instance, batchv1_api_instance, filepath_with_timestamp, storage_space_available, absolute_path, probable_sufficient_resource_for_agents, helm_client_location, kubectl_client_location, release_namespace, diagnostic_checks[consts.KAP_Security_Policy_Check], kube_config, kube_context) # Adding cli output to the logs - diagnostic_checks[consts.Storing_Diagnoser_Results_Logs] = troubleshootutils.fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, 1) + diagnostic_checks[consts.Storing_Diagnoser_Results_Logs] = utils.fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, 1) # If all the checks passed then display no error found all_checks_passed = True @@ -2359,7 +2389,7 @@ def troubleshoot(cmd, client, resource_group_name, cluster_name, kube_config=Non # Handling the user manual interrupt except KeyboardInterrupt: try: - troubleshootutils.fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, 0) + utils.fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, 0) except Exception as e: pass raise ManualInterrupt('Process terminated externally.') From 4ab9ab05391718a9398df61329bb848206091af9 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Tue, 24 Jan 2023 15:42:26 +0530 Subject: [PATCH 50/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py modified: src/connectedk8s/azext_connectedk8s/_troubleshootutils.py modified: src/connectedk8s/azext_connectedk8s/_utils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- .../azext_connectedk8s/_constants.py | 2 ++ .../azext_connectedk8s/_precheckutils.py | 4 +-- .../azext_connectedk8s/_troubleshootutils.py | 2 -- src/connectedk8s/azext_connectedk8s/_utils.py | 33 ++++--------------- src/connectedk8s/azext_connectedk8s/custom.py | 11 ++++--- 5 files changed, 16 insertions(+), 36 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 2137e0e3505..c1c397740ae 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -169,6 +169,8 @@ Arc_Agents_Logs = "arc_agents_logs" Arc_Deployment_Logs = "arc_deployment_logs" Arc_Diagnostic_Logs = "arc_diagnostic_logs" +Pre_Onboarding_Check_Logs = "pre_onboarding_check_logs" +Pre_Onboarding_Helm_Charts_Folder_Name = 'PreOnboardingCharts' Describe_Non_Ready_Arc_Agents = "describe_non_ready_arc_agents" Agent_State = "agent_state.txt" Arc_Agents_Events = "arc_agent_events.txt" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 0be6a23183d..1f671a907d1 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -72,8 +72,6 @@ def fetch_diagnostic_checks_results(corev1_api_instance, batchv1_api_instance, h counter_container_logs = 0 elif counter_container_logs == 0: dns_check_log += " " + outputs - # dns_check, storage_space_available = azext_utils.check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available, True) - # outbound_connectivity_check, storage_space_available = azext_utils.check_cluster_outbound_connectivity(cluster_diagnostic_checks_container_log_list[-1], filepath_with_timestamp, storage_space_available, True) dns_check, storage_space_available = azext_utils.check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available, diagnoser_output) outbound_connectivity_check, storage_space_available = azext_utils.check_cluster_outbound_connectivity(cluster_diagnostic_checks_container_log_list[-1], filepath_with_timestamp, storage_space_available, diagnoser_output) else: @@ -134,7 +132,7 @@ def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_ins telemetry.set_exception(exception=error_kubectl_delete_helm.decode("ascii"), fault_type=consts.Cluster_Diagnostic_Checks_Release_Cleanup_Failed, summary="Error while executing cluster diagnostic checks Job") return - chart_path = azext_utils.get_chart_path(consts.Cluster_Diagnostic_Checks_Job_Registry_Path, kube_config, kube_context, helm_client_location, 'cluster_diagnostic_checks') + chart_path = azext_utils.get_chart_path(consts.Cluster_Diagnostic_Checks_Job_Registry_Path, kube_config, kube_context, helm_client_location, consts.Pre_Onboarding_Helm_Charts_Folder_Name) helm_install_release_cluster_diagnostic_checks(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location) diff --git a/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py b/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py index d4ebb9bee74..5917ae7e1b8 100644 --- a/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py +++ b/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py @@ -451,8 +451,6 @@ def check_diagnoser_container(corev1_api_instance, batchv1_api_instance, filepat counter_container_logs = 0 elif counter_container_logs == 0: dns_check_log += " " + outputs - # dns_check, storage_space_available = azext_utils.check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available) - # outbound_connectivity_check, storage_space_available = azext_utils.check_cluster_outbound_connectivity(diagnoser_container_log_list[-1], filepath_with_timestamp, storage_space_available) dns_check, storage_space_available = azext_utils.check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available, diagnoser_output) outbound_connectivity_check, storage_space_available = azext_utils.check_cluster_outbound_connectivity(diagnoser_container_log_list[-1], filepath_with_timestamp, storage_space_available, diagnoser_output) else: diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index da7f9f1fe90..53e31c1161c 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -74,7 +74,7 @@ def validate_location(cmd, location): def get_chart_path(registry_path, kube_config, kube_context, helm_client_location, chart_path_name='AzureArcCharts'): # Pulling helm chart from registry os.environ['HELM_EXPERIMENTAL_OCI'] = '1' - if chart_path_name == 'cluster_diagnostic_checks': + if chart_path_name == consts.Pre_Onboarding_Helm_Charts_Folder_Name: pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, chart_path_name) else: pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, 'azure-arc') @@ -87,13 +87,13 @@ def get_chart_path(registry_path, kube_config, kube_context, helm_client_locatio except: logger.warning("Unable to cleanup the {} already present on the machine. In case of failure, please cleanup the directory '{}' and try again.".format(chart_path_name, chart_export_path)) - if chart_path_name == 'cluster_diagnostic_checks': + if chart_path_name == consts.Pre_Onboarding_Helm_Charts_Folder_Name: export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, chart_path_name) else: export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, 'azure-arc') # Returning helm chart path - if chart_path_name == 'cluster_diagnostic_checks': + if chart_path_name == consts.Pre_Onboarding_Helm_Charts_Folder_Name: helm_chart_path = os.path.join(chart_export_path, 'cluster-diagnostic-checks') chart_path = helm_chart_path else: @@ -134,11 +134,6 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex # def check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available, for_preonboarding_checks=False): def check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available, diagnoser_output): - # global diagnoser_output - # if for_preonboarding_checks: - # diagnoser_output = precheckutils.diagnoser_output - # else: - # diagnoser_output = troubleshootutils.diagnoser_output try: if consts.DNS_Check_Result_String not in dns_check_log: return consts.Diagnostic_Check_Incomplete, storage_space_available @@ -181,12 +176,7 @@ def check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_avai # def check_cluster_outbound_connectivity(outbound_connectivity_check_log, filepath_with_timestamp, storage_space_available, for_preonboarding_checks=False): def check_cluster_outbound_connectivity(outbound_connectivity_check_log, filepath_with_timestamp, storage_space_available, diagnoser_output): - - # global diagnoser_output - # if for_preonboarding_checks: - # diagnoser_output = precheckutils.diagnoser_output - # else: - # diagnoser_output = troubleshootutils.diagnoser_output + try: outbound_connectivity_response = outbound_connectivity_check_log[-1:-4:-1] outbound_connectivity_response = outbound_connectivity_response[::-1] @@ -229,14 +219,13 @@ def check_cluster_outbound_connectivity(outbound_connectivity_check_log, filepat def fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, flag, for_preonboarding_checks=False): -# def fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, flag, diagnoser_output): # This function is used to store the output that is obtained throughout the Diagnoser process - # global diagnoser_output if for_preonboarding_checks: diagnoser_output = precheckutils.diagnoser_output else: diagnoser_output = troubleshootutils.diagnoser_output + try: # If storage space is available then only we store the output if storage_space_available: @@ -277,18 +266,12 @@ def fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, f return consts.Diagnostic_Check_Failed -def create_folder_diagnosticlogs(time_stamp, for_preonboarding_checks=False): -# def create_folder_diagnosticlogs(time_stamp, diagnoser_output): +def create_folder_diagnosticlogs(time_stamp, folder_name): - # global diagnoser_output - if for_preonboarding_checks: - diagnoser_output = precheckutils.diagnoser_output - else: - diagnoser_output = troubleshootutils.diagnoser_output try: # Fetching path to user directory to create the arc diagnostic folder home_dir = os.path.expanduser('~') - filepath = os.path.join(home_dir, '.azure', consts.Arc_Diagnostic_Logs) + filepath = os.path.join(home_dir, '.azure', folder_name) # Creating Diagnostic folder and its subfolder with the given timestamp and cluster name to store all the logs try: os.mkdir(filepath) @@ -314,14 +297,12 @@ def create_folder_diagnosticlogs(time_stamp, for_preonboarding_checks=False): else: logger.warning("An exception has occured while creating the diagnostic logs folder in your local machine. Exception: {}".format(str(e)) + "\n") telemetry.set_exception(exception=e, fault_type=consts.Diagnostics_Folder_Creation_Failed_Fault_Type, summary="Error while trying to create diagnostic logs folder") - diagnoser_output.append("An exception has occured while creating the diagnostic logs folder in your local machine. Exception: {}".format(str(e)) + "\n") return "", False # To handle any exception that may occur during the execution except Exception as e: logger.warning("An exception has occured while creating the diagnostic logs folder in your local machine. Exception: {}".format(str(e)) + "\n") telemetry.set_exception(exception=e, fault_type=consts.Diagnostics_Folder_Creation_Failed_Fault_Type, summary="Error while trying to create diagnostic logs folder") - diagnoser_output.append("An exception has occured while creating the diagnostic logs folder in your local machine. Exception: {}".format(str(e)) + "\n") return "", False diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 28ea7f95b77..69b08bf2a04 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -146,7 +146,6 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat batchv1_api_instance = kube_client.BatchV1Api() storage_space_available = True precheckutils.initialize_diagnoser_output() - # diagnoser_output = precheckutils.initialize_diagnoser_output() current_time = time.ctime(time.time()) time_stamp = "" for elements in current_time: @@ -160,8 +159,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat time_stamp = cluster_name + '-' + time_stamp # Generate the diagnostic folder in a given location - filepath_with_timestamp, diagnostic_folder_status = utils.create_folder_diagnosticlogs(time_stamp, True) - # filepath_with_timestamp, diagnostic_folder_status = utils.create_folder_diagnosticlogs(time_stamp, diagnoser_output) + filepath_with_timestamp, diagnostic_folder_status = utils.create_folder_diagnosticlogs(time_stamp, consts.Pre_Onboarding_Check_Logs) if(diagnostic_folder_status is not True): storage_space_available = False @@ -169,7 +167,6 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat # Performing cluster-diagnostic-checks diagnostic_checks, storage_space_available = precheckutils.fetch_diagnostic_checks_results(api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, filepath_with_timestamp, storage_space_available) Storing_Diagnoser_Results_Logs = utils.fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, 1, True) - # Storing_Diagnoser_Results_Logs = utils.fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, 1, diagnoser_output) except Exception as e: telemetry.set_exception(exception="An exception has occured while trying to execute pre-onboarding diagnostic checks : {}".format(str(e)), @@ -179,6 +176,10 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat # Handling the user manual interrupt except KeyboardInterrupt: + try: + utils.fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, 0, True) + except Exception as e: + pass raise ManualInterrupt('Process terminated externally.') # If the checks didnt pass then stop the onboarding @@ -2288,7 +2289,7 @@ def troubleshoot(cmd, client, resource_group_name, cluster_name, kube_config=Non time_stamp += elements time_stamp = cluster_name + '-' + time_stamp # Generate the diagnostic folder in a given location - filepath_with_timestamp, diagnostic_folder_status = utils.create_folder_diagnosticlogs(time_stamp) + filepath_with_timestamp, diagnostic_folder_status = utils.create_folder_diagnosticlogs(time_stamp, consts.Arc_Diagnostic_Logs) if(diagnostic_folder_status is not True): storage_space_available = False From 84dc48b69a50b04a9dba2ba2da0d20bbc0bc6586 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Tue, 24 Jan 2023 15:53:38 +0530 Subject: [PATCH 51/62] modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py --- src/connectedk8s/azext_connectedk8s/_precheckutils.py | 3 --- src/connectedk8s/azext_connectedk8s/custom.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 1f671a907d1..5f0b58d17f7 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -46,9 +46,6 @@ diagnoser_output = [] -def initialize_diagnoser_output(): - global diagnoser_output - def fetch_diagnostic_checks_results(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, filepath_with_timestamp, storage_space_available): global diagnoser_output try: diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 69b08bf2a04..0cb9e69085c 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -145,7 +145,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat diagnostic_checks = "Failed" batchv1_api_instance = kube_client.BatchV1Api() storage_space_available = True - precheckutils.initialize_diagnoser_output() + current_time = time.ctime(time.time()) time_stamp = "" for elements in current_time: From bbf2192b6ce5a23a9c89b15f67ed7daea19e4b7b Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Tue, 24 Jan 2023 16:20:29 +0530 Subject: [PATCH 52/62] modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py --- src/connectedk8s/azext_connectedk8s/_precheckutils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 5f0b58d17f7..5ac57c3403e 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -156,13 +156,13 @@ def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_ins telemetry.set_exception(exception="Couldn't schedule cluster diagnostic checks job in the cluster", fault_type=consts.Cluster_Diagnostic_Checks_Job_Not_Scheduled, summary="Couldn't schedule cluster diagnostic checks job in the cluster") logger.warning("Unable to schedule the cluster diagnostic checks job in the kubernetes cluster. The possible reasons can be presence of a security policy or security context constraint (SCC) or it may happen becuase of lack of ResourceQuota.\n") - # Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) + Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return elif (is_job_scheduled is True and is_job_complete is False): telemetry.set_exception(exception="Couldn't complete cluster diagnostic checks job after scheduling in the cluster", fault_type=consts.Cluster_Diagnostic_Checks_Job_Not_Complete, summary="Couldn't complete cluster diagnostic checks job after scheduling in the cluster") logger.warning("Unable to finish the cluster diagnostic checks job in the kubernetes cluster. The possible reasons can be resource constraints on the cluster.\n") - # Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) + Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return else: # Fetching the cluster diagnostic checks Container logs @@ -175,7 +175,7 @@ def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_ins # Creating a text file with the name of the container and adding that containers logs in it cluster_diagnostic_checks_container_log = corev1_api_instance.read_namespaced_pod_log(name=pod_name, container="cluster-diagnostic-checks-container", namespace='azure-arc-release') # Clearing all the resources after fetching the cluster diagnostic checks container logs - # Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) + Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) # To handle any exception that may occur during the execution except Exception as e: From 402b5d814149b2889f65d156be7316feb317d220 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Tue, 24 Jan 2023 18:59:52 +0530 Subject: [PATCH 53/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py modified: src/connectedk8s/azext_connectedk8s/_utils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- src/connectedk8s/azext_connectedk8s/_constants.py | 2 +- src/connectedk8s/azext_connectedk8s/_precheckutils.py | 1 + src/connectedk8s/azext_connectedk8s/_utils.py | 2 +- src/connectedk8s/azext_connectedk8s/custom.py | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index c1c397740ae..51242131624 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -188,7 +188,7 @@ Cluster_Diagnostic_Checks_Release_Cleanup_Failed = "Error occured while cleaning up the cluster diagnostic checks helm release" Cluster_Diagnostic_Checks_Job_Not_Scheduled = 'Unable to schedule cluster-diagnostic-checks job' Cluster_Diagnostic_Checks_Job_Not_Complete = 'Unable to complete cluster-diagnostic-checks job after scheduling' -Pre_Onboarding_Diagnostic_Checks_Execution_Failed= 'Exception occured while trying to execute pre-onboarding diagnostic checks' +Pre_Onboarding_Diagnostic_Checks_Execution_Failed = 'Exception occured while trying to execute pre-onboarding diagnostic checks' # Diagnostic Results Name Outbound_Connectivity_Check_Result_String = "Outbound Network Connectivity Result:" DNS_Check_Result_String = "DNS Result:" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 5ac57c3403e..a3a351f0a90 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -46,6 +46,7 @@ diagnoser_output = [] + def fetch_diagnostic_checks_results(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, filepath_with_timestamp, storage_space_available): global diagnoser_output try: diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index 53e31c1161c..b8a49090233 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -133,7 +133,7 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex # def check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available, for_preonboarding_checks=False): def check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available, diagnoser_output): - + try: if consts.DNS_Check_Result_String not in dns_check_log: return consts.Diagnostic_Check_Incomplete, storage_space_available diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 0cb9e69085c..cbe7a365d35 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -157,7 +157,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat continue time_stamp += elements time_stamp = cluster_name + '-' + time_stamp - + # Generate the diagnostic folder in a given location filepath_with_timestamp, diagnostic_folder_status = utils.create_folder_diagnosticlogs(time_stamp, consts.Pre_Onboarding_Check_Logs) From 9354118a9e61fdef025014dd4aeaae6f67921eae Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Wed, 25 Jan 2023 10:49:46 +0530 Subject: [PATCH 54/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- src/connectedk8s/azext_connectedk8s/_constants.py | 3 +-- src/connectedk8s/azext_connectedk8s/custom.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 51242131624..196d89a1a1c 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -181,8 +181,7 @@ Outbound_Network_Connectivity_Check = "outbound_network_connectivity_check.txt" Events_of_Incomplete_Diagnoser_Job = "diagnoser_failure_events.txt" # Connect Precheck Diagnoser constants -# Cluster_Diagnostic_Checks_Job_Registry_Path = "arck8sdiagnoser.azurecr.io/public/pre-onboarding-inspector:0.1.0" -Cluster_Diagnostic_Checks_Job_Registry_Path = "arck8sdiagnoser.azurecr.io/public/cluster-diagnostics-checks:0.1.0" +Cluster_Diagnostic_Checks_Job_Registry_Path = "arck8sdiagnoser.azurecr.io/public/azurearck8s/helmchart/stable/clusterdiagnosticchecks:0.1.0" Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type = "Error while installing cluster diagnostic checks helm release" Cluster_Diagnostic_Checks_Execution_Failed_Fault_Type = "Error occured while executing cluster diagnostic checks" Cluster_Diagnostic_Checks_Release_Cleanup_Failed = "Error occured while cleaning up the cluster diagnostic checks helm release" diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index cbe7a365d35..fa92a84192d 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -166,7 +166,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat # Performing cluster-diagnostic-checks diagnostic_checks, storage_space_available = precheckutils.fetch_diagnostic_checks_results(api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, filepath_with_timestamp, storage_space_available) - Storing_Diagnoser_Results_Logs = utils.fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, 1, True) + _ = utils.fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, 1, True) except Exception as e: telemetry.set_exception(exception="An exception has occured while trying to execute pre-onboarding diagnostic checks : {}".format(str(e)), From cc6845ccfa2639a929773793f7c4adedb761500b Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Wed, 25 Jan 2023 11:29:52 +0530 Subject: [PATCH 55/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/azext_connectedk8s/_troubleshootutils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- .../azext_connectedk8s/_constants.py | 2 +- .../azext_connectedk8s/_troubleshootutils.py | 44 ------------------- src/connectedk8s/azext_connectedk8s/custom.py | 2 +- 3 files changed, 2 insertions(+), 46 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 196d89a1a1c..620ee6434fb 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -170,7 +170,7 @@ Arc_Deployment_Logs = "arc_deployment_logs" Arc_Diagnostic_Logs = "arc_diagnostic_logs" Pre_Onboarding_Check_Logs = "pre_onboarding_check_logs" -Pre_Onboarding_Helm_Charts_Folder_Name = 'PreOnboardingCharts' +Pre_Onboarding_Helm_Charts_Folder_Name = 'PreOnboardingChecksCharts' Describe_Non_Ready_Arc_Agents = "describe_non_ready_arc_agents" Agent_State = "agent_state.txt" Arc_Agents_Events = "arc_agent_events.txt" diff --git a/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py b/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py index 5917ae7e1b8..d7224406aac 100644 --- a/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py +++ b/src/connectedk8s/azext_connectedk8s/_troubleshootutils.py @@ -868,47 +868,3 @@ def describe_non_ready_agent_log(filepath_with_timestamp, corev1_api_instance, a diagnoser_output.append("An exception has occured while storing stuck agent logs in the user local machine. Exception: {}".format(str(e)) + "\n") return storage_space_available - - -# def fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, flag): - -# # This function is used to store the output that is obtained throughout the Diagnoser process -# global diagnoser_output -# try: -# # If storage space is available then only we store the output -# if storage_space_available: -# # Path to store the diagnoser results -# cli_output_logger_path = os.path.join(filepath_with_timestamp, consts.Diagnoser_Results) -# # If any results are obtained during the process than we will add it to the text file. -# if len(diagnoser_output) > 0: -# with open(cli_output_logger_path, 'w+') as cli_output_writer: -# for output in diagnoser_output: -# cli_output_writer.write(output + "\n") -# # If flag is 0 that means that process was terminated using the Keyboard Interrupt so adding that also to the text file -# if flag == 0: -# cli_output_writer.write("Process terminated externally.\n") - -# # If no issues was found during the whole troubleshoot execution -# elif flag: -# with open(cli_output_logger_path, 'w+') as cli_output_writer: -# cli_output_writer.write("The diagnoser didn't find any issues on the cluster.\n") -# # If process was terminated by user -# else: -# with open(cli_output_logger_path, 'w+') as cli_output_writer: -# cli_output_writer.write("Process terminated externally.\n") - -# return consts.Diagnostic_Check_Passed - -# # For handling storage or OS exception that may occur during the execution -# except OSError as e: -# if "[Errno 28]" in str(e): -# storage_space_available = False -# telemetry.set_exception(exception=e, fault_type=consts.No_Storage_Space_Available_Fault_Type, summary="No space left on device") -# shutil.rmtree(filepath_with_timestamp, ignore_errors=False, onerror=None) - -# # To handle any exception that may occur during the execution -# except Exception as e: -# logger.warning("An exception has occured while trying to store the diagnoser results. Exception: {}".format(str(e)) + "\n") -# telemetry.set_exception(exception=e, fault_type=consts.Diagnoser_Result_Fault_Type, summary="Error while storing the diagnoser results") - -# return consts.Diagnostic_Check_Failed diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index fa92a84192d..562bd22049e 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -166,7 +166,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat # Performing cluster-diagnostic-checks diagnostic_checks, storage_space_available = precheckutils.fetch_diagnostic_checks_results(api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, filepath_with_timestamp, storage_space_available) - _ = utils.fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, 1, True) + utils.fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, 1, True) except Exception as e: telemetry.set_exception(exception="An exception has occured while trying to execute pre-onboarding diagnostic checks : {}".format(str(e)), From 551ef0d685ef7f509a47058332ad069e4709335f Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Wed, 25 Jan 2023 12:04:22 +0530 Subject: [PATCH 56/62] modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py modified: src/connectedk8s/azext_connectedk8s/_utils.py modified: src/connectedk8s/azext_connectedk8s/custom.py --- src/connectedk8s/azext_connectedk8s/_precheckutils.py | 2 +- src/connectedk8s/azext_connectedk8s/_utils.py | 2 -- src/connectedk8s/azext_connectedk8s/custom.py | 10 +++------- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index a3a351f0a90..725864fd055 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -126,7 +126,7 @@ def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_ins exception_occured_counter = 1 # If any exception occured we will print the exception and return if exception_occured_counter == 1: - logger.warning("An error occured while installing the cluster diagnostic checks helm release in the cluster. Exception:") + logger.warning("Cleanup of previous diagnostic checks helm release failed and hence couldn't install the new helm release. Please cleanup older release using \"helm delete cluster-diagnostic-checks -n azuer-arc-release\" and try onboarding again") telemetry.set_exception(exception=error_kubectl_delete_helm.decode("ascii"), fault_type=consts.Cluster_Diagnostic_Checks_Release_Cleanup_Failed, summary="Error while executing cluster diagnostic checks Job") return diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index b8a49090233..a2c5068f3c9 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -131,7 +131,6 @@ def export_helm_chart(registry_path, chart_export_path, kube_config, kube_contex raise CLIInternalError("Unable to export {} helm chart from the registry '{}': ".format(chart_name, registry_path) + error_helm_chart_export.decode("ascii")) -# def check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available, for_preonboarding_checks=False): def check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_available, diagnoser_output): try: @@ -174,7 +173,6 @@ def check_cluster_DNS(dns_check_log, filepath_with_timestamp, storage_space_avai return consts.Diagnostic_Check_Incomplete, storage_space_available -# def check_cluster_outbound_connectivity(outbound_connectivity_check_log, filepath_with_timestamp, storage_space_available, for_preonboarding_checks=False): def check_cluster_outbound_connectivity(outbound_connectivity_check_log, filepath_with_timestamp, storage_space_available, diagnoser_output): try: diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index 562bd22049e..ab30d506493 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -171,8 +171,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat except Exception as e: telemetry.set_exception(exception="An exception has occured while trying to execute pre-onboarding diagnostic checks : {}".format(str(e)), fault_type=consts.Pre_Onboarding_Diagnostic_Checks_Execution_Failed, summary="An exception has occured while trying to execute pre-onboarding diagnostic checks : {}".format(str(e))) - logger.warning("An exception has occured while trying to execute pre-onboarding diagnostic checks : {}".format(str(e))) - return + raise CLIInternalError("An exception has occured while trying to execute pre-onboarding diagnostic checks : {}".format(str(e))) # Handling the user manual interrupt except KeyboardInterrupt: @@ -184,12 +183,9 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat # If the checks didnt pass then stop the onboarding if diagnostic_checks != consts.Diagnostic_Check_Passed: - # all_checks_passed = False if storage_space_available: - logger.warning("The diagnoser logs have been saved at this path:" + filepath_with_timestamp + " .\nThese logs can be attached while filing a support ticket for further assistance.\n") - raise ValidationError("One or more pre-onboarding diagnostic checks failed and hence not proceeding with cluster onboarding. Please resolve them and try onboarding again.") - else: - raise ValidationError("One or more pre-onboarding diagnostic checks failed and hence not proceeding with cluster onboarding. Please resolve them and try onboarding again.") + logger.warning("The pre-check result logs logs have been saved at this path:" + filepath_with_timestamp + " .\nThese logs can be attached while filing a support ticket for further assistance.\n") + raise ValidationError("One or more pre-onboarding diagnostic checks failed and hence not proceeding with cluster onboarding. Please resolve them and try onboarding again.") required_node_exists = check_linux_amd64_node(node_api_response) if not required_node_exists: From ca6c97a14c6ed9623d7fbb9dff6836a5ce8ab3a1 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Wed, 25 Jan 2023 14:27:19 +0530 Subject: [PATCH 57/62] modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py modified: src/connectedk8s/azext_connectedk8s/_utils.py --- .../azext_connectedk8s/_constants.py | 1 + .../azext_connectedk8s/_precheckutils.py | 4 ++-- src/connectedk8s/azext_connectedk8s/_utils.py | 19 ++++++------------- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 620ee6434fb..d0e08a56fcc 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -171,6 +171,7 @@ Arc_Diagnostic_Logs = "arc_diagnostic_logs" Pre_Onboarding_Check_Logs = "pre_onboarding_check_logs" Pre_Onboarding_Helm_Charts_Folder_Name = 'PreOnboardingChecksCharts' +Pre_Onboarding_Helm_Charts_Release_Name = 'cluster-diagnostic-checks' Describe_Non_Ready_Arc_Agents = "describe_non_ready_arc_agents" Agent_State = "agent_state.txt" Arc_Agents_Events = "arc_agent_events.txt" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 725864fd055..50f2b9075a6 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -130,7 +130,7 @@ def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_ins telemetry.set_exception(exception=error_kubectl_delete_helm.decode("ascii"), fault_type=consts.Cluster_Diagnostic_Checks_Release_Cleanup_Failed, summary="Error while executing cluster diagnostic checks Job") return - chart_path = azext_utils.get_chart_path(consts.Cluster_Diagnostic_Checks_Job_Registry_Path, kube_config, kube_context, helm_client_location, consts.Pre_Onboarding_Helm_Charts_Folder_Name) + chart_path = azext_utils.get_chart_path(consts.Cluster_Diagnostic_Checks_Job_Registry_Path, kube_config, kube_context, helm_client_location, consts.Pre_Onboarding_Helm_Charts_Folder_Name, consts.Pre_Onboarding_Helm_Charts_Release_Name) helm_install_release_cluster_diagnostic_checks(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location) @@ -162,7 +162,7 @@ def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_ins elif (is_job_scheduled is True and is_job_complete is False): telemetry.set_exception(exception="Couldn't complete cluster diagnostic checks job after scheduling in the cluster", fault_type=consts.Cluster_Diagnostic_Checks_Job_Not_Complete, summary="Couldn't complete cluster diagnostic checks job after scheduling in the cluster") - logger.warning("Unable to finish the cluster diagnostic checks job in the kubernetes cluster. The possible reasons can be resource constraints on the cluster.\n") + logger.warning("Cluster diagnostics job didn't reach completed state in the kubernetes cluster. The possible reasons can be resource constraints on the cluster.\n") Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) return else: diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index a2c5068f3c9..16ca74f0c24 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -71,33 +71,26 @@ def validate_location(cmd, location): break -def get_chart_path(registry_path, kube_config, kube_context, helm_client_location, chart_path_name='AzureArcCharts'): +def get_chart_path(registry_path, kube_config, kube_context, helm_client_location, chart_folder_name='AzureArcCharts', chart_name='azure-arc'): # Pulling helm chart from registry os.environ['HELM_EXPERIMENTAL_OCI'] = '1' - if chart_path_name == consts.Pre_Onboarding_Helm_Charts_Folder_Name: - pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, chart_path_name) - else: - pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, 'azure-arc') + pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, chart_name) # Exporting helm chart after cleanup - chart_export_path = os.path.join(os.path.expanduser('~'), '.azure', chart_path_name) + chart_export_path = os.path.join(os.path.expanduser('~'), '.azure', chart_folder_name) try: if os.path.isdir(chart_export_path): shutil.rmtree(chart_export_path) except: logger.warning("Unable to cleanup the {} already present on the machine. In case of failure, please cleanup the directory '{}' and try again.".format(chart_path_name, chart_export_path)) - if chart_path_name == consts.Pre_Onboarding_Helm_Charts_Folder_Name: - export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, chart_path_name) - else: - export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, 'azure-arc') + export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, chart_name) # Returning helm chart path - if chart_path_name == consts.Pre_Onboarding_Helm_Charts_Folder_Name: - helm_chart_path = os.path.join(chart_export_path, 'cluster-diagnostic-checks') + helm_chart_path = os.path.join(chart_export_path, chart_name) + if chart_folder_name == consts.Pre_Onboarding_Helm_Charts_Folder_Name: chart_path = helm_chart_path else: - helm_chart_path = os.path.join(chart_export_path, 'azure-arc-k8sagents') chart_path = os.getenv('HELMCHART') if os.getenv('HELMCHART') else helm_chart_path return chart_path From 840c656d82c4a76d8ac0773866c5fce57577e5fb Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Wed, 25 Jan 2023 14:52:10 +0530 Subject: [PATCH 58/62] modified: src/connectedk8s/azext_connectedk8s/_utils.py --- src/connectedk8s/azext_connectedk8s/_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index 16ca74f0c24..6efe15b118c 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -71,7 +71,7 @@ def validate_location(cmd, location): break -def get_chart_path(registry_path, kube_config, kube_context, helm_client_location, chart_folder_name='AzureArcCharts', chart_name='azure-arc'): +def get_chart_path(registry_path, kube_config, kube_context, helm_client_location, chart_folder_name='AzureArcCharts', chart_name='azure-arc-k8sagents'): # Pulling helm chart from registry os.environ['HELM_EXPERIMENTAL_OCI'] = '1' pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, chart_name) @@ -82,7 +82,7 @@ def get_chart_path(registry_path, kube_config, kube_context, helm_client_locatio if os.path.isdir(chart_export_path): shutil.rmtree(chart_export_path) except: - logger.warning("Unable to cleanup the {} already present on the machine. In case of failure, please cleanup the directory '{}' and try again.".format(chart_path_name, chart_export_path)) + logger.warning("Unable to cleanup the {} already present on the machine. In case of failure, please cleanup the directory '{}' and try again.".format(chart_folder_name, chart_export_path)) export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, chart_name) @@ -96,7 +96,7 @@ def get_chart_path(registry_path, kube_config, kube_context, helm_client_locatio return chart_path -def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, chart_name='azure-arc'): +def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_location, chart_name='azure-arc-k8sagents'): cmd_helm_chart_pull = [helm_client_location, "chart", "pull", registry_path] if kube_config: cmd_helm_chart_pull.extend(["--kubeconfig", kube_config]) @@ -110,7 +110,7 @@ def pull_helm_chart(registry_path, kube_config, kube_context, helm_client_locati raise CLIInternalError("Unable to pull {} helm chart from the registry '{}': ".format(chart_name, registry_path) + error_helm_chart_pull.decode("ascii")) -def export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, chart_name='azure-arc'): +def export_helm_chart(registry_path, chart_export_path, kube_config, kube_context, helm_client_location, chart_name='azure-arc-k8sagents'): cmd_helm_chart_export = [helm_client_location, "chart", "export", registry_path, "--destination", chart_export_path] if kube_config: cmd_helm_chart_export.extend(["--kubeconfig", kube_config]) From 01fd351eaac1aa8622273749c60066581d77c885 Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Wed, 25 Jan 2023 15:48:39 +0530 Subject: [PATCH 59/62] modified: src/connectedk8s/azext_connectedk8s/_precheckutils.py --- src/connectedk8s/azext_connectedk8s/_precheckutils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 50f2b9075a6..110a21e9fc8 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -101,7 +101,7 @@ def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_ins if kube_config: cmd_helm_delete.extend(["--kubeconfig", kube_config]) if kube_context: - cmd_helm_delete.extend(["--context", kube_context]) + cmd_helm_delete.extend(["--kube-context", kube_context]) # To handle the user keyboard Interrupt try: From 871e0acb202f272648c268e3c54ef5251d571afa Mon Sep 17 00:00:00 2001 From: Siri Teja Reddy Kasireddy Date: Wed, 25 Jan 2023 18:40:30 +0530 Subject: [PATCH 60/62] add handling based on cloud --- src/connectedk8s/azext_connectedk8s/_precheckutils.py | 11 ++++++----- src/connectedk8s/azext_connectedk8s/custom.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 110a21e9fc8..a44a03fe836 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -47,14 +47,14 @@ diagnoser_output = [] -def fetch_diagnostic_checks_results(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, filepath_with_timestamp, storage_space_available): +def fetch_diagnostic_checks_results(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, azure_cloud, filepath_with_timestamp, storage_space_available): global diagnoser_output try: # Setting DNS and Outbound Check as working dns_check = "Starting" outbound_connectivity_check = "Starting" # Executing the cluster_diagnostic_checks job and fetching the logs obtained - cluster_diagnostic_checks_container_log = executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert) + cluster_diagnostic_checks_container_log = executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, azure_cloud) # If cluster_diagnostic_checks_container_log is not empty then only we will check for the results if(cluster_diagnostic_checks_container_log is not None and cluster_diagnostic_checks_container_log != ""): cluster_diagnostic_checks_container_log_list = cluster_diagnostic_checks_container_log.split("\n") @@ -92,7 +92,7 @@ def fetch_diagnostic_checks_results(corev1_api_instance, batchv1_api_instance, h return consts.Diagnostic_Check_Incomplete, storage_space_available -def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert): +def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, azure_cloud): job_name = "cluster-diagnostic-checks-job" # Setting the log output as Empty cluster_diagnostic_checks_container_log = "" @@ -132,7 +132,7 @@ def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_ins chart_path = azext_utils.get_chart_path(consts.Cluster_Diagnostic_Checks_Job_Registry_Path, kube_config, kube_context, helm_client_location, consts.Pre_Onboarding_Helm_Charts_Folder_Name, consts.Pre_Onboarding_Helm_Charts_Release_Name) - helm_install_release_cluster_diagnostic_checks(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location) + helm_install_release_cluster_diagnostic_checks(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, azure_cloud, kube_config, kube_context, helm_client_location) # Watching for cluster diagnostic checks container to reach in completed stage w = watch.Watch() @@ -188,10 +188,11 @@ def executing_cluster_diagnostic_checks_job(corev1_api_instance, batchv1_api_ins return cluster_diagnostic_checks_container_log -def helm_install_release_cluster_diagnostic_checks(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, kube_config, kube_context, helm_client_location, onboarding_timeout="60"): +def helm_install_release_cluster_diagnostic_checks(chart_path, location, http_proxy, https_proxy, no_proxy, proxy_cert, azure_cloud, kube_config, kube_context, helm_client_location, onboarding_timeout="60"): cmd_helm_install = [helm_client_location, "upgrade", "--install", "cluster-diagnostic-checks", chart_path, "--namespace", "{}".format(consts.Release_Install_Namespace), "--create-namespace", "--output", "json"] # To set some other helm parameters through file cmd_helm_install.extend(["--set", "global.location={}".format(location)]) + cmd_helm_install.extend(["--set", "global.azureCloud={}".format(azure_cloud)]) if https_proxy: cmd_helm_install.extend(["--set", "global.httpsProxy={}".format(https_proxy)]) if http_proxy: diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index ab30d506493..69fffe4387a 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -165,7 +165,7 @@ def create_connectedk8s(cmd, client, resource_group_name, cluster_name, correlat storage_space_available = False # Performing cluster-diagnostic-checks - diagnostic_checks, storage_space_available = precheckutils.fetch_diagnostic_checks_results(api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, filepath_with_timestamp, storage_space_available) + diagnostic_checks, storage_space_available = precheckutils.fetch_diagnostic_checks_results(api_instance, batchv1_api_instance, helm_client_location, kubectl_client_location, kube_config, kube_context, location, http_proxy, https_proxy, no_proxy, proxy_cert, azure_cloud, filepath_with_timestamp, storage_space_available) utils.fetching_cli_output_logs(filepath_with_timestamp, storage_space_available, 1, True) except Exception as e: From 33ee014f7cefb67a39dc22460253530f622fdf2d Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Mon, 30 Jan 2023 12:16:11 +0530 Subject: [PATCH 61/62] modified: src/connectedk8s/HISTORY.rst modified: src/connectedk8s/azext_connectedk8s/_constants.py modified: src/connectedk8s/setup.py --- src/connectedk8s/HISTORY.rst | 5 ----- src/connectedk8s/azext_connectedk8s/_constants.py | 2 +- src/connectedk8s/setup.py | 2 +- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/connectedk8s/HISTORY.rst b/src/connectedk8s/HISTORY.rst index fbec032acd8..a6063104fc1 100644 --- a/src/connectedk8s/HISTORY.rst +++ b/src/connectedk8s/HISTORY.rst @@ -2,11 +2,6 @@ Release History =============== -1.3.9 -++++++ - -* Added DNS and outbound connectivity prechecks in connect command - 1.3.8 ++++++ diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index d0e08a56fcc..03f327f262d 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -182,7 +182,7 @@ Outbound_Network_Connectivity_Check = "outbound_network_connectivity_check.txt" Events_of_Incomplete_Diagnoser_Job = "diagnoser_failure_events.txt" # Connect Precheck Diagnoser constants -Cluster_Diagnostic_Checks_Job_Registry_Path = "arck8sdiagnoser.azurecr.io/public/azurearck8s/helmchart/stable/clusterdiagnosticchecks:0.1.0" +Cluster_Diagnostic_Checks_Job_Registry_Path = "mcr.microsoft.com/azurearck8s/helmchart/stable/clusterdiagnosticchecks:0.1.0" Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type = "Error while installing cluster diagnostic checks helm release" Cluster_Diagnostic_Checks_Execution_Failed_Fault_Type = "Error occured while executing cluster diagnostic checks" Cluster_Diagnostic_Checks_Release_Cleanup_Failed = "Error occured while cleaning up the cluster diagnostic checks helm release" diff --git a/src/connectedk8s/setup.py b/src/connectedk8s/setup.py index ef5b845dcfe..d074e94dbb4 100644 --- a/src/connectedk8s/setup.py +++ b/src/connectedk8s/setup.py @@ -17,7 +17,7 @@ # TODO: Confirm this is the right version number you want and it matches your # HISTORY.rst entry. -VERSION = '1.3.9' +VERSION = '1.3.8' # The full list of classifiers is available at # https://pypi.python.org/pypi?%3Aaction=list_classifiers From f531f66f9b32c838be84b71a38357ad39ced996b Mon Sep 17 00:00:00 2001 From: rohan-dassani Date: Mon, 30 Jan 2023 13:57:37 +0530 Subject: [PATCH 62/62] modified: src/connectedk8s/HISTORY.rst modified: src/connectedk8s/setup.py --- src/connectedk8s/HISTORY.rst | 5 +++++ src/connectedk8s/setup.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/connectedk8s/HISTORY.rst b/src/connectedk8s/HISTORY.rst index a6063104fc1..fbec032acd8 100644 --- a/src/connectedk8s/HISTORY.rst +++ b/src/connectedk8s/HISTORY.rst @@ -2,6 +2,11 @@ Release History =============== +1.3.9 +++++++ + +* Added DNS and outbound connectivity prechecks in connect command + 1.3.8 ++++++ diff --git a/src/connectedk8s/setup.py b/src/connectedk8s/setup.py index d074e94dbb4..ef5b845dcfe 100644 --- a/src/connectedk8s/setup.py +++ b/src/connectedk8s/setup.py @@ -17,7 +17,7 @@ # TODO: Confirm this is the right version number you want and it matches your # HISTORY.rst entry. -VERSION = '1.3.8' +VERSION = '1.3.9' # The full list of classifiers is available at # https://pypi.python.org/pypi?%3Aaction=list_classifiers