From 74426a81709a7607af85311da811e2d72532dac2 Mon Sep 17 00:00:00 2001 From: Gregory Giguashvili Date: Thu, 30 Nov 2023 12:00:24 +0200 Subject: [PATCH 1/2] Fail-fast greenboot errors in scenario and work around topolvm issue --- test/bin/scenario.sh | 42 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/test/bin/scenario.sh b/test/bin/scenario.sh index 89db47dd80..f2bf1aac71 100755 --- a/test/bin/scenario.sh +++ b/test/bin/scenario.sh @@ -168,7 +168,7 @@ prepare_kickstart() { local full_vmname local output_file - local vm_hostname + local vm_hostname local fips_command="" full_vmname="$(full_vm_name "${vmname}")" @@ -254,22 +254,54 @@ wait_for_greenboot() { if "${SKIP_GREENBOOT}"; then echo "Skipping greenboot check" + record_junit "${vmname}" "greenboot-check" "SKIPPED" return 0 fi echo "Waiting ${VM_BOOT_TIMEOUT} for greenboot on ${vmname} to complete" local -r start_time=$(date +%s) + local -r ssh_cmd="ssh -oConnectTimeout=10 -oBatchMode=yes -oStrictHostKeyChecking=accept-new redhat@${ip}" + local -r kube_opt="--kubeconfig /var/lib/microshift/resources/kubeadmin/kubeconfig" + local retry_count=2 while [ $(( $(date +%s) - start_time )) -lt "${VM_BOOT_TIMEOUT}" ] ; do - if ssh -oConnectTimeout=10 -oBatchMode=yes -oStrictHostKeyChecking=accept-new "redhat@${ip}" \ - "sudo journalctl -n 5 -u greenboot-healthcheck; \ - systemctl show --property=SubState --value greenboot-healthcheck | grep -w exited" ; then + local svc_state + + svc_state="$(${ssh_cmd} systemctl show --property=SubState --value greenboot-healthcheck || true)" + if [ "${svc_state}" = "exited" ] ; then + record_junit "${vmname}" "greenboot-check" "OK" return 0 fi + + # Print the last log and check for terminal failure + ${ssh_cmd} "sudo journalctl -n 10 -u greenboot-healthcheck" || true + if [ "${svc_state}" = "failed" ] ; then + # FIXME: See OCPBUGS-24222 + # Workaround for TopoLVM images getting stuck + # Delete the TopoLVM pods and retry greenboot check + # Remove this code when the problem is addressed + if [ ${retry_count} -gt 0 ] ; then + echo "Deleting TopoLVM pods and retrying the greenboot checks (${retry_count} attempts remaining)" + (( retry_count-- )) + + if ! ${ssh_cmd} "sudo oc ${kube_opt} delete pods -n openshift-storage --all" ; then + echo "WARNING: TopoLVM pod deletion returned an error" + fi + if ! ${ssh_cmd} "sudo systemctl restart --no-block greenboot-healthcheck.service" ; then + echo "WARNING: Greenboot service restart returned an error" + fi + else + echo "The greenboot service reported a failed state, no need to wait any longer" + break + fi + fi + date sleep 10 done - # Return an error if non of the ssh attempts succeeded + + # Return an error if none of the ssh attempts succeeded + record_junit "${vmname}" "greenboot-check" "FAILED" return 1 } From 48a64ff41a3dfebdb78848dbe22b958dc06edebd Mon Sep 17 00:00:00 2001 From: Gregory Giguashvili Date: Thu, 30 Nov 2023 15:56:00 +0200 Subject: [PATCH 2/2] Add timeout when deleting openshift-storage pods --- test/bin/scenario.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/bin/scenario.sh b/test/bin/scenario.sh index f2bf1aac71..ec2f833104 100755 --- a/test/bin/scenario.sh +++ b/test/bin/scenario.sh @@ -284,7 +284,7 @@ wait_for_greenboot() { echo "Deleting TopoLVM pods and retrying the greenboot checks (${retry_count} attempts remaining)" (( retry_count-- )) - if ! ${ssh_cmd} "sudo oc ${kube_opt} delete pods -n openshift-storage --all" ; then + if ! ${ssh_cmd} "sudo oc ${kube_opt} delete pods -n openshift-storage --all --grace-period=1 --timeout=15s" ; then echo "WARNING: TopoLVM pod deletion returned an error" fi if ! ${ssh_cmd} "sudo systemctl restart --no-block greenboot-healthcheck.service" ; then