From 8940640f64ea01808f098bf805d7ce0c11e57a44 Mon Sep 17 00:00:00 2001
From: Eric Fried
Date: Thu, 10 Aug 2023 08:58:46 -0500
Subject: [PATCH] e2e-pool: Improve cleanup artifact gathering

We've been seeing some weird timeouts in e2e-pool where the test itself
succeeds, but clusterclaim deletion hangs until the job times out and is
killed by the monitor.

Previously we had no good way to debug this because we had last captured
manifests & logs *before* issuing the clusterclaim deletion.

With this commit, we:
- Do that deletion (and that of the clusterpools) in the background.
- Re-capture logs and manifests periodically while we're waiting for
  cleanup to complete.
---
 hack/e2e-pool-test.sh | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/hack/e2e-pool-test.sh b/hack/e2e-pool-test.sh
index 0def9563582..0a91e8f55c3 100755
--- a/hack/e2e-pool-test.sh
+++ b/hack/e2e-pool-test.sh
@@ -96,25 +96,32 @@ REAL_POOL_NAME=$CLUSTER_NAME
 
 function cleanup() {
   echo "!EXIT TRAP!"
-  capture_manifests EXIT
-  # Let's save the logs now in case any of the following never finish
+  capture_manifests CLEANUP_000
+  # Let's save the logs now in case any of the following fail
   echo "Saving hive logs before cleanup"
   save_hive_logs
-  oc delete clusterclaim --all
-  oc delete clusterpool --all
+  # Run both deletions in the background so the loop below can keep re-capturing logs
+  oc delete clusterclaim --all &
+  oc delete clusterpool --all &
   # Wait indefinitely for all CDs to disappear. If we exceed the test timeout,
   # we'll get killed, and resources will leak.
+  i=0
   while true; do
     sleep ${sleep_between_tries}
+    i=$((i+1))
+    # Re-capture logs on every pass so we can debug if things aren't deleting properly
+    echo "Re-capturing hive logs during cleanup"
+    save_hive_logs
+    # Re-capture manifests, likewise, but not *too* often as these get added, not overwritten.
+    # Fires on every 10th pass (~100s if sleep_between_tries is 10 -- TODO confirm): CLEANUP_010, CLEANUP_020, ...
+    [[ $((i%10)) -eq 0 ]] && capture_manifests CLEANUP_$(printf "%03d" $i)
     NUM_CDS=$(count_cds)
     if [[ $NUM_CDS == "0" ]]; then
       break
     fi
     echo "Waiting for $NUM_CDS ClusterDeployment(s) to be cleaned up"
   done
-  # And if we get this far, overwrite the logs with the latest
-  echo "Saving hive logs after cleanup"
-  save_hive_logs
+  echo "Cleanup complete"
 }
 trap 'kill %1; cleanup' EXIT
 