From 8cbcebe1fd39944eaf04cffb8b1a4d5c30e640b8 Mon Sep 17 00:00:00 2001 From: willdavsmith Date: Wed, 8 Apr 2026 13:18:21 -0700 Subject: [PATCH 1/6] Fix LRT workflow: add resource type health check and defensive cleanup When the skip-delete-resources-list cache is unavailable (e.g. after cancelled runs), the cleanup script was deleting all resources.ucp.dev entries including system-critical ones. This caused UCP to lose resource type registrations, breaking every subsequent run. manage-radius-installation.sh: - Add verify_resource_types_available() that makes a live API call to detect missing resource types (instead of only checking stale pod logs) - When versions match but resource types are missing, automatically uninstall and reinstall Radius to re-register them - When versions match and types are healthy, still verify manifests and refresh the skip-delete-resources list cleanup-long-running-cluster.sh: - When no skip list is available, only delete scope.* entries (test resource groups) and preserve non-scope resource.* entries that may include system-critical UCP resources --- .../scripts/cleanup-long-running-cluster.sh | 33 ++++++++++--- .github/scripts/manage-radius-installation.sh | 48 ++++++++++++++++++- 2 files changed, 74 insertions(+), 7 deletions(-) diff --git a/.github/scripts/cleanup-long-running-cluster.sh b/.github/scripts/cleanup-long-running-cluster.sh index 9c2f8b5007..d01d502ebe 100755 --- a/.github/scripts/cleanup-long-running-cluster.sh +++ b/.github/scripts/cleanup-long-running-cluster.sh @@ -35,14 +35,35 @@ if kubectl get crd resources.ucp.dev >/dev/null 2>&1; then echo "delete all resources in resources.ucp.dev" resources=$(kubectl get resources.ucp.dev -n radius-system --no-headers -o custom-columns=":metadata.name") for r in $resources; do - # Skip resources if they're either scope.* or listed in skip resource file - if [[ $r == scope.local.* || $r == scope.aws.* || -z "$r" ]]; then + if [[ -z "$r" ]]; then + continue + fi + + # Always skip built-in plane scopes. + if [[ $r == scope.local.* || $r == scope.aws.* ]]; then echo "skip deletion: $r" - elif [ -n "$SKIP_RESOURCE_FILE" ] && [ -f "$SKIP_RESOURCE_FILE" ] && grep -q "$r" "$SKIP_RESOURCE_FILE"; then - echo "Skip deletion: $r (found in skip-resource-list $SKIP_RESOURCE_FILE)" - else - echo "deleting resource: $r" + continue + fi + + # If a skip-resource file is available, use it to protect system resources. + if [ -n "$SKIP_RESOURCE_FILE" ] && [ -f "$SKIP_RESOURCE_FILE" ]; then + if grep -q "$r" "$SKIP_RESOURCE_FILE"; then + echo "skip deletion: $r (found in skip-resource-list $SKIP_RESOURCE_FILE)" + else + echo "deleting resource: $r" + kubectl delete resources.ucp.dev "$r" -n radius-system --ignore-not-found=true --wait=false + fi + continue + fi + + # No skip-resource file: only delete scope entries (test resource groups). + # Non-scope resources (resource.*) may include system-critical entries + # that must not be deleted without a valid skip list. + if [[ $r == scope.* ]]; then + echo "deleting resource: $r (no skip list, scope entry)" kubectl delete resources.ucp.dev "$r" -n radius-system --ignore-not-found=true --wait=false + else + echo "skip deletion: $r (no skip list available, preserving non-scope resource)" fi done fi diff --git a/.github/scripts/manage-radius-installation.sh b/.github/scripts/manage-radius-installation.sh index 4b10ff6e40..863cb64571 100755 --- a/.github/scripts/manage-radius-installation.sh +++ b/.github/scripts/manage-radius-installation.sh @@ -100,6 +100,40 @@ verify_manifests_registered() { echo "Manifest verification complete." } +# Actively verify that resource types are registered and the Radius API is +# able to serve requests. Unlike verify_manifests_registered (which reads +# historical pod logs), this makes a live API call. +verify_resource_types_available() { + echo "" + echo "Verifying resource types are available..." + + # Set up a temporary workspace/group so we can issue a rad CLI command + # that exercises the Applications.Core resource-type path. + rad workspace create kubernetes --force >/dev/null 2>&1 || true + rad group create __healthcheck >/dev/null 2>&1 || true + + local output exit_code + output=$(rad env list --group __healthcheck 2>&1) && exit_code=0 || exit_code=$? + + # Clean up the temporary group + rad group delete __healthcheck >/dev/null 2>&1 || true + + if [[ ${exit_code} -eq 0 ]]; then + echo "Resource types are available." + return 0 + fi + + if echo "${output}" | grep -qi "resource type.*not found"; then + echo "ERROR: Resource types are NOT registered." + echo "API response: ${output}" + return 1 + fi + + # Other errors (network, auth, etc.) don't indicate a resource-type issue. + echo "Resource type check returned a non-resource-type error (continuing): ${output}" + return 0 +} + # Save the list of Radius UCP resources to skip-delete-resources-list.txt # This file is used by the cleanup job to avoid deleting Radius-managed resources. save_skip_resources_list() { @@ -196,7 +230,19 @@ main() { install_radius elif [[ "${cp_version}" == "${cli_version}" ]]; then echo "" - echo "Radius control plane version matches CLI version (${cli_version}). No action needed." + echo "Radius control plane version matches CLI version (${cli_version}). Skipping install/upgrade." + + if ! verify_resource_types_available; then + echo "" + echo "Resource types missing despite matching versions. Reinstalling Radius..." + if ! rad uninstall kubernetes --purge --yes; then + echo "Warning: Uninstall failed, continuing with install attempt..." + fi + install_radius + else + verify_manifests_registered + save_skip_resources_list + fi else echo "" echo "Version mismatch detected. Attempting upgrade from ${cp_version} to ${cli_version}..." From d1b4e2a24403821bec580eec78c8649ceb4bbac5 Mon Sep 17 00:00:00 2001 From: willdavsmith Date: Wed, 8 Apr 2026 14:11:17 -0700 Subject: [PATCH 2/6] Fix health check to use rad resource-provider list The previous health check used rad env list with a temporary group, but the group creation failed silently, causing the check to return a 'resource group not found' error instead of detecting the missing resource types. Use rad resource-provider list instead, which only needs a workspace and directly checks whether Applications.Core is registered. --- .github/scripts/manage-radius-installation.sh | 27 +++++++------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/.github/scripts/manage-radius-installation.sh b/.github/scripts/manage-radius-installation.sh index 863cb64571..a11ddeb1e2 100755 --- a/.github/scripts/manage-radius-installation.sh +++ b/.github/scripts/manage-radius-installation.sh @@ -107,31 +107,22 @@ verify_resource_types_available() { echo "" echo "Verifying resource types are available..." - # Set up a temporary workspace/group so we can issue a rad CLI command - # that exercises the Applications.Core resource-type path. + # Ensure a workspace exists so rad CLI can reach the cluster. rad workspace create kubernetes --force >/dev/null 2>&1 || true - rad group create __healthcheck >/dev/null 2>&1 || true + # List registered resource providers. Applications.Core must be present + # for environment/container operations to work. local output exit_code - output=$(rad env list --group __healthcheck 2>&1) && exit_code=0 || exit_code=$? + output=$(rad resource-provider list 2>&1) && exit_code=0 || exit_code=$? - # Clean up the temporary group - rad group delete __healthcheck >/dev/null 2>&1 || true - - if [[ ${exit_code} -eq 0 ]]; then - echo "Resource types are available." + if [[ ${exit_code} -eq 0 ]] && echo "${output}" | grep -q "Applications.Core"; then + echo "Resource types are available (Applications.Core provider found)." return 0 fi - if echo "${output}" | grep -qi "resource type.*not found"; then - echo "ERROR: Resource types are NOT registered." - echo "API response: ${output}" - return 1 - fi - - # Other errors (network, auth, etc.) don't indicate a resource-type issue. - echo "Resource type check returned a non-resource-type error (continuing): ${output}" - return 0 + echo "ERROR: Applications.Core resource provider is NOT registered." + echo "rad resource-provider list output: ${output}" + return 1 } # Save the list of Radius UCP resources to skip-delete-resources-list.txt From 220b24da1eaeb17971e4689d368ba4d4136a24f4 Mon Sep 17 00:00:00 2001 From: willdavsmith Date: Thu, 9 Apr 2026 10:49:56 -0700 Subject: [PATCH 3/6] Address PR review comments - Let rad workspace create fail visibly instead of suppressing errors - Remove redundant log-based verify_manifests_registered from the version-match path since verify_resource_types_available already does a live API check; log-based check is only used after install/upgrade - Use grep -F -x for exact fixed-string whole-line matching in the skip list lookup to avoid regex and substring false positives --- .github/scripts/cleanup-long-running-cluster.sh | 2 +- .github/scripts/manage-radius-installation.sh | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/scripts/cleanup-long-running-cluster.sh b/.github/scripts/cleanup-long-running-cluster.sh index d01d502ebe..b774227128 100755 --- a/.github/scripts/cleanup-long-running-cluster.sh +++ b/.github/scripts/cleanup-long-running-cluster.sh @@ -47,7 +47,7 @@ if kubectl get crd resources.ucp.dev >/dev/null 2>&1; then # If a skip-resource file is available, use it to protect system resources. if [ -n "$SKIP_RESOURCE_FILE" ] && [ -f "$SKIP_RESOURCE_FILE" ]; then - if grep -q "$r" "$SKIP_RESOURCE_FILE"; then + if grep -F -x -q -- "$r" "$SKIP_RESOURCE_FILE"; then echo "skip deletion: $r (found in skip-resource-list $SKIP_RESOURCE_FILE)" else echo "deleting resource: $r" diff --git a/.github/scripts/manage-radius-installation.sh b/.github/scripts/manage-radius-installation.sh index a11ddeb1e2..daf7485d2d 100755 --- a/.github/scripts/manage-radius-installation.sh +++ b/.github/scripts/manage-radius-installation.sh @@ -108,7 +108,7 @@ verify_resource_types_available() { echo "Verifying resource types are available..." # Ensure a workspace exists so rad CLI can reach the cluster. - rad workspace create kubernetes --force >/dev/null 2>&1 || true + rad workspace create kubernetes --force # List registered resource providers. Applications.Core must be present # for environment/container operations to work. @@ -231,7 +231,6 @@ main() { fi install_radius else - verify_manifests_registered save_skip_resources_list fi else From 9fe68a2bebfbe0deb283206bf8240b12ef5db33c Mon Sep 17 00:00:00 2001 From: willdavsmith Date: Thu, 9 Apr 2026 13:10:39 -0700 Subject: [PATCH 4/6] Use grep -Fq for fixed-string match on Applications.Core --- .github/scripts/manage-radius-installation.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/manage-radius-installation.sh b/.github/scripts/manage-radius-installation.sh index daf7485d2d..b3f1bcd166 100755 --- a/.github/scripts/manage-radius-installation.sh +++ b/.github/scripts/manage-radius-installation.sh @@ -115,7 +115,7 @@ verify_resource_types_available() { local output exit_code output=$(rad resource-provider list 2>&1) && exit_code=0 || exit_code=$? - if [[ ${exit_code} -eq 0 ]] && echo "${output}" | grep -q "Applications.Core"; then + if [[ ${exit_code} -eq 0 ]] && echo "${output}" | grep -Fq "Applications.Core"; then echo "Resource types are available (Applications.Core provider found)." return 0 fi From fb8e6f5b6a0355f8a8218fdb2e0ccf867d497854 Mon Sep 17 00:00:00 2001 From: willdavsmith Date: Thu, 9 Apr 2026 13:23:04 -0700 Subject: [PATCH 5/6] Address review: split error cases, add retry, fix log message - verify_resource_types_available now returns distinct codes: 0=healthy, 1=provider missing, 2=query failed - Only trigger reinstall when the provider is definitively missing (rc=1); for query failures (rc=2), retry once after 30s before failing - Update cleanup log message to reflect conditional behavior when no skip list is available --- .../scripts/cleanup-long-running-cluster.sh | 6 +++- .github/scripts/manage-radius-installation.sh | 32 +++++++++++++++++-- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/.github/scripts/cleanup-long-running-cluster.sh b/.github/scripts/cleanup-long-running-cluster.sh index b774227128..59331e312f 100755 --- a/.github/scripts/cleanup-long-running-cluster.sh +++ b/.github/scripts/cleanup-long-running-cluster.sh @@ -32,7 +32,11 @@ fi # Delete all test resources in resources without proxy resource. if kubectl get crd resources.ucp.dev >/dev/null 2>&1; then - echo "delete all resources in resources.ucp.dev" + if [[ -n "$SKIP_RESOURCE_FILE" && -f "$SKIP_RESOURCE_FILE" ]]; then + echo "delete resources in resources.ucp.dev except entries in skip-resource-list" + else + echo "no skip-resource-list available; delete only scope.* resources in resources.ucp.dev" + fi resources=$(kubectl get resources.ucp.dev -n radius-system --no-headers -o custom-columns=":metadata.name") for r in $resources; do if [[ -z "$r" ]]; then diff --git a/.github/scripts/manage-radius-installation.sh b/.github/scripts/manage-radius-installation.sh index b3f1bcd166..9e469d1480 100755 --- a/.github/scripts/manage-radius-installation.sh +++ b/.github/scripts/manage-radius-installation.sh @@ -103,6 +103,7 @@ verify_manifests_registered() { # Actively verify that resource types are registered and the Radius API is # able to serve requests. Unlike verify_manifests_registered (which reads # historical pod logs), this makes a live API call. +# Returns: 0 = healthy, 1 = provider missing, 2 = query failed verify_resource_types_available() { echo "" echo "Verifying resource types are available..." @@ -115,7 +116,13 @@ verify_resource_types_available() { local output exit_code output=$(rad resource-provider list 2>&1) && exit_code=0 || exit_code=$? - if [[ ${exit_code} -eq 0 ]] && echo "${output}" | grep -Fq "Applications.Core"; then + if [[ ${exit_code} -ne 0 ]]; then + echo "ERROR: Failed to query registered resource providers (exit code: ${exit_code})." + echo "rad resource-provider list output: ${output}" + return 2 + fi + + if echo "${output}" | grep -Fq "Applications.Core"; then echo "Resource types are available (Applications.Core provider found)." return 0 fi @@ -223,7 +230,23 @@ main() { echo "" echo "Radius control plane version matches CLI version (${cli_version}). Skipping install/upgrade." - if ! verify_resource_types_available; then + # Verify resource types with retry for transient failures. + local check_result=0 + verify_resource_types_available || check_result=$? + + if [[ ${check_result} -eq 2 ]]; then + # Query failed (connectivity/auth issue). Retry once after a brief wait + # before taking destructive action. + echo "" + echo "Resource type query failed. Retrying in 30 seconds..." + sleep 30 + check_result=0 + verify_resource_types_available || check_result=$? + fi + + if [[ ${check_result} -eq 0 ]]; then + save_skip_resources_list + elif [[ ${check_result} -eq 1 ]]; then echo "" echo "Resource types missing despite matching versions. Reinstalling Radius..." if ! rad uninstall kubernetes --purge --yes; then @@ -231,7 +254,10 @@ main() { fi install_radius else - save_skip_resources_list + echo "" + echo "ERROR: Unable to verify resource types after retry." + echo "This may indicate a connectivity or authentication issue." + exit 1 fi else echo "" From 55a3cf22b1cb4299361a3c9b544612ca910ec2f6 Mon Sep 17 00:00:00 2001 From: willdavsmith Date: Thu, 9 Apr 2026 13:28:04 -0700 Subject: [PATCH 6/6] Handle workspace creation failure in health check Capture rad workspace create failure explicitly and return rc=2 instead of letting set -euo pipefail exit the script, so the caller's retry logic can handle it. --- .github/scripts/manage-radius-installation.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/scripts/manage-radius-installation.sh b/.github/scripts/manage-radius-installation.sh index 9e469d1480..ef8183ad91 100755 --- a/.github/scripts/manage-radius-installation.sh +++ b/.github/scripts/manage-radius-installation.sh @@ -109,7 +109,15 @@ verify_resource_types_available() { echo "Verifying resource types are available..." # Ensure a workspace exists so rad CLI can reach the cluster. - rad workspace create kubernetes --force + local workspace_output workspace_exit_code + workspace_output=$(rad workspace create kubernetes --force 2>&1) && + workspace_exit_code=0 || workspace_exit_code=$? + + if [[ ${workspace_exit_code} -ne 0 ]]; then + echo "ERROR: Failed to create Radius Kubernetes workspace (exit code: ${workspace_exit_code})." + echo "rad workspace create output: ${workspace_output}" + return 2 + fi # List registered resource providers. Applications.Core must be present # for environment/container operations to work.