From 4a47462cdaf89a1971e091f8f7a22433d43e1140 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Tue, 21 Apr 2026 17:25:19 +0200 Subject: [PATCH 01/15] feat: smoke tests for capms sonic and dell_sonic flavors --- .github/workflows/integration.yaml | 2 + capi-lab/Makefile | 60 ++++++++++++++++-------------- capi-lab/compose.kamaji.yaml | 18 --------- capi-lab/compose.yaml | 18 ++++++++- capi-lab/mini-lab | 2 +- capi-lab/test/integration.sh | 41 +++++++++++++++++++- 6 files changed, 92 insertions(+), 49 deletions(-) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 07b987a..90ca32a 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -17,6 +17,8 @@ jobs: strategy: matrix: flavors: + - name: capms_dell_sonic + - name: capms_sonic - name: kamaji steps: diff --git a/capi-lab/Makefile b/capi-lab/Makefile index 6ff0985..ab5b128 100644 --- a/capi-lab/Makefile +++ b/capi-lab/Makefile @@ -5,7 +5,7 @@ ANSIBLE_EXTRA_VARS_FILE=$(shell pwd)/mini-lab-overrides/extra-vars.yaml KIND_EXPERIMENTAL_DOCKER_NETWORK=mini_lab_ext KUBECONFIG := $(shell pwd)/mini-lab/.kubeconfig -MINI_LAB_FLAVOR := $(or $(MINI_LAB_FLAVOR),capms) +MINI_LAB_FLAVOR := $(or $(MINI_LAB_FLAVOR),capms_sonic) CLUSTER_NAME ?= metal-test KUBERNETES_VERSION ?= 1.33.5 @@ -35,7 +35,9 @@ SUBMODULE_SHA=$(shell git -C mini-lab rev-parse --short=8 HEAD) MINI_LAB_VM_IMAGE := $(or $(MINI_LAB_VM_IMAGE),ghcr.io/metal-stack/mini-lab-vms:$(SUBMODULE_SHA)) MINI_LAB_SONIC_IMAGE := $(or $(MINI_LAB_SONIC_IMAGE),ghcr.io/metal-stack/mini-lab-sonic:$(SUBMODULE_SHA)) -ifeq ($(MINI_LAB_FLAVOR),capms) +ifeq ($(MINI_LAB_FLAVOR),capms_sonic) +DEPLOY_TARGET=deploy-kubeadm +else ifeq ($(MINI_LAB_FLAVOR),capms_dell_sonic) DEPLOY_TARGET=deploy-kubeadm else ifeq ($(MINI_LAB_FLAVOR),kamaji) DEPLOY_TARGET=deploy-kamaji @@ -101,24 +103,26 @@ control-plane-ip: apply-sample-cluster: $(eval CONTROL_PLANE_IP = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network ip list --name "$(CLUSTER_NAME)-vip" -o template --template '{{ .ipaddress }}')) echo $(CLUSTER_NAME) - clusterctl generate cluster $(CLUSTER_NAME) \ - --kubeconfig=$(KUBECONFIG) \ - --worker-machine-count 1 \ - --control-plane-machine-count 1 \ - --kubernetes-version $(KUBERNETES_VERSION) \ - --from ../config/clusterctl-templates/cluster-template-calico-lab.yaml \ - | kubectl --kubeconfig=$(KUBECONFIG) apply -f - + docker compose -f compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) \ + clusterctl generate cluster $(CLUSTER_NAME) \ + --kubeconfig=$(KUBECONFIG) \ + --worker-machine-count 1 \ + --control-plane-machine-count 1 \ + --kubernetes-version $(KUBERNETES_VERSION) \ + --from ../config/clusterctl-templates/cluster-template-calico-lab.yaml \ + | kubectl --kubeconfig=$(KUBECONFIG) apply -f - .PHONY: delete-sample-cluster delete-sample-cluster: $(eval CONTROL_PLANE_IP = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network ip list --name "$(CLUSTER_NAME)-vip" -o template --template '{{ .ipaddress }}')) - clusterctl generate cluster $(CLUSTER_NAME) \ - --kubeconfig=$(KUBECONFIG) \ - --worker-machine-count 1 \ - --control-plane-machine-count 1 \ - --kubernetes-version $(KUBERNETES_VERSION) \ - --from ../config/clusterctl-templates/cluster-template-calico-lab.yaml \ - | kubectl --kubeconfig=$(KUBECONFIG) delete -f - + docker compose -f compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) \ + clusterctl generate cluster $(CLUSTER_NAME) \ + --kubeconfig=$(KUBECONFIG) \ + --worker-machine-count 1 \ + --control-plane-machine-count 1 \ + --kubernetes-version $(KUBERNETES_VERSION) \ + --from ../config/clusterctl-templates/cluster-template-calico-lab.yaml \ + | kubectl --kubeconfig=$(KUBECONFIG) delete -f - .PHONY: mtu-fix mtu-fix: @@ -133,22 +137,22 @@ create-kamaji-tenant: # let MetalLB assign the IP to the tenant cluster control plane service envsubst < kamaji/metallb-tenant-pool.yaml | kubectl --kubeconfig=$(KUBECONFIG) apply -f - docker compose -f compose.yaml -f compose.kamaji.yaml run $(DOCKER_COMPOSE_RUN_ARG) \ - clusterctl generate cluster $(CLUSTER_NAME) \ - --target-namespace $(TENANT_NAMESPACE) \ - --worker-machine-count 1 \ - --kubernetes-version $(KUBERNETES_VERSION) \ - --from /templates/cluster-template-kamaji-tenant.yaml \ - | kubectl --kubeconfig=$(KUBECONFIG) apply -f - + clusterctl generate cluster $(CLUSTER_NAME) \ + --target-namespace $(TENANT_NAMESPACE) \ + --worker-machine-count 1 \ + --kubernetes-version $(KUBERNETES_VERSION) \ + --from /templates/cluster-template-kamaji-tenant.yaml \ + | kubectl --kubeconfig=$(KUBECONFIG) apply -f - .PHONY: delete-kamaji-tenant delete-kamaji-tenant: docker compose -f compose.yaml -f compose.kamaji.yaml run $(DOCKER_COMPOSE_RUN_ARG) \ - clusterctl generate cluster $(CLUSTER_NAME) \ - --target-namespace $(TENANT_NAMESPACE) \ - --worker-machine-count 1 \ - --kubernetes-version $(KUBERNETES_VERSION) \ - --from /templates/cluster-template-kamaji-tenant.yaml \ - | kubectl --kubeconfig=$(KUBECONFIG) delete -f - + clusterctl generate cluster $(CLUSTER_NAME) \ + --target-namespace $(TENANT_NAMESPACE) \ + --worker-machine-count 1 \ + --kubernetes-version $(KUBERNETES_VERSION) \ + --from /templates/cluster-template-kamaji-tenant.yaml \ + | kubectl --kubeconfig=$(KUBECONFIG) delete -f - .PHONY: kamaji-tenant-kubeconfig kamaji-tenant-kubeconfig: diff --git a/capi-lab/compose.kamaji.yaml b/capi-lab/compose.kamaji.yaml index a66ec34..a1aa589 100644 --- a/capi-lab/compose.kamaji.yaml +++ b/capi-lab/compose.kamaji.yaml @@ -3,25 +3,7 @@ services: clusterctl: image: registry.k8s.io/cluster-api/clusterctl:v1.12.3 environment: - - EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION=true - - METAL_API_HMAC=${METAL_API_HMAC} - - METAL_API_URL=${METAL_API_URL} - - METAL_API_HMAC_AUTH_TYPE=${METAL_API_HMAC_AUTH_TYPE} - - CLUSTER_NAME=${CLUSTER_NAME} - TENANT_NAMESPACE=${TENANT_NAMESPACE} - - KUBERNETES_VERSION=${KUBERNETES_VERSION} - - CONTROL_PLANE_IP=${CONTROL_PLANE_IP} - PODS_CIDR=${PODS_CIDR} - SERVICES_CIDR=${SERVICES_CIDR} - - METAL_PARTITION=${METAL_PARTITION} - - METAL_PROJECT_ID=${METAL_PROJECT_ID} - - CONTROL_PLANE_MACHINE_IMAGE=${CONTROL_PLANE_MACHINE_IMAGE} - - CONTROL_PLANE_MACHINE_SIZE=${CONTROL_PLANE_MACHINE_SIZE} - - WORKER_MACHINE_IMAGE=${WORKER_MACHINE_IMAGE} - - WORKER_MACHINE_SIZE=${WORKER_MACHINE_SIZE} - - FIREWALL_MACHINE_IMAGE=${FIREWALL_MACHINE_IMAGE} - - FIREWALL_MACHINE_SIZE=${FIREWALL_MACHINE_SIZE} - METAL_NODE_NETWORK_ID=${METAL_NODE_NETWORK_ID} - - FIREWALL_EXTERNAL_NETWORKS=${FIREWALL_EXTERNAL_NETWORKS} - volumes: - - ../config/clusterctl-templates:/templates:ro diff --git a/capi-lab/compose.yaml b/capi-lab/compose.yaml index f680835..104fc52 100644 --- a/capi-lab/compose.yaml +++ b/capi-lab/compose.yaml @@ -5,8 +5,24 @@ services: environment: - EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION=true - KUBECONFIG=/kubeconfig - image: registry.k8s.io/cluster-api/clusterctl:v1.9.5 + - METAL_API_HMAC=${METAL_API_HMAC} + - METAL_API_URL=${METAL_API_URL} + - METAL_API_HMAC_AUTH_TYPE=${METAL_API_HMAC_AUTH_TYPE} + - CLUSTER_NAME=${CLUSTER_NAME} + - KUBERNETES_VERSION=${KUBERNETES_VERSION} + - CONTROL_PLANE_IP=${CONTROL_PLANE_IP} + - METAL_PARTITION=${METAL_PARTITION} + - METAL_PROJECT_ID=${METAL_PROJECT_ID} + - CONTROL_PLANE_MACHINE_IMAGE=${CONTROL_PLANE_MACHINE_IMAGE} + - CONTROL_PLANE_MACHINE_SIZE=${CONTROL_PLANE_MACHINE_SIZE} + - WORKER_MACHINE_IMAGE=${WORKER_MACHINE_IMAGE} + - WORKER_MACHINE_SIZE=${WORKER_MACHINE_SIZE} + - FIREWALL_MACHINE_IMAGE=${FIREWALL_MACHINE_IMAGE} + - FIREWALL_MACHINE_SIZE=${FIREWALL_MACHINE_SIZE} + - FIREWALL_EXTERNAL_NETWORKS=${FIREWALL_EXTERNAL_NETWORKS} + image: registry.k8s.io/cluster-api/clusterctl:v1.11.4 network_mode: host user: root volumes: - ${KUBECONFIG}:/kubeconfig:ro + - ../config/clusterctl-templates:/templates:ro diff --git a/capi-lab/mini-lab b/capi-lab/mini-lab index f7d4a9b..1664eff 160000 --- a/capi-lab/mini-lab +++ b/capi-lab/mini-lab @@ -1 +1 @@ -Subproject commit f7d4a9b4aec09c454fe638d49b9bc8493d6385b5 +Subproject commit 1664eff4656afcf5cf2e0bae74e5a2a5f7939f14 diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index f6938d4..3623728 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -24,9 +24,48 @@ echo "$waiting/$minWaiting machines are waiting" make push-to-capi-lab +if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_sonic" ]; then + + if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ]; then + 2>&1 echo "Starting capms dell sonic flavor tests" + else + 2>&1 echo "Starting capms sonic flavor tests" + fi + + echo "Creating control plane IP" + make -C capi-lab control-plane-ip + + echo "Applying sample cluster" + make -C capi-lab apply-sample-cluster + + echo "Waiting for control-plane to get to Phoned Home state" + phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) + minPhoned=2 + declare -i attempts=0 + until [ "$phoned" -ge $minPhoned ] + do + if [ "$attempts" -ge 120 ]; then + echo "not enough machines phoned home - timeout reached" + exit 1 + fi + echo "$phoned/$minPhoned machines have phoned home" + sleep 5 + phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) + attempts+=1 + done + echo "$phoned/$minPhoned machines have phoned home" + + echo "Applying mtu fix" + make -C capi-lab mtu-fix + + # TODO further checks + +fi + + if [ "$MINI_LAB_FLAVOR" = "kamaji" ]; then - echo "Starting kamaji tests" + echo "Starting kamaji flavor tests" echo "Creating control plane IP" export CLUSTER_NAME=kamaji-tenant-test From d0d5448fe85e2f69236b9f63a68712ce3e59bd81 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Tue, 21 Apr 2026 22:41:33 +0200 Subject: [PATCH 02/15] fix: remove redundant kubeconfig for clusterctl sample cluster --- capi-lab/Makefile | 6 ++---- capi-lab/test/integration.sh | 6 +++--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/capi-lab/Makefile b/capi-lab/Makefile index ab5b128..35cfef2 100644 --- a/capi-lab/Makefile +++ b/capi-lab/Makefile @@ -105,11 +105,10 @@ apply-sample-cluster: echo $(CLUSTER_NAME) docker compose -f compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) \ clusterctl generate cluster $(CLUSTER_NAME) \ - --kubeconfig=$(KUBECONFIG) \ --worker-machine-count 1 \ --control-plane-machine-count 1 \ --kubernetes-version $(KUBERNETES_VERSION) \ - --from ../config/clusterctl-templates/cluster-template-calico-lab.yaml \ + --from /templates/cluster-template-calico-lab.yaml \ | kubectl --kubeconfig=$(KUBECONFIG) apply -f - .PHONY: delete-sample-cluster @@ -117,11 +116,10 @@ delete-sample-cluster: $(eval CONTROL_PLANE_IP = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network ip list --name "$(CLUSTER_NAME)-vip" -o template --template '{{ .ipaddress }}')) docker compose -f compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) \ clusterctl generate cluster $(CLUSTER_NAME) \ - --kubeconfig=$(KUBECONFIG) \ --worker-machine-count 1 \ --control-plane-machine-count 1 \ --kubernetes-version $(KUBERNETES_VERSION) \ - --from ../config/clusterctl-templates/cluster-template-calico-lab.yaml \ + --from /templates/cluster-template-calico-lab.yaml \ | kubectl --kubeconfig=$(KUBECONFIG) delete -f - .PHONY: mtu-fix diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index 3623728..a6612f0 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -18,7 +18,7 @@ do echo "$waiting/$minWaiting machines are waiting" sleep 5 waiting=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Waiting | wc -l) - attempts=$attempts+1 + attempts+=1 done echo "$waiting/$minWaiting machines are waiting" @@ -27,9 +27,9 @@ make push-to-capi-lab if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_sonic" ]; then if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ]; then - 2>&1 echo "Starting capms dell sonic flavor tests" + echo "Starting capms dell sonic flavor tests" else - 2>&1 echo "Starting capms sonic flavor tests" + echo "Starting capms sonic flavor tests" fi echo "Creating control plane IP" From dc7c9898e1af926fb0462469d455037268ccf209 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Wed, 22 Apr 2026 00:05:28 +0200 Subject: [PATCH 03/15] chore: do not continue lab tests on error and do not fail fast --- .github/workflows/integration.yaml | 5 +++-- capi-lab/mini-lab | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 90ca32a..ea76b05 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -12,9 +12,10 @@ jobs: test: name: Run tests runs-on: ubuntu-latest - - continue-on-error: true + # TODO should we run on self-hosted? + strategy: + fail-fast: false matrix: flavors: - name: capms_dell_sonic diff --git a/capi-lab/mini-lab b/capi-lab/mini-lab index 1664eff..5a5bbeb 160000 --- a/capi-lab/mini-lab +++ b/capi-lab/mini-lab @@ -1 +1 @@ -Subproject commit 1664eff4656afcf5cf2e0bae74e5a2a5f7939f14 +Subproject commit 5a5bbeb08d74efbd105aed0b7b4a4eb5dab82418 From afbfaf0d330c2283179139fa759f4b64be1aec67 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Wed, 22 Apr 2026 00:14:15 +0200 Subject: [PATCH 04/15] chore: add timeout for lab integration tests --- .github/workflows/integration.yaml | 3 ++- capi-lab/test/integration.sh | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index ea76b05..a0b2537 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -13,7 +13,7 @@ jobs: name: Run tests runs-on: ubuntu-latest # TODO should we run on self-hosted? - + strategy: fail-fast: false matrix: @@ -49,6 +49,7 @@ jobs: - name: Run integration tests shell: bash + timeout-minutes: 45 run: | eval $(make -C capi-lab --silent dev-env) ./capi-lab/test/ci-cleanup.sh diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index a6612f0..d153691 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -11,7 +11,7 @@ minWaiting=2 declare -i attempts=0 until [ "$waiting" -ge $minWaiting ] do - if [ "$attempts" -ge 60 ]; then + if [ "$attempts" -ge 180 ]; then echo "not enough machines in waiting state - timeout reached" exit 1 fi @@ -44,7 +44,7 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ declare -i attempts=0 until [ "$phoned" -ge $minPhoned ] do - if [ "$attempts" -ge 120 ]; then + if [ "$attempts" -ge 180 ]; then echo "not enough machines phoned home - timeout reached" exit 1 fi @@ -81,7 +81,7 @@ if [ "$MINI_LAB_FLAVOR" = "kamaji" ]; then declare -i attempts=0 until [ "$phoned" -ge $minPhoned ] do - if [ "$attempts" -ge 120 ]; then + if [ "$attempts" -ge 180 ]; then echo "not enough machines phoned home - timeout reached" exit 1 fi From 1815d836c1faaa014f73c5e2ea4adb1aded91a6e Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Wed, 22 Apr 2026 00:46:05 +0200 Subject: [PATCH 05/15] test(lab): wait for capms nodes to become ready --- capi-lab/Makefile | 12 ++++++++++- capi-lab/test/integration.sh | 39 +++++++++++++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/capi-lab/Makefile b/capi-lab/Makefile index 35cfef2..d9fd0eb 100644 --- a/capi-lab/Makefile +++ b/capi-lab/Makefile @@ -127,6 +127,16 @@ mtu-fix: cd mini-lab && ssh -F files/ssh/config leaf01 'ip link set dev vtep-1001 mtu 9100 && echo done' cd mini-lab && ssh -F files/ssh/config leaf02 'ip link set dev vtep-1001 mtu 9100 && echo done' +.PHONY: sample-cluster-kubeconfig +sample-cluster-kubeconfig: + kubectl --kubeconfig=$(KUBECONFIG) get secret $(CLUSTER_NAME)-kubeconfig -o jsonpath='{.data.value}' | base64 -d > ../$(CLUSTER_NAME).kubeconfig + @echo "Sample cluster kubeconfig written to $(CLUSTER_NAME).kubeconfig" + +.PHONY: sample-cluster-deploy-metal-ccm +sample-cluster-deploy-metal-ccm: + $(eval METAL_NODE_NETWORK_ID = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network list --project $(METAL_PROJECT_ID) -o template --template '{{ (index . 0).id }}')) + kubectl kustomize ../config/target-cluster/overlays/kubeadm | envsubst | kubectl --kubeconfig=../$(CLUSTER_NAME).kubeconfig apply -f - + .PHONY: create-kamaji-tenant create-kamaji-tenant: $(eval CONTROL_PLANE_IP = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network ip list --name "$(CLUSTER_NAME)-vip" -o template --template '{{ .ipaddress }}')) @@ -164,4 +174,4 @@ kamaji-tenant-deploy-calico: .PHONY: kamaji-tenant-deploy-metal-ccm kamaji-tenant-deploy-metal-ccm: - kustomize build ../config/target-cluster/overlays/kamaji | envsubst | kubectl --kubeconfig=../$(CLUSTER_NAME).kubeconfig apply -f - + kubectl kustomize ../config/target-cluster/overlays/kamaji | envsubst | kubectl --kubeconfig=../$(CLUSTER_NAME).kubeconfig apply -f - diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index d153691..cda9d54 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -32,6 +32,8 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ echo "Starting capms sonic flavor tests" fi + export CLUSTER_NAME=metal-test + echo "Creating control plane IP" make -C capi-lab control-plane-ip @@ -58,7 +60,42 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ echo "Applying mtu fix" make -C capi-lab mtu-fix - # TODO further checks + echo "Waiting for cluster to be provisioned" + declare -i attempts=0 + until kubectl --kubeconfig ${KUBECONFIG} get cluster ${CLUSTER_NAME} -o jsonpath='{.status.phase}' 2>/dev/null | grep -q "Provisioned" + do + if [ "$attempts" -ge 180 ]; then + echo "cluster was not provisioned - timeout reached" + kubectl --kubeconfig ${KUBECONFIG} get cluster ${CLUSTER_NAME} -o yaml || true + exit 1 + fi + echo "cluster ${CLUSTER_NAME} is not yet provisioned" + sleep 5 + attempts+=1 + done + echo "Cluster ${CLUSTER_NAME} is provisioned" + + echo "Generating kubeconfig for sample cluster" + make -C capi-lab sample-cluster-kubeconfig + + echo "Deploying metal-ccm to sample cluster" + make -C capi-lab sample-cluster-deploy-metal-ccm + + echo "Waiting for nodes to become Ready" + declare -i attempts=0 + until kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes --no-headers 2>/dev/null | awk '{ print $2 }' | grep -q "^Ready$" + do + if [ "$attempts" -ge 180 ]; then + echo "no nodes became Ready - timeout reached" + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes || true + exit 1 + fi + echo "no nodes are Ready yet" + sleep 5 + attempts+=1 + done + echo "At least one node is Ready" + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes fi From 184cce61e912fe460602ad7b00f42b33aa823fee Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Thu, 23 Apr 2026 11:19:02 +0200 Subject: [PATCH 06/15] test(lab): wait for tenant API server to be reachable --- capi-lab/Makefile | 4 ++-- capi-lab/test/integration.sh | 32 +++++++++++++++++++++++++------- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/capi-lab/Makefile b/capi-lab/Makefile index d9fd0eb..b5bcc52 100644 --- a/capi-lab/Makefile +++ b/capi-lab/Makefile @@ -134,13 +134,13 @@ sample-cluster-kubeconfig: .PHONY: sample-cluster-deploy-metal-ccm sample-cluster-deploy-metal-ccm: - $(eval METAL_NODE_NETWORK_ID = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network list --project $(METAL_PROJECT_ID) -o template --template '{{ (index . 0).id }}')) + $(eval METAL_NODE_NETWORK_ID = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network list --project $(METAL_PROJECT_ID) -o json | jq -r '.[0].id')) kubectl kustomize ../config/target-cluster/overlays/kubeadm | envsubst | kubectl --kubeconfig=../$(CLUSTER_NAME).kubeconfig apply -f - .PHONY: create-kamaji-tenant create-kamaji-tenant: $(eval CONTROL_PLANE_IP = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network ip list --name "$(CLUSTER_NAME)-vip" -o template --template '{{ .ipaddress }}')) - $(eval METAL_NODE_NETWORK_ID = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network list --project $(METAL_PROJECT_ID) -o template --template '{{ (index . 0).id }}')) + $(eval METAL_NODE_NETWORK_ID = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network list --project $(METAL_PROJECT_ID) -o json | jq -r '.[0].id')) kubectl --kubeconfig=$(KUBECONFIG) create namespace $(TENANT_NAMESPACE) --dry-run=client -o yaml | kubectl --kubeconfig=$(KUBECONFIG) apply -f - # let MetalLB assign the IP to the tenant cluster control plane service envsubst < kamaji/metallb-tenant-pool.yaml | kubectl --kubeconfig=$(KUBECONFIG) apply -f - diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index cda9d54..b1cdc49 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -40,9 +40,9 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ echo "Applying sample cluster" make -C capi-lab apply-sample-cluster - echo "Waiting for control-plane to get to Phoned Home state" + echo "Waiting for firewall, control-plane and worker to get to Phoned Home state" phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) - minPhoned=2 + minPhoned=3 declare -i attempts=0 until [ "$phoned" -ge $minPhoned ] do @@ -78,23 +78,41 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ echo "Generating kubeconfig for sample cluster" make -C capi-lab sample-cluster-kubeconfig + echo "Waiting for tenant API server to be reachable" + declare -i attempts=0 + until kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig version >/dev/null 2>&1 + do + if [ "$attempts" -ge 180 ]; then + echo "tenant API server not reachable - timeout reached" + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig version || true + exit 1 + fi + echo "tenant API server not reachable yet" + sleep 5 + attempts+=1 + done + echo "Tenant API server is reachable" + echo "Deploying metal-ccm to sample cluster" make -C capi-lab sample-cluster-deploy-metal-ccm - echo "Waiting for nodes to become Ready" + echo "Waiting for control-plane and worker node to become Ready" + minReady=2 + ready=0 declare -i attempts=0 - until kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes --no-headers 2>/dev/null | awk '{ print $2 }' | grep -q "^Ready$" + until [ "$ready" -ge $minReady ] do if [ "$attempts" -ge 180 ]; then - echo "no nodes became Ready - timeout reached" + echo "not enough nodes became Ready - timeout reached" kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes || true exit 1 fi - echo "no nodes are Ready yet" + echo "$ready/$minReady nodes are Ready" sleep 5 + ready=$(kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes --no-headers 2>/dev/null | awk '{ print $2 }' | grep -c "^Ready$" || true) attempts+=1 done - echo "At least one node is Ready" + echo "$ready/$minReady nodes are Ready" kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes fi From a3a0c74e1c1551dbc31871acb96873d2d18b4128 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Thu, 23 Apr 2026 11:42:25 +0200 Subject: [PATCH 07/15] chore: update submodule --- capi-lab/mini-lab | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capi-lab/mini-lab b/capi-lab/mini-lab index 5a5bbeb..6789fb6 160000 --- a/capi-lab/mini-lab +++ b/capi-lab/mini-lab @@ -1 +1 @@ -Subproject commit 5a5bbeb08d74efbd105aed0b7b4a4eb5dab82418 +Subproject commit 6789fb6bb5d2c10ccf08759bd58a05a6ee127852 From e71f943383868d41ff3df0e7690792f70a4988be Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Thu, 23 Apr 2026 12:54:01 +0200 Subject: [PATCH 08/15] fix: give capms machines more time to phone home --- capi-lab/test/integration.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index b1cdc49..556f219 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -46,7 +46,7 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ declare -i attempts=0 until [ "$phoned" -ge $minPhoned ] do - if [ "$attempts" -ge 180 ]; then + if [ "$attempts" -ge 240 ]; then echo "not enough machines phoned home - timeout reached" exit 1 fi From 406d6291f79aadc5e0d4db9e57619430284448d2 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Fri, 24 Apr 2026 23:55:28 +0200 Subject: [PATCH 09/15] chore: update submodule --- capi-lab/mini-lab | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capi-lab/mini-lab b/capi-lab/mini-lab index 6789fb6..7367c66 160000 --- a/capi-lab/mini-lab +++ b/capi-lab/mini-lab @@ -1 +1 @@ -Subproject commit 6789fb6bb5d2c10ccf08759bd58a05a6ee127852 +Subproject commit 7367c66ce46623415aaf09a568fb3e76c45279f9 From c64f6685af2ca5ee1181e03d9743b677f137b619 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Mon, 27 Apr 2026 12:20:36 +0200 Subject: [PATCH 10/15] fix: expect only 2 workers --- capi-lab/test/integration.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index 556f219..a5b9c06 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -40,9 +40,9 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ echo "Applying sample cluster" make -C capi-lab apply-sample-cluster - echo "Waiting for firewall, control-plane and worker to get to Phoned Home state" + echo "Waiting for firewall and control-plane to get to Phoned Home state" phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) - minPhoned=3 + minPhoned=2 declare -i attempts=0 until [ "$phoned" -ge $minPhoned ] do @@ -57,8 +57,10 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ done echo "$phoned/$minPhoned machines have phoned home" - echo "Applying mtu fix" - make -C capi-lab mtu-fix + if [ "$MINI_LAB_FLAVOR" = "capms_sonic" ]; then + echo "Applying mtu fix" + make -C capi-lab mtu-fix + fi echo "Waiting for cluster to be provisioned" declare -i attempts=0 @@ -96,8 +98,8 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ echo "Deploying metal-ccm to sample cluster" make -C capi-lab sample-cluster-deploy-metal-ccm - echo "Waiting for control-plane and worker node to become Ready" - minReady=2 + echo "Waiting for control-plane node to become Ready" + minReady=1 ready=0 declare -i attempts=0 until [ "$ready" -ge $minReady ] From f1a11a4912c146460384d5ccbfce6db1765d9530 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Tue, 28 Apr 2026 08:59:26 +0200 Subject: [PATCH 11/15] fix: calico-lab template support for cluster-api v1.11 --- config/clusterctl-templates/cluster-template-calico-lab.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/config/clusterctl-templates/cluster-template-calico-lab.yaml b/config/clusterctl-templates/cluster-template-calico-lab.yaml index 1e07261..ddda727 100644 --- a/config/clusterctl-templates/cluster-template-calico-lab.yaml +++ b/config/clusterctl-templates/cluster-template-calico-lab.yaml @@ -258,8 +258,6 @@ spec: spec: format: ignition clusterConfiguration: - apiServer: - extraArgs: {} controllerManager: extraArgs: cloud-provider: external From 743d7133d718e06b161e4520b05ebf965acd9b73 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Tue, 28 Apr 2026 12:21:18 +0200 Subject: [PATCH 12/15] fix: metal-ccm is already deployed via template --- capi-lab/mini-lab | 2 +- capi-lab/test/integration.sh | 47 ++++++++++++++++++++++++++---------- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/capi-lab/mini-lab b/capi-lab/mini-lab index 7367c66..195c7a1 160000 --- a/capi-lab/mini-lab +++ b/capi-lab/mini-lab @@ -1 +1 @@ -Subproject commit 7367c66ce46623415aaf09a568fb3e76c45279f9 +Subproject commit 195c7a1975f846b76f5c55b00540dd05886c2887 diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index a5b9c06..45c098c 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -40,6 +40,22 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ echo "Applying sample cluster" make -C capi-lab apply-sample-cluster + echo "Waiting for cluster to be provisioned" + declare -i attempts=0 + until kubectl --kubeconfig ${KUBECONFIG} get cluster ${CLUSTER_NAME} -o jsonpath='{.status.phase}' 2>/dev/null | grep -q "Provisioned" + do + if [ "$attempts" -ge 180 ]; then + echo "cluster was not provisioned - timeout reached" + kubectl --kubeconfig ${KUBECONFIG} get cluster ${CLUSTER_NAME} -o yaml || true + exit 1 + fi + echo "cluster ${CLUSTER_NAME} is not yet provisioned" + sleep 5 + attempts+=1 + done + echo "Cluster ${CLUSTER_NAME} is provisioned" + + echo "Waiting for firewall and control-plane to get to Phoned Home state" phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) minPhoned=2 @@ -62,24 +78,30 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ make -C capi-lab mtu-fix fi - echo "Waiting for cluster to be provisioned" + echo "Waiting for worker to get to Phoned Home state" + phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) + minPhoned=3 declare -i attempts=0 - until kubectl --kubeconfig ${KUBECONFIG} get cluster ${CLUSTER_NAME} -o jsonpath='{.status.phase}' 2>/dev/null | grep -q "Provisioned" + until [ "$phoned" -ge $minPhoned ] do - if [ "$attempts" -ge 180 ]; then - echo "cluster was not provisioned - timeout reached" - kubectl --kubeconfig ${KUBECONFIG} get cluster ${CLUSTER_NAME} -o yaml || true + if [ "$attempts" -ge 240 ]; then + echo "not enough machines phoned home - timeout reached" exit 1 fi - echo "cluster ${CLUSTER_NAME} is not yet provisioned" + echo "$phoned/$minPhoned machines have phoned home" sleep 5 + phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) attempts+=1 done - echo "Cluster ${CLUSTER_NAME} is provisioned" + echo "$phoned/$minPhoned machines have phoned home" echo "Generating kubeconfig for sample cluster" make -C capi-lab sample-cluster-kubeconfig + # TODO remove once we can reliably check for the nodes to be ready + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get pods -A + echo "Waiting for tenant API server to be reachable" declare -i attempts=0 until kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig version >/dev/null 2>&1 @@ -95,11 +117,8 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ done echo "Tenant API server is reachable" - echo "Deploying metal-ccm to sample cluster" - make -C capi-lab sample-cluster-deploy-metal-ccm - - echo "Waiting for control-plane node to become Ready" - minReady=1 + echo "Waiting for control-plane node and worker node to become Ready" + minReady=2 ready=0 declare -i attempts=0 until [ "$ready" -ge $minReady ] @@ -115,7 +134,6 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ attempts+=1 done echo "$ready/$minReady nodes are Ready" - kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes fi @@ -163,6 +181,9 @@ if [ "$MINI_LAB_FLAVOR" = "kamaji" ]; then echo "Checking if tenant cluster exists" if kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes | grep -e "Ready"; then + # Currently this also catches NotReady nodes, but that's good enough for now to verify + # that the node has joined. + # Only metal-ccm will be able to set the node to Ready but we do not go that far here echo "Nodes have joined the cluster and are ready" elif kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes | grep -e "No resources found"; then echo "Nodes have not joined yet" From c89c426c3e93ede2e880691cec7e46bd1faa71c4 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Tue, 28 Apr 2026 13:21:54 +0200 Subject: [PATCH 13/15] fix: ccm tolerations and timeouts for third machine --- .github/workflows/integration.yaml | 2 +- capi-lab/test/integration.sh | 6 +++--- .../clusterctl-templates/cluster-template-calico-lab.yaml | 5 +++++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index a0b2537..99d1a42 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -49,7 +49,7 @@ jobs: - name: Run integration tests shell: bash - timeout-minutes: 45 + timeout-minutes: 60 run: | eval $(make -C capi-lab --silent dev-env) ./capi-lab/test/ci-cleanup.sh diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index 45c098c..7f0ba0a 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -84,7 +84,7 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ declare -i attempts=0 until [ "$phoned" -ge $minPhoned ] do - if [ "$attempts" -ge 240 ]; then + if [ "$attempts" -ge 360 ]; then echo "not enough machines phoned home - timeout reached" exit 1 fi @@ -99,8 +99,8 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ make -C capi-lab sample-cluster-kubeconfig # TODO remove once we can reliably check for the nodes to be ready - kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes - kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get pods -A + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes || true + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get pods -A || true echo "Waiting for tenant API server to be reachable" declare -i attempts=0 diff --git a/config/clusterctl-templates/cluster-template-calico-lab.yaml b/config/clusterctl-templates/cluster-template-calico-lab.yaml index ddda727..43a3993 100644 --- a/config/clusterctl-templates/cluster-template-calico-lab.yaml +++ b/config/clusterctl-templates/cluster-template-calico-lab.yaml @@ -559,6 +559,11 @@ data: - effect: NoSchedule key: node.cloudprovider.kubernetes.io/uninitialized value: "true" + - effect: NoSchedule + key: node.cluster.x-k8s.io/uninitialized + operator: Exists + - key: node.kubernetes.io/not-ready + operator: Exists restartPolicy: Always volumes: - name: cloud-controller-manager From bc51603c39f885ea1199b7d0c6f5a37128ce92d9 Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Tue, 28 Apr 2026 16:14:20 +0200 Subject: [PATCH 14/15] debug: higher timeouts and debug output --- .github/workflows/integration.yaml | 2 +- capi-lab/test/integration.sh | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 99d1a42..9871c5f 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -49,7 +49,7 @@ jobs: - name: Run integration tests shell: bash - timeout-minutes: 60 + timeout-minutes: 150 run: | eval $(make -C capi-lab --silent dev-env) ./capi-lab/test/ci-cleanup.sh diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index 7f0ba0a..634a18f 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -84,10 +84,15 @@ if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_ declare -i attempts=0 until [ "$phoned" -ge $minPhoned ] do - if [ "$attempts" -ge 360 ]; then + if [ "$attempts" -ge 480 ]; then echo "not enough machines phoned home - timeout reached" + docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls || true exit 1 fi + if [ $((attempts % 60)) -eq 0 ] && [ "$attempts" -gt 0 ]; then + echo "machine states after $attempts attempts:" + docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls || true + fi echo "$phoned/$minPhoned machines have phoned home" sleep 5 phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) From 9a3a4d577b93f57d5fd9f21f0565b29a92f8a15d Mon Sep 17 00:00:00 2001 From: Matthias Hartmann Date: Tue, 28 Apr 2026 16:52:17 +0200 Subject: [PATCH 15/15] fix(ci): free disk space on gh ubuntu runner --- .github/workflows/integration.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 9871c5f..ed75ff2 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -23,6 +23,15 @@ jobs: - name: kamaji steps: + - name: Free disk space + # ubuntu-latest only has ~14GB free; kind + QEMU VMs + containerlab + Docker images exhaust it. + # Remove preinstalled SDKs/toolchains we don't need to recover ~10-12GB. + # apt-get clean removes cached .deb files (~few hundred MB). + run: | + sudo rm -rf /usr/local/lib/android /usr/share/dotnet /usr/share/swift /opt/ghc /usr/local/.ghcup /opt/hostedtoolcache/CodeQL + sudo apt-get clean + df -h + - name: Gain back workspace permissions # https://github.com/actions/checkout/issues/211 run: | [ -d "${GITHUB_WORKSPACE}" ] && sudo chown -R $USER:$USER ${GITHUB_WORKSPACE}