diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 07b987a..ed75ff2 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -12,14 +12,26 @@ jobs: test: name: Run tests runs-on: ubuntu-latest + # TODO should we run on self-hosted? - continue-on-error: true strategy: + fail-fast: false matrix: flavors: + - name: capms_dell_sonic + - name: capms_sonic - name: kamaji steps: + - name: Free disk space + # ubuntu-latest only has ~14GB free; kind + QEMU VMs + containerlab + Docker images exhaust it. + # Remove preinstalled SDKs/toolchains we don't need to recover ~10-12GB. + # apt-get clean removes cached .deb files (~few hundred MB). + run: | + sudo rm -rf /usr/local/lib/android /usr/share/dotnet /usr/share/swift /opt/ghc /usr/local/.ghcup /opt/hostedtoolcache/CodeQL + sudo apt-get clean + df -h + - name: Gain back workspace permissions # https://github.com/actions/checkout/issues/211 run: | [ -d "${GITHUB_WORKSPACE}" ] && sudo chown -R $USER:$USER ${GITHUB_WORKSPACE} @@ -46,6 +58,7 @@ jobs: - name: Run integration tests shell: bash + timeout-minutes: 150 run: | eval $(make -C capi-lab --silent dev-env) ./capi-lab/test/ci-cleanup.sh diff --git a/capi-lab/Makefile b/capi-lab/Makefile index 6ff0985..b5bcc52 100644 --- a/capi-lab/Makefile +++ b/capi-lab/Makefile @@ -5,7 +5,7 @@ ANSIBLE_EXTRA_VARS_FILE=$(shell pwd)/mini-lab-overrides/extra-vars.yaml KIND_EXPERIMENTAL_DOCKER_NETWORK=mini_lab_ext KUBECONFIG := $(shell pwd)/mini-lab/.kubeconfig -MINI_LAB_FLAVOR := $(or $(MINI_LAB_FLAVOR),capms) +MINI_LAB_FLAVOR := $(or $(MINI_LAB_FLAVOR),capms_sonic) CLUSTER_NAME ?= metal-test KUBERNETES_VERSION ?= 1.33.5 @@ -35,7 +35,9 @@ SUBMODULE_SHA=$(shell git -C mini-lab rev-parse --short=8 HEAD) MINI_LAB_VM_IMAGE := $(or $(MINI_LAB_VM_IMAGE),ghcr.io/metal-stack/mini-lab-vms:$(SUBMODULE_SHA)) MINI_LAB_SONIC_IMAGE := $(or $(MINI_LAB_SONIC_IMAGE),ghcr.io/metal-stack/mini-lab-sonic:$(SUBMODULE_SHA)) -ifeq ($(MINI_LAB_FLAVOR),capms) +ifeq ($(MINI_LAB_FLAVOR),capms_sonic) +DEPLOY_TARGET=deploy-kubeadm +else ifeq ($(MINI_LAB_FLAVOR),capms_dell_sonic) DEPLOY_TARGET=deploy-kubeadm else ifeq ($(MINI_LAB_FLAVOR),kamaji) DEPLOY_TARGET=deploy-kamaji @@ -101,54 +103,64 @@ control-plane-ip: apply-sample-cluster: $(eval CONTROL_PLANE_IP = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network ip list --name "$(CLUSTER_NAME)-vip" -o template --template '{{ .ipaddress }}')) echo $(CLUSTER_NAME) - clusterctl generate cluster $(CLUSTER_NAME) \ - --kubeconfig=$(KUBECONFIG) \ - --worker-machine-count 1 \ - --control-plane-machine-count 1 \ - --kubernetes-version $(KUBERNETES_VERSION) \ - --from ../config/clusterctl-templates/cluster-template-calico-lab.yaml \ - | kubectl --kubeconfig=$(KUBECONFIG) apply -f - + docker compose -f compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) \ + clusterctl generate cluster $(CLUSTER_NAME) \ + --worker-machine-count 1 \ + --control-plane-machine-count 1 \ + --kubernetes-version $(KUBERNETES_VERSION) \ + --from /templates/cluster-template-calico-lab.yaml \ + | kubectl --kubeconfig=$(KUBECONFIG) apply -f - .PHONY: delete-sample-cluster delete-sample-cluster: $(eval CONTROL_PLANE_IP = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network ip list --name "$(CLUSTER_NAME)-vip" -o template --template '{{ .ipaddress }}')) - clusterctl generate cluster $(CLUSTER_NAME) \ - --kubeconfig=$(KUBECONFIG) \ - --worker-machine-count 1 \ - --control-plane-machine-count 1 \ - --kubernetes-version $(KUBERNETES_VERSION) \ - --from ../config/clusterctl-templates/cluster-template-calico-lab.yaml \ - | kubectl --kubeconfig=$(KUBECONFIG) delete -f - + docker compose -f compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) \ + clusterctl generate cluster $(CLUSTER_NAME) \ + --worker-machine-count 1 \ + --control-plane-machine-count 1 \ + --kubernetes-version $(KUBERNETES_VERSION) \ + --from /templates/cluster-template-calico-lab.yaml \ + | kubectl --kubeconfig=$(KUBECONFIG) delete -f - .PHONY: mtu-fix mtu-fix: cd mini-lab && ssh -F files/ssh/config leaf01 'ip link set dev vtep-1001 mtu 9100 && echo done' cd mini-lab && ssh -F files/ssh/config leaf02 'ip link set dev vtep-1001 mtu 9100 && echo done' +.PHONY: sample-cluster-kubeconfig +sample-cluster-kubeconfig: + kubectl --kubeconfig=$(KUBECONFIG) get secret $(CLUSTER_NAME)-kubeconfig -o jsonpath='{.data.value}' | base64 -d > ../$(CLUSTER_NAME).kubeconfig + @echo "Sample cluster kubeconfig written to $(CLUSTER_NAME).kubeconfig" + +.PHONY: sample-cluster-deploy-metal-ccm +sample-cluster-deploy-metal-ccm: + $(eval METAL_NODE_NETWORK_ID = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network list --project $(METAL_PROJECT_ID) -o json | jq -r '.[0].id')) + kubectl kustomize ../config/target-cluster/overlays/kubeadm | envsubst | kubectl --kubeconfig=../$(CLUSTER_NAME).kubeconfig apply -f - + .PHONY: create-kamaji-tenant create-kamaji-tenant: $(eval CONTROL_PLANE_IP = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network ip list --name "$(CLUSTER_NAME)-vip" -o template --template '{{ .ipaddress }}')) - $(eval METAL_NODE_NETWORK_ID = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network list --project $(METAL_PROJECT_ID) -o template --template '{{ (index . 0).id }}')) + $(eval METAL_NODE_NETWORK_ID = $(shell docker compose -f mini-lab/compose.yaml run $(DOCKER_COMPOSE_RUN_ARG) metalctl network list --project $(METAL_PROJECT_ID) -o json | jq -r '.[0].id')) kubectl --kubeconfig=$(KUBECONFIG) create namespace $(TENANT_NAMESPACE) --dry-run=client -o yaml | kubectl --kubeconfig=$(KUBECONFIG) apply -f - # let MetalLB assign the IP to the tenant cluster control plane service envsubst < kamaji/metallb-tenant-pool.yaml | kubectl --kubeconfig=$(KUBECONFIG) apply -f - docker compose -f compose.yaml -f compose.kamaji.yaml run $(DOCKER_COMPOSE_RUN_ARG) \ - clusterctl generate cluster $(CLUSTER_NAME) \ - --target-namespace $(TENANT_NAMESPACE) \ - --worker-machine-count 1 \ - --kubernetes-version $(KUBERNETES_VERSION) \ - --from /templates/cluster-template-kamaji-tenant.yaml \ - | kubectl --kubeconfig=$(KUBECONFIG) apply -f - + clusterctl generate cluster $(CLUSTER_NAME) \ + --target-namespace $(TENANT_NAMESPACE) \ + --worker-machine-count 1 \ + --kubernetes-version $(KUBERNETES_VERSION) \ + --from /templates/cluster-template-kamaji-tenant.yaml \ + | kubectl --kubeconfig=$(KUBECONFIG) apply -f - .PHONY: delete-kamaji-tenant delete-kamaji-tenant: docker compose -f compose.yaml -f compose.kamaji.yaml run $(DOCKER_COMPOSE_RUN_ARG) \ - clusterctl generate cluster $(CLUSTER_NAME) \ - --target-namespace $(TENANT_NAMESPACE) \ - --worker-machine-count 1 \ - --kubernetes-version $(KUBERNETES_VERSION) \ - --from /templates/cluster-template-kamaji-tenant.yaml \ - | kubectl --kubeconfig=$(KUBECONFIG) delete -f - + clusterctl generate cluster $(CLUSTER_NAME) \ + --target-namespace $(TENANT_NAMESPACE) \ + --worker-machine-count 1 \ + --kubernetes-version $(KUBERNETES_VERSION) \ + --from /templates/cluster-template-kamaji-tenant.yaml \ + | kubectl --kubeconfig=$(KUBECONFIG) delete -f - .PHONY: kamaji-tenant-kubeconfig kamaji-tenant-kubeconfig: @@ -162,4 +174,4 @@ kamaji-tenant-deploy-calico: .PHONY: kamaji-tenant-deploy-metal-ccm kamaji-tenant-deploy-metal-ccm: - kustomize build ../config/target-cluster/overlays/kamaji | envsubst | kubectl --kubeconfig=../$(CLUSTER_NAME).kubeconfig apply -f - + kubectl kustomize ../config/target-cluster/overlays/kamaji | envsubst | kubectl --kubeconfig=../$(CLUSTER_NAME).kubeconfig apply -f - diff --git a/capi-lab/compose.kamaji.yaml b/capi-lab/compose.kamaji.yaml index a66ec34..a1aa589 100644 --- a/capi-lab/compose.kamaji.yaml +++ b/capi-lab/compose.kamaji.yaml @@ -3,25 +3,7 @@ services: clusterctl: image: registry.k8s.io/cluster-api/clusterctl:v1.12.3 environment: - - EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION=true - - METAL_API_HMAC=${METAL_API_HMAC} - - METAL_API_URL=${METAL_API_URL} - - METAL_API_HMAC_AUTH_TYPE=${METAL_API_HMAC_AUTH_TYPE} - - CLUSTER_NAME=${CLUSTER_NAME} - TENANT_NAMESPACE=${TENANT_NAMESPACE} - - KUBERNETES_VERSION=${KUBERNETES_VERSION} - - CONTROL_PLANE_IP=${CONTROL_PLANE_IP} - PODS_CIDR=${PODS_CIDR} - SERVICES_CIDR=${SERVICES_CIDR} - - METAL_PARTITION=${METAL_PARTITION} - - METAL_PROJECT_ID=${METAL_PROJECT_ID} - - CONTROL_PLANE_MACHINE_IMAGE=${CONTROL_PLANE_MACHINE_IMAGE} - - CONTROL_PLANE_MACHINE_SIZE=${CONTROL_PLANE_MACHINE_SIZE} - - WORKER_MACHINE_IMAGE=${WORKER_MACHINE_IMAGE} - - WORKER_MACHINE_SIZE=${WORKER_MACHINE_SIZE} - - FIREWALL_MACHINE_IMAGE=${FIREWALL_MACHINE_IMAGE} - - FIREWALL_MACHINE_SIZE=${FIREWALL_MACHINE_SIZE} - METAL_NODE_NETWORK_ID=${METAL_NODE_NETWORK_ID} - - FIREWALL_EXTERNAL_NETWORKS=${FIREWALL_EXTERNAL_NETWORKS} - volumes: - - ../config/clusterctl-templates:/templates:ro diff --git a/capi-lab/compose.yaml b/capi-lab/compose.yaml index f680835..104fc52 100644 --- a/capi-lab/compose.yaml +++ b/capi-lab/compose.yaml @@ -5,8 +5,24 @@ services: environment: - EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION=true - KUBECONFIG=/kubeconfig - image: registry.k8s.io/cluster-api/clusterctl:v1.9.5 + - METAL_API_HMAC=${METAL_API_HMAC} + - METAL_API_URL=${METAL_API_URL} + - METAL_API_HMAC_AUTH_TYPE=${METAL_API_HMAC_AUTH_TYPE} + - CLUSTER_NAME=${CLUSTER_NAME} + - KUBERNETES_VERSION=${KUBERNETES_VERSION} + - CONTROL_PLANE_IP=${CONTROL_PLANE_IP} + - METAL_PARTITION=${METAL_PARTITION} + - METAL_PROJECT_ID=${METAL_PROJECT_ID} + - CONTROL_PLANE_MACHINE_IMAGE=${CONTROL_PLANE_MACHINE_IMAGE} + - CONTROL_PLANE_MACHINE_SIZE=${CONTROL_PLANE_MACHINE_SIZE} + - WORKER_MACHINE_IMAGE=${WORKER_MACHINE_IMAGE} + - WORKER_MACHINE_SIZE=${WORKER_MACHINE_SIZE} + - FIREWALL_MACHINE_IMAGE=${FIREWALL_MACHINE_IMAGE} + - FIREWALL_MACHINE_SIZE=${FIREWALL_MACHINE_SIZE} + - FIREWALL_EXTERNAL_NETWORKS=${FIREWALL_EXTERNAL_NETWORKS} + image: registry.k8s.io/cluster-api/clusterctl:v1.11.4 network_mode: host user: root volumes: - ${KUBECONFIG}:/kubeconfig:ro + - ../config/clusterctl-templates:/templates:ro diff --git a/capi-lab/mini-lab b/capi-lab/mini-lab index f7d4a9b..195c7a1 160000 --- a/capi-lab/mini-lab +++ b/capi-lab/mini-lab @@ -1 +1 @@ -Subproject commit f7d4a9b4aec09c454fe638d49b9bc8493d6385b5 +Subproject commit 195c7a1975f846b76f5c55b00540dd05886c2887 diff --git a/capi-lab/test/integration.sh b/capi-lab/test/integration.sh index f6938d4..634a18f 100755 --- a/capi-lab/test/integration.sh +++ b/capi-lab/test/integration.sh @@ -11,22 +11,141 @@ minWaiting=2 declare -i attempts=0 until [ "$waiting" -ge $minWaiting ] do - if [ "$attempts" -ge 60 ]; then + if [ "$attempts" -ge 180 ]; then echo "not enough machines in waiting state - timeout reached" exit 1 fi echo "$waiting/$minWaiting machines are waiting" sleep 5 waiting=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Waiting | wc -l) - attempts=$attempts+1 + attempts+=1 done echo "$waiting/$minWaiting machines are waiting" make push-to-capi-lab +if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ] || [ "$MINI_LAB_FLAVOR" = "capms_sonic" ]; then + + if [ "$MINI_LAB_FLAVOR" = "capms_dell_sonic" ]; then + echo "Starting capms dell sonic flavor tests" + else + echo "Starting capms sonic flavor tests" + fi + + export CLUSTER_NAME=metal-test + + echo "Creating control plane IP" + make -C capi-lab control-plane-ip + + echo "Applying sample cluster" + make -C capi-lab apply-sample-cluster + + echo "Waiting for cluster to be provisioned" + declare -i attempts=0 + until kubectl --kubeconfig ${KUBECONFIG} get cluster ${CLUSTER_NAME} -o jsonpath='{.status.phase}' 2>/dev/null | grep -q "Provisioned" + do + if [ "$attempts" -ge 180 ]; then + echo "cluster was not provisioned - timeout reached" + kubectl --kubeconfig ${KUBECONFIG} get cluster ${CLUSTER_NAME} -o yaml || true + exit 1 + fi + echo "cluster ${CLUSTER_NAME} is not yet provisioned" + sleep 5 + attempts+=1 + done + echo "Cluster ${CLUSTER_NAME} is provisioned" + + + echo "Waiting for firewall and control-plane to get to Phoned Home state" + phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) + minPhoned=2 + declare -i attempts=0 + until [ "$phoned" -ge $minPhoned ] + do + if [ "$attempts" -ge 240 ]; then + echo "not enough machines phoned home - timeout reached" + exit 1 + fi + echo "$phoned/$minPhoned machines have phoned home" + sleep 5 + phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) + attempts+=1 + done + echo "$phoned/$minPhoned machines have phoned home" + + if [ "$MINI_LAB_FLAVOR" = "capms_sonic" ]; then + echo "Applying mtu fix" + make -C capi-lab mtu-fix + fi + + echo "Waiting for worker to get to Phoned Home state" + phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) + minPhoned=3 + declare -i attempts=0 + until [ "$phoned" -ge $minPhoned ] + do + if [ "$attempts" -ge 480 ]; then + echo "not enough machines phoned home - timeout reached" + docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls || true + exit 1 + fi + if [ $((attempts % 60)) -eq 0 ] && [ "$attempts" -gt 0 ]; then + echo "machine states after $attempts attempts:" + docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls || true + fi + echo "$phoned/$minPhoned machines have phoned home" + sleep 5 + phoned=$(docker compose -f capi-lab/mini-lab/compose.yaml run --no-TTY --rm metalctl machine ls | grep Phoned | wc -l) + attempts+=1 + done + echo "$phoned/$minPhoned machines have phoned home" + + echo "Generating kubeconfig for sample cluster" + make -C capi-lab sample-cluster-kubeconfig + + # TODO remove once we can reliably check for the nodes to be ready + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes || true + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get pods -A || true + + echo "Waiting for tenant API server to be reachable" + declare -i attempts=0 + until kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig version >/dev/null 2>&1 + do + if [ "$attempts" -ge 180 ]; then + echo "tenant API server not reachable - timeout reached" + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig version || true + exit 1 + fi + echo "tenant API server not reachable yet" + sleep 5 + attempts+=1 + done + echo "Tenant API server is reachable" + + echo "Waiting for control-plane node and worker node to become Ready" + minReady=2 + ready=0 + declare -i attempts=0 + until [ "$ready" -ge $minReady ] + do + if [ "$attempts" -ge 180 ]; then + echo "not enough nodes became Ready - timeout reached" + kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes || true + exit 1 + fi + echo "$ready/$minReady nodes are Ready" + sleep 5 + ready=$(kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes --no-headers 2>/dev/null | awk '{ print $2 }' | grep -c "^Ready$" || true) + attempts+=1 + done + echo "$ready/$minReady nodes are Ready" + +fi + + if [ "$MINI_LAB_FLAVOR" = "kamaji" ]; then - echo "Starting kamaji tests" + echo "Starting kamaji flavor tests" echo "Creating control plane IP" export CLUSTER_NAME=kamaji-tenant-test @@ -42,7 +161,7 @@ if [ "$MINI_LAB_FLAVOR" = "kamaji" ]; then declare -i attempts=0 until [ "$phoned" -ge $minPhoned ] do - if [ "$attempts" -ge 120 ]; then + if [ "$attempts" -ge 180 ]; then echo "not enough machines phoned home - timeout reached" exit 1 fi @@ -67,6 +186,9 @@ if [ "$MINI_LAB_FLAVOR" = "kamaji" ]; then echo "Checking if tenant cluster exists" if kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes | grep -e "Ready"; then + # Currently this also catches NotReady nodes, but that's good enough for now to verify + # that the node has joined. + # Only metal-ccm will be able to set the node to Ready but we do not go that far here echo "Nodes have joined the cluster and are ready" elif kubectl --kubeconfig ${CLUSTER_NAME}.kubeconfig get nodes | grep -e "No resources found"; then echo "Nodes have not joined yet" diff --git a/config/clusterctl-templates/cluster-template-calico-lab.yaml b/config/clusterctl-templates/cluster-template-calico-lab.yaml index 1e07261..43a3993 100644 --- a/config/clusterctl-templates/cluster-template-calico-lab.yaml +++ b/config/clusterctl-templates/cluster-template-calico-lab.yaml @@ -258,8 +258,6 @@ spec: spec: format: ignition clusterConfiguration: - apiServer: - extraArgs: {} controllerManager: extraArgs: cloud-provider: external @@ -561,6 +559,11 @@ data: - effect: NoSchedule key: node.cloudprovider.kubernetes.io/uninitialized value: "true" + - effect: NoSchedule + key: node.cluster.x-k8s.io/uninitialized + operator: Exists + - key: node.kubernetes.io/not-ready + operator: Exists restartPolicy: Always volumes: - name: cloud-controller-manager