From 94cf986573ceb77195eeb02e2e89eb804d185af4 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Fri, 13 Mar 2026 10:49:54 +0100 Subject: [PATCH 1/2] fix: verify API server against local IP before switching to NLB HA cluster kubeadm init succeeded but kubectl version failed because admin.conf pointed at the NLB before its health checks passed. Verify the API server using the local private IP first, then switch to the NLB endpoint. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provisioner/templates/kubeadm_cluster.go | 39 ++++++++++------ pkg/provisioner/templates/kubernetes_test.go | 48 ++++++++++++++++++++ 2 files changed, 72 insertions(+), 15 deletions(-) diff --git a/pkg/provisioner/templates/kubeadm_cluster.go b/pkg/provisioner/templates/kubeadm_cluster.go index be0ed4ed..3669e4ec 100644 --- a/pkg/provisioner/templates/kubeadm_cluster.go +++ b/pkg/provisioner/templates/kubeadm_cluster.go @@ -186,22 +186,9 @@ if [[ ! -f /etc/kubernetes/admin.conf ]]; then holodeck_log "INFO" "$COMPONENT" "Running kubeadm init with args: ${INIT_ARGS[*]}" holodeck_retry 3 "$COMPONENT" sudo kubeadm init "${INIT_ARGS[@]}" - # For HA with NLB: after init succeeds, update the cluster config to use NLB DNS - # so that join tokens reference the NLB endpoint (reachable by other nodes). 
- if [[ "$IS_HA" == "true" ]] && [[ "$INIT_ENDPOINT" != "$CONTROL_PLANE_ENDPOINT" ]]; then - holodeck_log "INFO" "$COMPONENT" "Updating cluster config to use NLB endpoint: ${CONTROL_PLANE_ENDPOINT}:6443" - # Update the kubeadm-config ConfigMap - sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf -n kube-system get configmap kubeadm-config -o yaml | \ - sed "s|controlPlaneEndpoint: ${INIT_ENDPOINT}:6443|controlPlaneEndpoint: ${CONTROL_PLANE_ENDPOINT}:6443|g" | \ - sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf apply -f - || \ - holodeck_log "WARN" "$COMPONENT" "Could not update kubeadm-config, join may need manual endpoint" - # Also update admin.conf kubeconfig to use the NLB - sudo sed -i "s|server: https://${INIT_ENDPOINT}:6443|server: https://${CONTROL_PLANE_ENDPOINT}:6443|g" \ - /etc/kubernetes/admin.conf - fi fi -# Setup kubeconfig +# Setup kubeconfig (still points at local IP for HA; NLB switch happens after verification) mkdir -p "$HOME/.kube" sudo cp -f /etc/kubernetes/admin.conf "$HOME/.kube/config" sudo chown "$(id -u):$(id -g)" "$HOME/.kube/config" @@ -209,7 +196,29 @@ export KUBECONFIG="${HOME}/.kube/config" holodeck_progress "$COMPONENT" 6 8 "Waiting for API server" -holodeck_retry 10 "$COMPONENT" kubectl --kubeconfig "$KUBECONFIG" version +# Verify API server against local private IP first. For HA clusters, admin.conf +# still points at the local IP at this stage. For non-HA clusters the --server +# override is a no-op since KUBECONFIG already targets the right endpoint. +holodeck_retry 10 "$COMPONENT" kubectl --kubeconfig "$KUBECONFIG" \ + --server="https://${NODE_PRIVATE_IP}:6443" version + +# For HA with NLB: now that the API server is verified locally, switch the cluster +# config to use the NLB DNS so that join tokens reference the NLB endpoint +# (reachable by other nodes). 
+if [[ "$IS_HA" == "true" ]] && [[ "$INIT_ENDPOINT" != "$CONTROL_PLANE_ENDPOINT" ]]; then + holodeck_log "INFO" "$COMPONENT" "Updating cluster config to use NLB endpoint: ${CONTROL_PLANE_ENDPOINT}:6443" + # Update the kubeadm-config ConfigMap + sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf -n kube-system get configmap kubeadm-config -o yaml | \ + sed "s|controlPlaneEndpoint: ${INIT_ENDPOINT}:6443|controlPlaneEndpoint: ${CONTROL_PLANE_ENDPOINT}:6443|g" | \ + sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf apply -f - || \ + holodeck_log "WARN" "$COMPONENT" "Could not update kubeadm-config, join may need manual endpoint" + # Update admin.conf kubeconfig to use the NLB + sudo sed -i "s|server: https://${INIT_ENDPOINT}:6443|server: https://${CONTROL_PLANE_ENDPOINT}:6443|g" \ + /etc/kubernetes/admin.conf + # Re-copy the updated admin.conf to user kubeconfig + sudo cp -f /etc/kubernetes/admin.conf "$HOME/.kube/config" + sudo chown "$(id -u):$(id -g)" "$HOME/.kube/config" +fi holodeck_progress "$COMPONENT" 7 8 "Installing Calico CNI" diff --git a/pkg/provisioner/templates/kubernetes_test.go b/pkg/provisioner/templates/kubernetes_test.go index bea0ba7b..a66ea2ec 100644 --- a/pkg/provisioner/templates/kubernetes_test.go +++ b/pkg/provisioner/templates/kubernetes_test.go @@ -2,11 +2,13 @@ package templates import ( "bytes" + "strings" "testing" "github.com/NVIDIA/holodeck/api/holodeck/v1alpha1" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestNewKubernetes(t *testing.T) { @@ -1195,3 +1197,49 @@ func TestKubernetes_DefaultConstants(t *testing.T) { assert.NotEmpty(t, defaultCRIVersion, "defaultCRIVersion must be set") assert.NotEmpty(t, defaultCalicoVersion, "defaultCalicoVersion must be set") } + +func TestKubeadmInit_HA_VerifiesLocalIPBeforeNLBSwitch(t *testing.T) { + // HA clusters use an NLB endpoint. 
After kubeadm init, the API server must + // be verified against the local private IP BEFORE switching admin.conf to + // the NLB endpoint. Otherwise kubectl version hits the NLB which hasn't + // passed health checks yet, exhausting all retry attempts. + env := v1alpha1.Environment{ + Spec: v1alpha1.EnvironmentSpec{ + Kubernetes: v1alpha1.Kubernetes{ + KubernetesVersion: "v1.33.0", + }, + ContainerRuntime: v1alpha1.ContainerRuntime{ + Name: "containerd", + }, + }, + } + + cfg := &KubeadmInitConfig{ + Environment: &env, + ControlPlaneEndpoint: "my-nlb-1234567890.us-west-2.elb.amazonaws.com", + IsHA: true, + } + + var buf bytes.Buffer + err := cfg.Execute(&buf) + require.NoError(t, err) + + out := buf.String() + + // The template must contain a local IP verification step + localVerifyMarker := `--server="https://${NODE_PRIVATE_IP}:6443" version` + nlbSwitchMarker := "Updating cluster config to use NLB endpoint" + + assert.Contains(t, out, localVerifyMarker, + "HA init template must verify API server against local private IP") + assert.Contains(t, out, nlbSwitchMarker, + "HA init template must update cluster config to use NLB endpoint") + + // Critical ordering: local verification MUST happen BEFORE the NLB switch + localVerifyPos := strings.Index(out, localVerifyMarker) + nlbSwitchPos := strings.Index(out, nlbSwitchMarker) + + assert.Greater(t, nlbSwitchPos, localVerifyPos, + "Local IP verification (pos %d) must happen BEFORE NLB endpoint switch (pos %d)", + localVerifyPos, nlbSwitchPos) +} From 96c81312a8dc8ce79462b455c96b493930b94649 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Fri, 13 Mar 2026 12:15:31 +0100 Subject: [PATCH 2/2] fix: move NODE_PRIVATE_IP and NLB switch outside init guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address 4 issues from Distinguished Engineer review: 1. BLOCKING: NODE_PRIVATE_IP and INIT_ENDPOINT were scoped inside the `if [[ ! 
-f admin.conf ]]` block but used outside it for API verification and NLB switch — would crash on re-run. Moved both assignments before the init guard. 2. Escape dots in INIT_ENDPOINT for sed regex safety — IP addresses contain literal dots that are regex metacharacters. 3. Move NLB switch block to after Calico installation per design: NLB health checks require a working CNI to pass. Previous position was before Calico, violating the design contract. 4. Add test assertion verifying Calico installation happens before NLB endpoint switch. Signed-off-by: Eduardo Arango Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provisioner/templates/kubeadm_cluster.go | 68 +++++++++++--------- pkg/provisioner/templates/kubernetes_test.go | 10 +++ 2 files changed, 46 insertions(+), 32 deletions(-) diff --git a/pkg/provisioner/templates/kubeadm_cluster.go b/pkg/provisioner/templates/kubeadm_cluster.go index 3669e4ec..582469af 100644 --- a/pkg/provisioner/templates/kubeadm_cluster.go +++ b/pkg/provisioner/templates/kubeadm_cluster.go @@ -136,6 +136,21 @@ sudo systemctl enable --now kubelet holodeck_progress "$COMPONENT" 5 8 "Initializing Kubernetes cluster" +# Detect this node's private IP for API server binding. +# Must be outside the init guard — used by verification and NLB switch below. +NODE_PRIVATE_IP=$(hostname -I | awk '{print $1}') + +# Always use local IP for init health checks: kubeadm v1.33+ validates the API +# server via control-plane-endpoint, which may not be routable from within the +# instance during init (public IPs, NLB DNS, etc.). Use private IP for init and +# include the original endpoint in cert SANs so external access works. +if [[ "$CONTROL_PLANE_ENDPOINT" != "$NODE_PRIVATE_IP" ]]; then + INIT_ENDPOINT="${NODE_PRIVATE_IP}" + holodeck_log "INFO" "$COMPONENT" "Using local IP ${NODE_PRIVATE_IP} for init (endpoint: ${CONTROL_PLANE_ENDPOINT} in cert SANs)" +else + INIT_ENDPOINT="${CONTROL_PLANE_ENDPOINT}" +fi + # Initialize cluster if [[ ! 
-f /etc/kubernetes/admin.conf ]]; then # Wait for control-plane endpoint to be resolvable (NLB DNS may take time) @@ -155,20 +170,6 @@ if [[ ! -f /etc/kubernetes/admin.conf ]]; then done fi - # Detect this node's private IP for API server binding - NODE_PRIVATE_IP=$(hostname -I | awk '{print $1}') - - # Always use local IP for init health checks: kubeadm v1.33+ validates the API - # server via control-plane-endpoint, which may not be routable from within the - # instance during init (public IPs, NLB DNS, etc.). Use private IP for init and - # include the original endpoint in cert SANs so external access works. - if [[ "$CONTROL_PLANE_ENDPOINT" != "$NODE_PRIVATE_IP" ]]; then - INIT_ENDPOINT="${NODE_PRIVATE_IP}" - holodeck_log "INFO" "$COMPONENT" "Using local IP ${NODE_PRIVATE_IP} for init (endpoint: ${CONTROL_PLANE_ENDPOINT} in cert SANs)" - else - INIT_ENDPOINT="${CONTROL_PLANE_ENDPOINT}" - fi - INIT_ARGS=( --kubernetes-version="${K8S_VERSION}" --pod-network-cidr=192.168.0.0/16 @@ -202,24 +203,6 @@ holodeck_progress "$COMPONENT" 6 8 "Waiting for API server" holodeck_retry 10 "$COMPONENT" kubectl --kubeconfig "$KUBECONFIG" \ --server="https://${NODE_PRIVATE_IP}:6443" version -# For HA with NLB: now that the API server is verified locally, switch the cluster -# config to use the NLB DNS so that join tokens reference the NLB endpoint -# (reachable by other nodes). 
-if [[ "$IS_HA" == "true" ]] && [[ "$INIT_ENDPOINT" != "$CONTROL_PLANE_ENDPOINT" ]]; then - holodeck_log "INFO" "$COMPONENT" "Updating cluster config to use NLB endpoint: ${CONTROL_PLANE_ENDPOINT}:6443" - # Update the kubeadm-config ConfigMap - sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf -n kube-system get configmap kubeadm-config -o yaml | \ - sed "s|controlPlaneEndpoint: ${INIT_ENDPOINT}:6443|controlPlaneEndpoint: ${CONTROL_PLANE_ENDPOINT}:6443|g" | \ - sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf apply -f - || \ - holodeck_log "WARN" "$COMPONENT" "Could not update kubeadm-config, join may need manual endpoint" - # Update admin.conf kubeconfig to use the NLB - sudo sed -i "s|server: https://${INIT_ENDPOINT}:6443|server: https://${CONTROL_PLANE_ENDPOINT}:6443|g" \ - /etc/kubernetes/admin.conf - # Re-copy the updated admin.conf to user kubeconfig - sudo cp -f /etc/kubernetes/admin.conf "$HOME/.kube/config" - sudo chown "$(id -u):$(id -g)" "$HOME/.kube/config" -fi - holodeck_progress "$COMPONENT" 7 8 "Installing Calico CNI" # Install Calico (idempotent) @@ -300,6 +283,27 @@ holodeck_retry 10 "$COMPONENT" kubectl --kubeconfig "$KUBECONFIG" wait \ holodeck_progress "$COMPONENT" 8 8 "Finalizing cluster configuration" +# For HA with NLB: now that Calico is running and the cluster is fully functional, +# switch the cluster config to use the NLB DNS so that join tokens reference the +# NLB endpoint (reachable by other nodes). This MUST happen after Calico — the NLB +# health checks require a working CNI to pass. 
+if [[ "$IS_HA" == "true" ]] && [[ "$INIT_ENDPOINT" != "$CONTROL_PLANE_ENDPOINT" ]]; then + # Escape dots in INIT_ENDPOINT for safe sed regex matching (IPs contain literal dots) + INIT_ESCAPED=$(echo "$INIT_ENDPOINT" | sed 's/\./\\./g') + holodeck_log "INFO" "$COMPONENT" "Updating cluster config to use NLB endpoint: ${CONTROL_PLANE_ENDPOINT}:6443" + # Update the kubeadm-config ConfigMap + sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf -n kube-system get configmap kubeadm-config -o yaml | \ + sed "s|controlPlaneEndpoint: ${INIT_ESCAPED}:6443|controlPlaneEndpoint: ${CONTROL_PLANE_ENDPOINT}:6443|g" | \ + sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf apply -f - || \ + holodeck_log "WARN" "$COMPONENT" "Could not update kubeadm-config, join may need manual endpoint" + # Update admin.conf kubeconfig to use the NLB + sudo sed -i "s|server: https://${INIT_ESCAPED}:6443|server: https://${CONTROL_PLANE_ENDPOINT}:6443|g" \ + /etc/kubernetes/admin.conf + # Re-copy the updated admin.conf to user kubeconfig + sudo cp -f /etc/kubernetes/admin.conf "$HOME/.kube/config" + sudo chown "$(id -u):$(id -g)" "$HOME/.kube/config" +fi + # Label this node as control-plane (keep the taint for multinode) kubectl label node --all nvidia.com/holodeck.managed=true --overwrite 2>/dev/null || true diff --git a/pkg/provisioner/templates/kubernetes_test.go b/pkg/provisioner/templates/kubernetes_test.go index a66ea2ec..93a8c221 100644 --- a/pkg/provisioner/templates/kubernetes_test.go +++ b/pkg/provisioner/templates/kubernetes_test.go @@ -1238,8 +1238,18 @@ func TestKubeadmInit_HA_VerifiesLocalIPBeforeNLBSwitch(t *testing.T) { // Critical ordering: local verification MUST happen BEFORE the NLB switch localVerifyPos := strings.Index(out, localVerifyMarker) nlbSwitchPos := strings.Index(out, nlbSwitchMarker) + calicoInstallMarker := "Installing Calico" + calicoInstallPos := strings.Index(out, calicoInstallMarker) assert.Greater(t, nlbSwitchPos, localVerifyPos, "Local IP verification (pos 
%d) must happen BEFORE NLB endpoint switch (pos %d)", localVerifyPos, nlbSwitchPos) + + // Calico must be installed BEFORE the NLB switch — NLB health checks + // require a working CNI to pass. + assert.Greater(t, calicoInstallPos, 0, + "Template must contain Calico installation") + assert.Greater(t, nlbSwitchPos, calicoInstallPos, + "Calico installation (pos %d) must happen BEFORE NLB endpoint switch (pos %d)", + calicoInstallPos, nlbSwitchPos) }