diff --git a/pkg/provisioner/templates/kubeadm_cluster.go b/pkg/provisioner/templates/kubeadm_cluster.go index be0ed4ed..582469af 100644 --- a/pkg/provisioner/templates/kubeadm_cluster.go +++ b/pkg/provisioner/templates/kubeadm_cluster.go @@ -136,6 +136,21 @@ sudo systemctl enable --now kubelet holodeck_progress "$COMPONENT" 5 8 "Initializing Kubernetes cluster" +# Detect this node's private IP for API server binding. +# Must be outside the init guard — used by verification and NLB switch below. +NODE_PRIVATE_IP=$(hostname -I | awk '{print $1}') + +# Always use local IP for init health checks: kubeadm v1.33+ validates the API +# server via control-plane-endpoint, which may not be routable from within the +# instance during init (public IPs, NLB DNS, etc.). Use private IP for init and +# include the original endpoint in cert SANs so external access works. +if [[ "$CONTROL_PLANE_ENDPOINT" != "$NODE_PRIVATE_IP" ]]; then + INIT_ENDPOINT="${NODE_PRIVATE_IP}" + holodeck_log "INFO" "$COMPONENT" "Using local IP ${NODE_PRIVATE_IP} for init (endpoint: ${CONTROL_PLANE_ENDPOINT} in cert SANs)" +else + INIT_ENDPOINT="${CONTROL_PLANE_ENDPOINT}" +fi + # Initialize cluster if [[ ! -f /etc/kubernetes/admin.conf ]]; then # Wait for control-plane endpoint to be resolvable (NLB DNS may take time) @@ -155,20 +170,6 @@ if [[ ! -f /etc/kubernetes/admin.conf ]]; then done fi - # Detect this node's private IP for API server binding - NODE_PRIVATE_IP=$(hostname -I | awk '{print $1}') - - # Always use local IP for init health checks: kubeadm v1.33+ validates the API - # server via control-plane-endpoint, which may not be routable from within the - # instance during init (public IPs, NLB DNS, etc.). Use private IP for init and - # include the original endpoint in cert SANs so external access works. 
- if [[ "$CONTROL_PLANE_ENDPOINT" != "$NODE_PRIVATE_IP" ]]; then - INIT_ENDPOINT="${NODE_PRIVATE_IP}" - holodeck_log "INFO" "$COMPONENT" "Using local IP ${NODE_PRIVATE_IP} for init (endpoint: ${CONTROL_PLANE_ENDPOINT} in cert SANs)" - else - INIT_ENDPOINT="${CONTROL_PLANE_ENDPOINT}" - fi - INIT_ARGS=( --kubernetes-version="${K8S_VERSION}" --pod-network-cidr=192.168.0.0/16 @@ -186,22 +187,9 @@ if [[ ! -f /etc/kubernetes/admin.conf ]]; then holodeck_log "INFO" "$COMPONENT" "Running kubeadm init with args: ${INIT_ARGS[*]}" holodeck_retry 3 "$COMPONENT" sudo kubeadm init "${INIT_ARGS[@]}" - # For HA with NLB: after init succeeds, update the cluster config to use NLB DNS - # so that join tokens reference the NLB endpoint (reachable by other nodes). - if [[ "$IS_HA" == "true" ]] && [[ "$INIT_ENDPOINT" != "$CONTROL_PLANE_ENDPOINT" ]]; then - holodeck_log "INFO" "$COMPONENT" "Updating cluster config to use NLB endpoint: ${CONTROL_PLANE_ENDPOINT}:6443" - # Update the kubeadm-config ConfigMap - sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf -n kube-system get configmap kubeadm-config -o yaml | \ - sed "s|controlPlaneEndpoint: ${INIT_ENDPOINT}:6443|controlPlaneEndpoint: ${CONTROL_PLANE_ENDPOINT}:6443|g" | \ - sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf apply -f - || \ - holodeck_log "WARN" "$COMPONENT" "Could not update kubeadm-config, join may need manual endpoint" - # Also update admin.conf kubeconfig to use the NLB - sudo sed -i "s|server: https://${INIT_ENDPOINT}:6443|server: https://${CONTROL_PLANE_ENDPOINT}:6443|g" \ - /etc/kubernetes/admin.conf - fi fi -# Setup kubeconfig +# Setup kubeconfig (still points at local IP for HA; NLB switch happens after verification) mkdir -p "$HOME/.kube" sudo cp -f /etc/kubernetes/admin.conf "$HOME/.kube/config" sudo chown "$(id -u):$(id -g)" "$HOME/.kube/config" @@ -209,7 +197,11 @@ export KUBECONFIG="${HOME}/.kube/config" holodeck_progress "$COMPONENT" 6 8 "Waiting for API server" -holodeck_retry 10 "$COMPONENT" kubectl 
--kubeconfig "$KUBECONFIG" version +# Verify API server against local private IP first. For HA clusters, admin.conf +# still points at the local IP at this stage. For non-HA clusters this is a no-op +# since KUBECONFIG already targets the right endpoint. +holodeck_retry 10 "$COMPONENT" kubectl --kubeconfig "$KUBECONFIG" \ + --server="https://${NODE_PRIVATE_IP}:6443" version holodeck_progress "$COMPONENT" 7 8 "Installing Calico CNI" @@ -291,6 +283,27 @@ holodeck_retry 10 "$COMPONENT" kubectl --kubeconfig "$KUBECONFIG" wait \ holodeck_progress "$COMPONENT" 8 8 "Finalizing cluster configuration" +# For HA with NLB: now that Calico is running and the cluster is fully functional, +# switch the cluster config to use the NLB DNS so that join tokens reference the +# NLB endpoint (reachable by other nodes). This MUST happen after Calico — the NLB +# health checks require a working CNI to pass. +if [[ "$IS_HA" == "true" ]] && [[ "$INIT_ENDPOINT" != "$CONTROL_PLANE_ENDPOINT" ]]; then + # Escape dots in INIT_ENDPOINT for safe sed regex matching (IPs contain literal dots) + INIT_ESCAPED=$(echo "$INIT_ENDPOINT" | sed 's/\./\\./g') + holodeck_log "INFO" "$COMPONENT" "Updating cluster config to use NLB endpoint: ${CONTROL_PLANE_ENDPOINT}:6443" + # Update the kubeadm-config ConfigMap + sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf -n kube-system get configmap kubeadm-config -o yaml | \ + sed "s|controlPlaneEndpoint: ${INIT_ESCAPED}:6443|controlPlaneEndpoint: ${CONTROL_PLANE_ENDPOINT}:6443|g" | \ + sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf apply -f - || \ + holodeck_log "WARN" "$COMPONENT" "Could not update kubeadm-config, join may need manual endpoint" + # Update admin.conf kubeconfig to use the NLB + sudo sed -i "s|server: https://${INIT_ESCAPED}:6443|server: https://${CONTROL_PLANE_ENDPOINT}:6443|g" \ + /etc/kubernetes/admin.conf + # Re-copy the updated admin.conf to user kubeconfig + sudo cp -f /etc/kubernetes/admin.conf "$HOME/.kube/config" + sudo chown "$(id 
-u):$(id -g)" "$HOME/.kube/config" +fi + # Label this node as control-plane (keep the taint for multinode) kubectl label node --all nvidia.com/holodeck.managed=true --overwrite 2>/dev/null || true diff --git a/pkg/provisioner/templates/kubernetes_test.go b/pkg/provisioner/templates/kubernetes_test.go index bea0ba7b..93a8c221 100644 --- a/pkg/provisioner/templates/kubernetes_test.go +++ b/pkg/provisioner/templates/kubernetes_test.go @@ -2,11 +2,13 @@ package templates import ( "bytes" + "strings" "testing" "github.com/NVIDIA/holodeck/api/holodeck/v1alpha1" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestNewKubernetes(t *testing.T) { @@ -1195,3 +1197,59 @@ func TestKubernetes_DefaultConstants(t *testing.T) { assert.NotEmpty(t, defaultCRIVersion, "defaultCRIVersion must be set") assert.NotEmpty(t, defaultCalicoVersion, "defaultCalicoVersion must be set") } + +func TestKubeadmInit_HA_VerifiesLocalIPBeforeNLBSwitch(t *testing.T) { + // HA clusters use an NLB endpoint. After kubeadm init, the API server must + // be verified against the local private IP BEFORE switching admin.conf to + // the NLB endpoint. Otherwise kubectl version hits the NLB which hasn't + // passed health checks yet, exhausting all retry attempts. 
+	env := v1alpha1.Environment{
+		Spec: v1alpha1.EnvironmentSpec{
+			Kubernetes: v1alpha1.Kubernetes{
+				KubernetesVersion: "v1.33.0",
+			},
+			ContainerRuntime: v1alpha1.ContainerRuntime{
+				Name: "containerd",
+			},
+		},
+	}
+
+	cfg := &KubeadmInitConfig{
+		Environment:          &env,
+		ControlPlaneEndpoint: "my-nlb-1234567890.us-west-2.elb.amazonaws.com",
+		IsHA:                 true,
+	}
+
+	var buf bytes.Buffer
+	err := cfg.Execute(&buf)
+	require.NoError(t, err)
+
+	out := buf.String()
+
+	// The template must contain a local IP verification step
+	localVerifyMarker := `--server="https://${NODE_PRIVATE_IP}:6443" version`
+	nlbSwitchMarker := "Updating cluster config to use NLB endpoint"
+
+	assert.Contains(t, out, localVerifyMarker,
+		"HA init template must verify API server against local private IP")
+	assert.Contains(t, out, nlbSwitchMarker,
+		"HA init template must update cluster config to use NLB endpoint")
+
+	// Critical ordering: local verification MUST happen BEFORE the NLB switch
+	localVerifyPos := strings.Index(out, localVerifyMarker)
+	nlbSwitchPos := strings.Index(out, nlbSwitchMarker)
+	calicoInstallMarker := "Installing Calico"
+	calicoInstallPos := strings.Index(out, calicoInstallMarker)
+
+	assert.Greater(t, nlbSwitchPos, localVerifyPos,
+		"Local IP verification (pos %d) must happen BEFORE NLB endpoint switch (pos %d)",
+		localVerifyPos, nlbSwitchPos)
+
+	// Calico must be installed BEFORE the NLB switch — NLB health checks
+	// require a working CNI to pass.
+	// strings.Index returns -1 when absent; position 0 is valid, so the
+	// presence check must be >= 0, not > 0.
+	assert.GreaterOrEqual(t, calicoInstallPos, 0,
+		"Template must contain Calico installation")
+	assert.Greater(t, nlbSwitchPos, calicoInstallPos,
+		"Calico installation (pos %d) must happen BEFORE NLB endpoint switch (pos %d)",
+		calicoInstallPos, nlbSwitchPos)
+}