Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 42 additions & 29 deletions pkg/provisioner/templates/kubeadm_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,21 @@ sudo systemctl enable --now kubelet

holodeck_progress "$COMPONENT" 5 8 "Initializing Kubernetes cluster"

# Primary private address of this node: the first entry printed by hostname -I.
# Resolved at top level rather than inside the init guard, because later steps
# (API server verification, NLB endpoint switch) read it even when init is skipped.
NODE_PRIVATE_IP="$(hostname -I | awk '{print $1}')"

# kubeadm v1.33+ health-checks the API server through the control-plane
# endpoint, which may not be routable from inside the instance while init is
# running (public IPs, NLB DNS, etc.). Start from the configured endpoint and
# fall back to the private IP whenever the two differ; the configured endpoint
# is still added to the cert SANs so external access keeps working.
INIT_ENDPOINT="${CONTROL_PLANE_ENDPOINT}"
if [[ "${NODE_PRIVATE_IP}" != "${CONTROL_PLANE_ENDPOINT}" ]]; then
    INIT_ENDPOINT="${NODE_PRIVATE_IP}"
    holodeck_log "INFO" "$COMPONENT" "Using local IP ${NODE_PRIVATE_IP} for init (endpoint: ${CONTROL_PLANE_ENDPOINT} in cert SANs)"
fi

# Initialize cluster
if [[ ! -f /etc/kubernetes/admin.conf ]]; then
# Wait for control-plane endpoint to be resolvable (NLB DNS may take time)
Expand All @@ -155,20 +170,6 @@ if [[ ! -f /etc/kubernetes/admin.conf ]]; then
done
fi

# Detect this node's private IP for API server binding
NODE_PRIVATE_IP=$(hostname -I | awk '{print $1}')

# Always use local IP for init health checks: kubeadm v1.33+ validates the API
# server via control-plane-endpoint, which may not be routable from within the
# instance during init (public IPs, NLB DNS, etc.). Use private IP for init and
# include the original endpoint in cert SANs so external access works.
if [[ "$CONTROL_PLANE_ENDPOINT" != "$NODE_PRIVATE_IP" ]]; then
INIT_ENDPOINT="${NODE_PRIVATE_IP}"
holodeck_log "INFO" "$COMPONENT" "Using local IP ${NODE_PRIVATE_IP} for init (endpoint: ${CONTROL_PLANE_ENDPOINT} in cert SANs)"
else
INIT_ENDPOINT="${CONTROL_PLANE_ENDPOINT}"
fi

INIT_ARGS=(
--kubernetes-version="${K8S_VERSION}"
--pod-network-cidr=192.168.0.0/16
Expand All @@ -186,30 +187,21 @@ if [[ ! -f /etc/kubernetes/admin.conf ]]; then
holodeck_log "INFO" "$COMPONENT" "Running kubeadm init with args: ${INIT_ARGS[*]}"
holodeck_retry 3 "$COMPONENT" sudo kubeadm init "${INIT_ARGS[@]}"

# For HA with NLB: after init succeeds, update the cluster config to use NLB DNS
# so that join tokens reference the NLB endpoint (reachable by other nodes).
if [[ "$IS_HA" == "true" ]] && [[ "$INIT_ENDPOINT" != "$CONTROL_PLANE_ENDPOINT" ]]; then
holodeck_log "INFO" "$COMPONENT" "Updating cluster config to use NLB endpoint: ${CONTROL_PLANE_ENDPOINT}:6443"
# Update the kubeadm-config ConfigMap
sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf -n kube-system get configmap kubeadm-config -o yaml | \
sed "s|controlPlaneEndpoint: ${INIT_ENDPOINT}:6443|controlPlaneEndpoint: ${CONTROL_PLANE_ENDPOINT}:6443|g" | \
sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf apply -f - || \
holodeck_log "WARN" "$COMPONENT" "Could not update kubeadm-config, join may need manual endpoint"
# Also update admin.conf kubeconfig to use the NLB
sudo sed -i "s|server: https://${INIT_ENDPOINT}:6443|server: https://${CONTROL_PLANE_ENDPOINT}:6443|g" \
/etc/kubernetes/admin.conf
fi
fi

# Setup kubeconfig
# Setup kubeconfig (still points at local IP for HA; NLB switch happens after verification)
mkdir -p "$HOME/.kube"
sudo cp -f /etc/kubernetes/admin.conf "$HOME/.kube/config"
sudo chown "$(id -u):$(id -g)" "$HOME/.kube/config"
export KUBECONFIG="${HOME}/.kube/config"

holodeck_progress "$COMPONENT" 6 8 "Waiting for API server"

holodeck_retry 10 "$COMPONENT" kubectl --kubeconfig "$KUBECONFIG" version
# Verify the API server against the local private IP first. For HA clusters,
# admin.conf still points at the local IP at this stage. For non-HA clusters the
# explicit --server override is redundant, since KUBECONFIG already targets a
# reachable endpoint.
holodeck_retry 10 "$COMPONENT" kubectl --kubeconfig "$KUBECONFIG" \
--server="https://${NODE_PRIVATE_IP}:6443" version
Comment on lines +200 to +204

holodeck_progress "$COMPONENT" 7 8 "Installing Calico CNI"

Expand Down Expand Up @@ -291,6 +283,27 @@ holodeck_retry 10 "$COMPONENT" kubectl --kubeconfig "$KUBECONFIG" wait \

holodeck_progress "$COMPONENT" 8 8 "Finalizing cluster configuration"

# For HA with NLB: now that Calico is running and the cluster is fully functional,
# switch the cluster config to use the NLB DNS so that join tokens reference the
# NLB endpoint (reachable by other nodes). This MUST happen after Calico — the NLB
# health checks require a working CNI to pass.
# The guard skips this step for non-HA clusters and when init already ran against
# the control-plane endpoint itself (INIT_ENDPOINT == CONTROL_PLANE_ENDPOINT).
if [[ "$IS_HA" == "true" ]] && [[ "$INIT_ENDPOINT" != "$CONTROL_PLANE_ENDPOINT" ]]; then
# Escape dots in INIT_ENDPOINT for safe sed regex matching (IPs contain literal dots)
INIT_ESCAPED=$(echo "$INIT_ENDPOINT" | sed 's/\./\\./g')
holodeck_log "INFO" "$COMPONENT" "Updating cluster config to use NLB endpoint: ${CONTROL_PLANE_ENDPOINT}:6443"
# Update the kubeadm-config ConfigMap: dump it as YAML, rewrite the
# controlPlaneEndpoint value from the init address to the NLB endpoint, and
# re-apply it. If the pipeline reports failure, only a warning is logged.
# NOTE(review): CONTROL_PLANE_ENDPOINT is inserted verbatim into the sed
# replacement; presumably it is always a DNS name or IP (no "|" or "&") — confirm.
sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf -n kube-system get configmap kubeadm-config -o yaml | \
sed "s|controlPlaneEndpoint: ${INIT_ESCAPED}:6443|controlPlaneEndpoint: ${CONTROL_PLANE_ENDPOINT}:6443|g" | \
sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf apply -f - || \
holodeck_log "WARN" "$COMPONENT" "Could not update kubeadm-config, join may need manual endpoint"
# Update admin.conf kubeconfig to use the NLB (in-place rewrite of the server URL).
# NOTE(review): unlike the ConfigMap update above, this sed has no explicit
# failure handling here — confirm that is intended.
sudo sed -i "s|server: https://${INIT_ESCAPED}:6443|server: https://${CONTROL_PLANE_ENDPOINT}:6443|g" \
/etc/kubernetes/admin.conf
# Re-copy the updated admin.conf to user kubeconfig so that $HOME/.kube/config
# carries the same server URL, then hand ownership back to the invoking user
# (the copy is made with sudo).
sudo cp -f /etc/kubernetes/admin.conf "$HOME/.kube/config"
sudo chown "$(id -u):$(id -g)" "$HOME/.kube/config"
fi

# Label this node as control-plane (keep the taint for multinode)
kubectl label node --all nvidia.com/holodeck.managed=true --overwrite 2>/dev/null || true

Expand Down
58 changes: 58 additions & 0 deletions pkg/provisioner/templates/kubernetes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@ package templates

import (
"bytes"
"strings"
"testing"

"github.com/NVIDIA/holodeck/api/holodeck/v1alpha1"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

func TestNewKubernetes(t *testing.T) {
Expand Down Expand Up @@ -1195,3 +1197,59 @@ func TestKubernetes_DefaultConstants(t *testing.T) {
assert.NotEmpty(t, defaultCRIVersion, "defaultCRIVersion must be set")
assert.NotEmpty(t, defaultCalicoVersion, "defaultCalicoVersion must be set")
}

func TestKubeadmInit_HA_VerifiesLocalIPBeforeNLBSwitch(t *testing.T) {
// HA clusters use an NLB endpoint. After kubeadm init, the API server must
// be verified against the local private IP BEFORE switching admin.conf to
// the NLB endpoint. Otherwise kubectl version hits the NLB which hasn't
// passed health checks yet, exhausting all retry attempts.
Comment on lines +1201 to +1205
env := v1alpha1.Environment{
Spec: v1alpha1.EnvironmentSpec{
Kubernetes: v1alpha1.Kubernetes{
KubernetesVersion: "v1.33.0",
},
ContainerRuntime: v1alpha1.ContainerRuntime{
Name: "containerd",
},
},
}

cfg := &KubeadmInitConfig{
Environment: &env,
ControlPlaneEndpoint: "my-nlb-1234567890.us-west-2.elb.amazonaws.com",
IsHA: true,
}

var buf bytes.Buffer
err := cfg.Execute(&buf)
require.NoError(t, err)

out := buf.String()

// The template must contain a local IP verification step
localVerifyMarker := `--server="https://${NODE_PRIVATE_IP}:6443" version`
nlbSwitchMarker := "Updating cluster config to use NLB endpoint"

assert.Contains(t, out, localVerifyMarker,
"HA init template must verify API server against local private IP")
assert.Contains(t, out, nlbSwitchMarker,
"HA init template must update cluster config to use NLB endpoint")

// Critical ordering: local verification MUST happen BEFORE the NLB switch
localVerifyPos := strings.Index(out, localVerifyMarker)
nlbSwitchPos := strings.Index(out, nlbSwitchMarker)
calicoInstallMarker := "Installing Calico"
calicoInstallPos := strings.Index(out, calicoInstallMarker)
Comment on lines +1233 to +1242

assert.Greater(t, nlbSwitchPos, localVerifyPos,
"Local IP verification (pos %d) must happen BEFORE NLB endpoint switch (pos %d)",
localVerifyPos, nlbSwitchPos)

// Calico must be installed BEFORE the NLB switch — NLB health checks
// require a working CNI to pass.
assert.Greater(t, calicoInstallPos, 0,
"Template must contain Calico installation")
assert.Greater(t, nlbSwitchPos, calicoInstallPos,
"Calico installation (pos %d) must happen BEFORE NLB endpoint switch (pos %d)",
calicoInstallPos, nlbSwitchPos)
}
Loading