From 011d9191ded5e55443cb3a8cd9071490f9cf7aff Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Tue, 31 Mar 2026 11:32:43 +0200 Subject: [PATCH 1/3] fix: switch HA NLB to internal scheme to fix hairpin routing (#746) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The internet-facing NLB resolves to a public IP. When control-plane nodes connect to it after kubeconfig switchover, the hairpin routing (node → IGW → NLB → same node) is not supported by AWS NLBs, causing i/o timeouts on port 6443. Switch to an internal NLB which gets a private VPC IP, routing traffic directly within the VPC. Also remove the NLB DNS propagation wait since internal NLBs resolve immediately via VPC DNS. Fixes #746 Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provider/aws/nlb.go | 7 ++++--- pkg/provisioner/templates/kubeadm_cluster.go | 17 ----------------- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/pkg/provider/aws/nlb.go b/pkg/provider/aws/nlb.go index 4d880e48..6bd4e85d 100644 --- a/pkg/provider/aws/nlb.go +++ b/pkg/provider/aws/nlb.go @@ -54,15 +54,16 @@ func (p *Provider) createNLB(cache *ClusterCache) error { } lbName := nlbBaseName + nlbSuffix - // Use the public subnet for the internet-facing NLB + // Use the public subnet for the internal NLB (same subnet as instances) subnetIDs := []string{cache.PublicSubnetid} - // Create load balancer + // Create load balancer — internal scheme avoids hairpin routing issues + // where nodes connecting to the NLB's public IP get i/o timeouts createLBInput := &elasticloadbalancingv2.CreateLoadBalancerInput{ Name: aws.String(lbName), Type: lbType, Subnets: subnetIDs, - Scheme: elbv2types.LoadBalancerSchemeEnumInternetFacing, + Scheme: elbv2types.LoadBalancerSchemeEnumInternal, IpAddressType: elbv2types.IpAddressTypeIpv4, Tags: p.convertTagsToELBv2Tags(), } diff --git a/pkg/provisioner/templates/kubeadm_cluster.go b/pkg/provisioner/templates/kubeadm_cluster.go index 582469af..163f40c8 100644 --- a/pkg/provisioner/templates/kubeadm_cluster.go +++ b/pkg/provisioner/templates/kubeadm_cluster.go @@ -153,23 +153,6 @@ fi # Initialize cluster if [[ ! -f /etc/kubernetes/admin.conf ]]; then - # Wait for control-plane endpoint to be resolvable (NLB DNS may take time) - if [[ "$CONTROL_PLANE_ENDPOINT" == *"elb.amazonaws.com"* ]] || \ - [[ "$CONTROL_PLANE_ENDPOINT" == *"amazonaws.com"* ]]; then - holodeck_log "INFO" "$COMPONENT" "Waiting for NLB DNS to resolve: ${CONTROL_PLANE_ENDPOINT}" - for i in {1..30}; do - if host "${CONTROL_PLANE_ENDPOINT}" &>/dev/null || \ - getent hosts "${CONTROL_PLANE_ENDPOINT}" &>/dev/null; then - holodeck_log "INFO" "$COMPONENT" "NLB DNS resolved successfully" - break - fi - if [[ $i -eq 30 ]]; then - holodeck_log "WARN" "$COMPONENT" "NLB DNS not yet resolved after 5 min, proceeding anyway" - fi - sleep 10 - done - fi - INIT_ARGS=( --kubernetes-version="${K8S_VERSION}" --pod-network-cidr=192.168.0.0/16 From 3052a6d6f106af796291784d6408079308a94f7c Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Tue, 31 Mar 2026 15:13:13 +0200 Subject: [PATCH 2/3] fix: use local API server on CP nodes to avoid NLB hairpin (#746) AWS NLBs drop traffic when a registered target connects through the NLB and gets routed back to itself (hairpin/loopback). This happens on every control-plane node since they are all NLB targets. Stop patching admin.conf to use the NLB endpoint on the init node. On joining CP nodes, patch admin.conf to use localhost:6443 instead of the NLB. The kubeadm-config ConfigMap still points to the NLB so joining nodes and workers discover the correct endpoint. Fixes #746 Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provisioner/templates/kubeadm_cluster.go | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pkg/provisioner/templates/kubeadm_cluster.go b/pkg/provisioner/templates/kubeadm_cluster.go index 163f40c8..ca12a0aa 100644 --- a/pkg/provisioner/templates/kubeadm_cluster.go +++ b/pkg/provisioner/templates/kubeadm_cluster.go @@ -279,12 +279,10 @@ if [[ "$IS_HA" == "true" ]] && [[ "$INIT_ENDPOINT" != "$CONTROL_PLANE_ENDPOINT" sed "s|controlPlaneEndpoint: ${INIT_ESCAPED}:6443|controlPlaneEndpoint: ${CONTROL_PLANE_ENDPOINT}:6443|g" | \ sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf apply -f - || \ holodeck_log "WARN" "$COMPONENT" "Could not update kubeadm-config, join may need manual endpoint" - # Update admin.conf kubeconfig to use the NLB - sudo sed -i "s|server: https://${INIT_ESCAPED}:6443|server: https://${CONTROL_PLANE_ENDPOINT}:6443|g" \ - /etc/kubernetes/admin.conf - # Re-copy the updated admin.conf to user kubeconfig - sudo cp -f /etc/kubernetes/admin.conf "$HOME/.kube/config" - sudo chown "$(id -u):$(id -g)" "$HOME/.kube/config" + # NOTE: Do NOT patch admin.conf to use the NLB endpoint. + # CP nodes must use their local API server (localhost:6443) to avoid + # AWS NLB hairpin routing — NLBs drop traffic when a registered target + # connects through the NLB and gets routed back to itself. fi # Label this node as control-plane (keep the taint for multinode) @@ -371,6 +369,11 @@ holodeck_progress "$COMPONENT" 4 4 "Configuring node" # Setup kubeconfig for control-plane nodes if [[ "$IS_CONTROL_PLANE" == "true" ]]; then + # Patch admin.conf to use the local API server instead of the NLB endpoint. + # AWS NLBs drop hairpin traffic (target connects through NLB back to itself), + # so CP nodes must talk to their own local kube-apiserver. + sudo sed -i 's|server: https://.*:6443|server: https://localhost:6443|g' \ + /etc/kubernetes/admin.conf mkdir -p "$HOME/.kube" sudo cp -f /etc/kubernetes/admin.conf "$HOME/.kube/config" sudo chown "$(id -u):$(id -g)" "$HOME/.kube/config" From 1c12d22b77baf075a692e073664d48b5d4bf9797 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Tue, 31 Mar 2026 15:59:08 +0200 Subject: [PATCH 3/3] fix: add NLB cleanup to periodic VPC cleaner The periodic cleanup utility (pkg/cleanup) only handles EC2 resources (instances, security groups, subnets, IGW, VPC). When HA clusters create NLBs, the NLB ENIs in subnets block subnet and VPC deletion with DependencyViolation errors. Add ELBv2 client to the Cleaner and delete load balancers (with their listeners and target groups) as the first step in VPC cleanup, before instance termination. Include a 30s wait after NLB deletion for ENI detachment. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/cleanup/cleanup.go | 125 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 120 insertions(+), 5 deletions(-) diff --git a/pkg/cleanup/cleanup.go b/pkg/cleanup/cleanup.go index 60b99015..d221e7ab 100644 --- a/pkg/cleanup/cleanup.go +++ b/pkg/cleanup/cleanup.go @@ -29,6 +29,7 @@ import ( "github.com/aws/aws-sdk-go-v2/config" "github.com/aws/aws-sdk-go-v2/service/ec2" "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2" internalaws "github.com/NVIDIA/holodeck/internal/aws" "github.com/NVIDIA/holodeck/internal/logger" @@ -60,8 +61,9 @@ func safeString(s *string) string { // Cleaner handles cleanup of AWS resources type Cleaner struct { - ec2 internalaws.EC2Client - log *logger.FunLogger + ec2 internalaws.EC2Client + elbv2 internalaws.ELBv2Client + log *logger.FunLogger } // CleanerOption is a functional option for configuring the Cleaner. @@ -75,6 +77,14 @@ func WithEC2Client(client internalaws.EC2Client) CleanerOption { } } +// WithELBv2Client sets a custom ELBv2 client for the Cleaner. +// This is primarily used for testing to inject mock clients. +func WithELBv2Client(client internalaws.ELBv2Client) CleanerOption { + return func(c *Cleaner) { + c.elbv2 = client + } +} + // New creates a new AWS resource cleaner. // Optional functional options can be provided to customize the cleaner, // such as injecting a mock EC2 client for testing. @@ -89,8 +99,8 @@ func New(log *logger.FunLogger, region string, opt(c) } - // If no EC2 client was injected, create the real one - if c.ec2 == nil { + // If no clients were injected, create real ones from shared config + if c.ec2 == nil || c.elbv2 == nil { // Use Background here because New is a top-level initializer // without caller-provided context. cfg, err := config.LoadDefaultConfig(context.Background(), @@ -98,7 +108,12 @@ func New(log *logger.FunLogger, region string, if err != nil { return nil, fmt.Errorf("failed to load AWS config: %w", err) } - c.ec2 = ec2.NewFromConfig(cfg) + if c.ec2 == nil { + c.ec2 = ec2.NewFromConfig(cfg) + } + if c.elbv2 == nil { + c.elbv2 = elasticloadbalancingv2.NewFromConfig(cfg) + } } return c, nil @@ -240,6 +255,15 @@ func (c *Cleaner) DeleteVPCResources(ctx context.Context, vpcID string) error { return fmt.Errorf("cleanup cancelled: %w", err) } + // Delete load balancers (NLBs/ALBs) — must happen before subnets/IGW/VPC + if err := c.deleteLoadBalancers(ctx, vpcID); err != nil { + c.log.Warning("Failed to delete load balancers: %v", err) + } + + if err := ctx.Err(); err != nil { + return fmt.Errorf("cleanup cancelled after load balancer deletion: %w", err) + } + // Delete instances if err := c.deleteInstances(ctx, vpcID); err != nil { return fmt.Errorf("failed to delete instances: %w", err) @@ -619,3 +643,94 @@ func (c *Cleaner) deleteVPC(ctx context.Context, vpcID string) error { return fmt.Errorf("failed to delete VPC %s after %d attempts", vpcID, maxAttempts) } + +func (c *Cleaner) deleteLoadBalancers(ctx context.Context, vpcID string) error { + // Describe all load balancers and filter by VPC + describeInput := &elasticloadbalancingv2.DescribeLoadBalancersInput{} + result, err := c.elbv2.DescribeLoadBalancers(ctx, describeInput) + if err != nil { + return fmt.Errorf("failed to describe load balancers: %w", err) + } + + var count int + for _, lb := range result.LoadBalancers { + if lb.VpcId == nil || *lb.VpcId != vpcID { + continue + } + + lbArn := aws.ToString(lb.LoadBalancerArn) + lbName := aws.ToString(lb.LoadBalancerName) + c.log.Info("Deleting load balancer %s (%s)", lbName, lbArn) + + // Delete listeners + if err := c.deleteLBListeners(ctx, lbArn); err != nil { + c.log.Warning("Failed to delete listeners for %s: %v", lbName, err) + } + + // Delete target groups + if err := c.deleteLBTargetGroups(ctx, lbArn); err != nil { + c.log.Warning("Failed to delete target groups for %s: %v", lbName, err) + } + + // Delete the load balancer + _, err := c.elbv2.DeleteLoadBalancer(ctx, &elasticloadbalancingv2.DeleteLoadBalancerInput{ + LoadBalancerArn: aws.String(lbArn), + }) + if err != nil { + c.log.Warning("Failed to delete load balancer %s: %v", lbName, err) + continue + } + count++ + } + + if count > 0 { + c.log.Info("Deleted %d load balancer(s), waiting for ENIs to detach", count) + // NLBs take time to fully decommission and release ENIs. + // Wait briefly to allow ENI cleanup before subnet deletion. + select { + case <-ctx.Done(): + return fmt.Errorf("cancelled waiting for LB decommission: %w", ctx.Err()) + case <-time.After(30 * time.Second): + } + } + + return nil +} + +func (c *Cleaner) deleteLBListeners(ctx context.Context, lbArn string) error { + result, err := c.elbv2.DescribeListeners(ctx, &elasticloadbalancingv2.DescribeListenersInput{ + LoadBalancerArn: aws.String(lbArn), + }) + if err != nil { + return fmt.Errorf("failed to describe listeners: %w", err) + } + + for _, listener := range result.Listeners { + _, err := c.elbv2.DeleteListener(ctx, &elasticloadbalancingv2.DeleteListenerInput{ + ListenerArn: listener.ListenerArn, + }) + if err != nil { + c.log.Warning("Failed to delete listener %s: %v", safeString(listener.ListenerArn), err) + } + } + return nil +} + +func (c *Cleaner) deleteLBTargetGroups(ctx context.Context, lbArn string) error { + result, err := c.elbv2.DescribeTargetGroups(ctx, &elasticloadbalancingv2.DescribeTargetGroupsInput{ + LoadBalancerArn: aws.String(lbArn), + }) + if err != nil { + return fmt.Errorf("failed to describe target groups: %w", err) + } + + for _, tg := range result.TargetGroups { + _, err := c.elbv2.DeleteTargetGroup(ctx, &elasticloadbalancingv2.DeleteTargetGroupInput{ + TargetGroupArn: tg.TargetGroupArn, + }) + if err != nil { + c.log.Warning("Failed to delete target group %s: %v", safeString(tg.TargetGroupArn), err) + } + } + return nil +}