125 changes: 120 additions & 5 deletions pkg/cleanup/cleanup.go
@@ -29,6 +29,7 @@ import (
"github.com/aws/aws-sdk-go-v2/config"
"github.com/aws/aws-sdk-go-v2/service/ec2"
"github.com/aws/aws-sdk-go-v2/service/ec2/types"
"github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2"

internalaws "github.com/NVIDIA/holodeck/internal/aws"
"github.com/NVIDIA/holodeck/internal/logger"
@@ -60,8 +61,9 @@ func safeString(s *string) string {

// Cleaner handles cleanup of AWS resources
type Cleaner struct {
ec2 internalaws.EC2Client
log *logger.FunLogger
ec2 internalaws.EC2Client
elbv2 internalaws.ELBv2Client
log *logger.FunLogger
}

// CleanerOption is a functional option for configuring the Cleaner.
@@ -75,6 +77,14 @@ func WithEC2Client(client internalaws.EC2Client) CleanerOption {
}
}

// WithELBv2Client sets a custom ELBv2 client for the Cleaner.
// This is primarily used for testing to inject mock clients.
func WithELBv2Client(client internalaws.ELBv2Client) CleanerOption {
return func(c *Cleaner) {
c.elbv2 = client
}
}
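
For context, a minimal sketch of how a test might use this option. The stub type below is illustrative, not part of this PR; it assumes internalaws.ELBv2Client is an interface covering the ELBv2 calls Cleaner makes, so embedding it satisfies the methods left unstubbed:

// stubELBv2 is a hypothetical test double. Embedding the interface
// satisfies the methods we don't stub; calling one of those would panic.
type stubELBv2 struct {
	internalaws.ELBv2Client
	describeCalls int
}

func (s *stubELBv2) DescribeLoadBalancers(
	ctx context.Context,
	in *elasticloadbalancingv2.DescribeLoadBalancersInput,
	_ ...func(*elasticloadbalancingv2.Options),
) (*elasticloadbalancingv2.DescribeLoadBalancersOutput, error) {
	s.describeCalls++
	// Report no load balancers so cleanup has nothing to delete.
	return &elasticloadbalancingv2.DescribeLoadBalancersOutput{}, nil
}

// In a test: c, err := New(log, "us-east-1", WithELBv2Client(&stubELBv2{}))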

// New creates a new AWS resource cleaner.
// Optional functional options can be provided to customize the cleaner,
// such as injecting a mock EC2 client for testing.
@@ -89,16 +99,21 @@ func New(log *logger.FunLogger, region string,
opt(c)
}

// If no EC2 client was injected, create the real one
if c.ec2 == nil {
// If no clients were injected, create real ones from shared config
if c.ec2 == nil || c.elbv2 == nil {
// Use Background here because New is a top-level initializer
// without caller-provided context.
cfg, err := config.LoadDefaultConfig(context.Background(),
config.WithRegion(region))
if err != nil {
return nil, fmt.Errorf("failed to load AWS config: %w", err)
}
c.ec2 = ec2.NewFromConfig(cfg)
if c.ec2 == nil {
c.ec2 = ec2.NewFromConfig(cfg)
}
if c.elbv2 == nil {
c.elbv2 = elasticloadbalancingv2.NewFromConfig(cfg)
}
}

return c, nil
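
At a call site the options are typically omitted and real clients are built from the shared config. A sketch only; the logger variable and VPC ID are placeholders:

c, err := New(log, "us-west-2")
if err != nil {
	return fmt.Errorf("failed to create cleaner: %w", err)
}
if err := c.DeleteVPCResources(ctx, "vpc-0123456789abcdef0"); err != nil {
	return fmt.Errorf("failed to clean up VPC resources: %w", err)
}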
@@ -240,6 +255,15 @@ func (c *Cleaner) DeleteVPCResources(ctx context.Context, vpcID string) error {
return fmt.Errorf("cleanup cancelled: %w", err)
}

// Delete load balancers (NLBs/ALBs) — must happen before subnets/IGW/VPC
if err := c.deleteLoadBalancers(ctx, vpcID); err != nil {
c.log.Warning("Failed to delete load balancers: %v", err)
}

if err := ctx.Err(); err != nil {
return fmt.Errorf("cleanup cancelled after load balancer deletion: %w", err)
}

// Delete instances
if err := c.deleteInstances(ctx, vpcID); err != nil {
return fmt.Errorf("failed to delete instances: %w", err)
@@ -619,3 +643,94 @@ func (c *Cleaner) deleteVPC(ctx context.Context, vpcID string) error {

return fmt.Errorf("failed to delete VPC %s after %d attempts", vpcID, maxAttempts)
}

func (c *Cleaner) deleteLoadBalancers(ctx context.Context, vpcID string) error {
// Describe all load balancers and filter by VPC
describeInput := &elasticloadbalancingv2.DescribeLoadBalancersInput{}
result, err := c.elbv2.DescribeLoadBalancers(ctx, describeInput)
if err != nil {
return fmt.Errorf("failed to describe load balancers: %w", err)
}

var count int
for _, lb := range result.LoadBalancers {
if lb.VpcId == nil || *lb.VpcId != vpcID {
continue
}

lbArn := aws.ToString(lb.LoadBalancerArn)
lbName := aws.ToString(lb.LoadBalancerName)
c.log.Info("Deleting load balancer %s (%s)", lbName, lbArn)

// Delete listeners
if err := c.deleteLBListeners(ctx, lbArn); err != nil {
c.log.Warning("Failed to delete listeners for %s: %v", lbName, err)
}

// Delete target groups
if err := c.deleteLBTargetGroups(ctx, lbArn); err != nil {
c.log.Warning("Failed to delete target groups for %s: %v", lbName, err)
}

// Delete the load balancer
_, err := c.elbv2.DeleteLoadBalancer(ctx, &elasticloadbalancingv2.DeleteLoadBalancerInput{
LoadBalancerArn: aws.String(lbArn),
})
if err != nil {
c.log.Warning("Failed to delete load balancer %s: %v", lbName, err)
continue
}
count++
}

if count > 0 {
c.log.Info("Deleted %d load balancer(s), waiting for ENIs to detach", count)
// NLBs take time to fully decommission and release ENIs.
// Wait briefly to allow ENI cleanup before subnet deletion.
select {
case <-ctx.Done():
return fmt.Errorf("cancelled waiting for LB decommission: %w", ctx.Err())
case <-time.After(30 * time.Second):
}
}

return nil
}
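
The fixed 30-second wait is a heuristic. If subnet deletion still races the ENI release, polling is one alternative. The sketch below is an assumption-laden illustration, not part of this PR: it presumes the EC2Client interface exposes DescribeNetworkInterfaces, and relies on the AWS convention that load-balancer-managed ENIs carry descriptions beginning "ELB net/" (NLB) or "ELB app/" (ALB):

// waitForLBENIs polls until no load-balancer-managed ENIs remain in the VPC.
// Callers should pass a context with a deadline, since this loops until
// cancelled or the ENIs disappear.
func (c *Cleaner) waitForLBENIs(ctx context.Context, vpcID string) error {
	for {
		out, err := c.ec2.DescribeNetworkInterfaces(ctx, &ec2.DescribeNetworkInterfacesInput{
			Filters: []types.Filter{
				{Name: aws.String("vpc-id"), Values: []string{vpcID}},
				{Name: aws.String("description"), Values: []string{"ELB net/*", "ELB app/*"}},
			},
		})
		if err != nil {
			return fmt.Errorf("failed to describe load balancer ENIs: %w", err)
		}
		if len(out.NetworkInterfaces) == 0 {
			return nil // all LB-managed ENIs released
		}
		select {
		case <-ctx.Done():
			return fmt.Errorf("cancelled waiting for LB ENIs: %w", ctx.Err())
		case <-time.After(10 * time.Second):
		}
	}
}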

func (c *Cleaner) deleteLBListeners(ctx context.Context, lbArn string) error {
result, err := c.elbv2.DescribeListeners(ctx, &elasticloadbalancingv2.DescribeListenersInput{
LoadBalancerArn: aws.String(lbArn),
})
if err != nil {
return fmt.Errorf("failed to describe listeners: %w", err)
}

for _, listener := range result.Listeners {
_, err := c.elbv2.DeleteListener(ctx, &elasticloadbalancingv2.DeleteListenerInput{
ListenerArn: listener.ListenerArn,
})
if err != nil {
c.log.Warning("Failed to delete listener %s: %v", safeString(listener.ListenerArn), err)
}
}
return nil
}

func (c *Cleaner) deleteLBTargetGroups(ctx context.Context, lbArn string) error {
result, err := c.elbv2.DescribeTargetGroups(ctx, &elasticloadbalancingv2.DescribeTargetGroupsInput{
LoadBalancerArn: aws.String(lbArn),
})
if err != nil {
return fmt.Errorf("failed to describe target groups: %w", err)
}

for _, tg := range result.TargetGroups {
_, err := c.elbv2.DeleteTargetGroup(ctx, &elasticloadbalancingv2.DeleteTargetGroupInput{
TargetGroupArn: tg.TargetGroupArn,
})
if err != nil {
c.log.Warning("Failed to delete target group %s: %v", safeString(tg.TargetGroupArn), err)
}
}
return nil
}
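
One caveat: DescribeLoadBalancers is paginated, so the single call in deleteLoadBalancers only sees the first page in accounts with many load balancers. The SDK's paginator can harden this. A sketch; whether c.elbv2 satisfies the paginator's DescribeLoadBalancersAPIClient interface depends on how internalaws.ELBv2Client is defined:

paginator := elasticloadbalancingv2.NewDescribeLoadBalancersPaginator(
	c.elbv2, &elasticloadbalancingv2.DescribeLoadBalancersInput{})
for paginator.HasMorePages() {
	page, err := paginator.NextPage(ctx)
	if err != nil {
		return fmt.Errorf("failed to describe load balancers: %w", err)
	}
	for _, lb := range page.LoadBalancers {
		// ...filter by vpcID and delete, as in deleteLoadBalancers above...
	}
}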
7 changes: 4 additions & 3 deletions pkg/provider/aws/nlb.go
@@ -54,15 +54,16 @@ func (p *Provider) createNLB(cache *ClusterCache) error {
}
lbName := nlbBaseName + nlbSuffix

// Use the public subnet for the internet-facing NLB
// Use the public subnet for the internal NLB (same subnet as instances)
subnetIDs := []string{cache.PublicSubnetid}

// Create load balancer
// Create load balancer with the internal scheme to avoid hairpin routing
// issues, where nodes connecting to the NLB's public IP would get i/o timeouts
createLBInput := &elasticloadbalancingv2.CreateLoadBalancerInput{
Name: aws.String(lbName),
Type: lbType,
Subnets: subnetIDs,
Scheme: elbv2types.LoadBalancerSchemeEnumInternetFacing,
Scheme: elbv2types.LoadBalancerSchemeEnumInternal,
IpAddressType: elbv2types.IpAddressTypeIpv4,
Tags: p.convertTagsToELBv2Tags(),
}
32 changes: 9 additions & 23 deletions pkg/provisioner/templates/kubeadm_cluster.go
@@ -153,23 +153,6 @@ fi

# Initialize cluster
if [[ ! -f /etc/kubernetes/admin.conf ]]; then
# Wait for control-plane endpoint to be resolvable (NLB DNS may take time)
if [[ "$CONTROL_PLANE_ENDPOINT" == *"elb.amazonaws.com"* ]] || \
[[ "$CONTROL_PLANE_ENDPOINT" == *"amazonaws.com"* ]]; then
holodeck_log "INFO" "$COMPONENT" "Waiting for NLB DNS to resolve: ${CONTROL_PLANE_ENDPOINT}"
for i in {1..30}; do
if host "${CONTROL_PLANE_ENDPOINT}" &>/dev/null || \
getent hosts "${CONTROL_PLANE_ENDPOINT}" &>/dev/null; then
holodeck_log "INFO" "$COMPONENT" "NLB DNS resolved successfully"
break
fi
if [[ $i -eq 30 ]]; then
holodeck_log "WARN" "$COMPONENT" "NLB DNS not yet resolved after 5 min, proceeding anyway"
fi
sleep 10
done
fi

INIT_ARGS=(
--kubernetes-version="${K8S_VERSION}"
--pod-network-cidr=192.168.0.0/16
@@ -296,12 +279,10 @@ if [[ "$IS_HA" == "true" ]] && [[ "$INIT_ENDPOINT" != "$CONTROL_PLANE_ENDPOINT"
sed "s|controlPlaneEndpoint: ${INIT_ESCAPED}:6443|controlPlaneEndpoint: ${CONTROL_PLANE_ENDPOINT}:6443|g" | \
sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf apply -f - || \
holodeck_log "WARN" "$COMPONENT" "Could not update kubeadm-config, join may need manual endpoint"
# Update admin.conf kubeconfig to use the NLB
sudo sed -i "s|server: https://${INIT_ESCAPED}:6443|server: https://${CONTROL_PLANE_ENDPOINT}:6443|g" \
/etc/kubernetes/admin.conf
# Re-copy the updated admin.conf to user kubeconfig
sudo cp -f /etc/kubernetes/admin.conf "$HOME/.kube/config"
sudo chown "$(id -u):$(id -g)" "$HOME/.kube/config"
# NOTE: Do NOT patch admin.conf to use the NLB endpoint.
# CP nodes must use their local API server (localhost:6443) to avoid
# AWS NLB hairpin routing — NLBs drop traffic when a registered target
# connects through the NLB and gets routed back to itself.
fi

# Label this node as control-plane (keep the taint for multinode)
@@ -388,6 +369,11 @@ holodeck_progress "$COMPONENT" 4 4 "Configuring node"

# Setup kubeconfig for control-plane nodes
if [[ "$IS_CONTROL_PLANE" == "true" ]]; then
# Patch admin.conf to use the local API server instead of the NLB endpoint.
# AWS NLBs drop hairpin traffic (target connects through NLB back to itself),
# so CP nodes must talk to their own local kube-apiserver.
sudo sed -i 's|server: https://.*:6443|server: https://localhost:6443|g' \
/etc/kubernetes/admin.conf
mkdir -p "$HOME/.kube"
sudo cp -f /etc/kubernetes/admin.conf "$HOME/.kube/config"
sudo chown "$(id -u):$(id -g)" "$HOME/.kube/config"