Skip to content
Merged
2 changes: 1 addition & 1 deletion .github/workflows/periodic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ jobs:

- name: Clean up VPCs
if: steps.identify-resources.outputs.AWS_VPC_IDS != ''
uses: NVIDIA/holodeck@v0.3.3
uses: NVIDIA/holodeck@v0.3.4
with:
action: cleanup
vpc_ids: ${{ steps.identify-resources.outputs.AWS_VPC_IDS }}
Expand Down
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@

All notable changes to this project will be documented in this file.

## [v0.3.4] - 2026-04-01

### Bug Fixes

- **fix: handle InvalidInternetGatewayID.NotFound in IGW detach** — When an Internet Gateway is already deleted, the detach step now recognizes `InvalidInternetGatewayID.NotFound` alongside `Gateway.NotAttached` and skips retries, fixing cleanup hangs where the IGW was deleted out-of-band.
- **fix: handle NotFound errors in NLB/listener/target-group deletion** — All NLB cleanup paths now check for `LoadBalancerNotFound`, `ListenerNotFound`, and `TargetGroupNotFound` before retrying, treating already-deleted resources as success.
- **fix: add SSH keepalive and handshake timeout** — SSH connections now send keepalive probes every 30 seconds to prevent session drops during long operations (e.g., `kubeadm init`). A 15-second handshake timeout prevents `connectOrDie` from blocking indefinitely against hosts that accept TCP but never complete the SSH handshake.
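- **fix: suppress NotFound warnings in cleanup deleteInternetGateways** — The periodic cleanup job no longer logs misleading "Failed to detach/delete internet gateway" warnings when an IGW is already gone (`InvalidInternetGatewayID.NotFound`) or already detached (`Gateway.NotAttached`). The same suppression applies to `InvalidGroup.NotFound`, `InvalidSubnetID.NotFound`, and `InvalidRouteTableID.NotFound` when deleting security groups, subnets, and route tables.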

## [v0.3.3] - 2026-04-01

### Bug Fixes
Expand Down
2 changes: 1 addition & 1 deletion cmd/cli/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ const (
// ProgramName is the canonical name of this program
ProgramName = "holodeck"
// ProgramVersion is the current version of the program
ProgramVersion = "0.3.3"
ProgramVersion = "0.3.4"
)

type config struct {
Expand Down
4 changes: 2 additions & 2 deletions cmd/cli/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ func TestNewApp(t *testing.T) {
log := logger.NewLogger()
app := NewApp(log)

if app.Version != "0.3.3" {
t.Errorf("expected app version %q, got %q", "0.3.3", app.Version)
if app.Version != "0.3.4" {
t.Errorf("expected app version %q, got %q", "0.3.4", app.Version)
}
if app.Name != "holodeck" {
t.Errorf("expected app name %q, got %q", "holodeck", app.Name)
Expand Down
22 changes: 17 additions & 5 deletions pkg/cleanup/cleanup.go
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,9 @@ func (c *Cleaner) deleteSecurityGroups(ctx context.Context, vpcID string) error

_, err = c.ec2.DeleteSecurityGroup(ctx, deleteInput)
if err != nil {
c.log.Warning("Failed to delete security group %s: %v", safeString(sg.GroupId), err)
if !strings.Contains(err.Error(), "InvalidGroup.NotFound") {
c.log.Warning("Failed to delete security group %s: %v", safeString(sg.GroupId), err)
}
}
}

Expand Down Expand Up @@ -512,7 +514,9 @@ func (c *Cleaner) deleteSubnets(ctx context.Context, vpcID string) error {

_, err = c.ec2.DeleteSubnet(ctx, deleteInput)
if err != nil {
c.log.Warning("Failed to delete subnet %s: %v", safeString(subnet.SubnetId), err)
if !strings.Contains(err.Error(), "InvalidSubnetID.NotFound") {
c.log.Warning("Failed to delete subnet %s: %v", safeString(subnet.SubnetId), err)
}
}
}

Expand Down Expand Up @@ -583,7 +587,9 @@ func (c *Cleaner) deleteRouteTables(ctx context.Context, vpcID string) error {

_, err = c.ec2.DeleteRouteTable(ctx, deleteInput)
if err != nil {
c.log.Warning("Failed to delete route table %s: %v", safeString(rt.RouteTableId), err)
if !strings.Contains(err.Error(), "InvalidRouteTableID.NotFound") {
c.log.Warning("Failed to delete route table %s: %v", safeString(rt.RouteTableId), err)
}
}
}

Expand Down Expand Up @@ -617,7 +623,11 @@ func (c *Cleaner) deleteInternetGateways(ctx context.Context, vpcID string) erro

_, err = c.ec2.DetachInternetGateway(ctx, detachInput)
if err != nil {
c.log.Warning("Failed to detach internet gateway %s: %v", safeString(igw.InternetGatewayId), err)
errMsg := err.Error()
if !strings.Contains(errMsg, "Gateway.NotAttached") &&
!strings.Contains(errMsg, "InvalidInternetGatewayID.NotFound") {
c.log.Warning("Failed to detach internet gateway %s: %v", safeString(igw.InternetGatewayId), err)
}
}

// Delete internet gateway
Expand All @@ -627,7 +637,9 @@ func (c *Cleaner) deleteInternetGateways(ctx context.Context, vpcID string) erro

_, err = c.ec2.DeleteInternetGateway(ctx, deleteInput)
if err != nil {
c.log.Warning("Failed to delete internet gateway %s: %v", safeString(igw.InternetGatewayId), err)
if !strings.Contains(err.Error(), "InvalidInternetGatewayID.NotFound") {
c.log.Warning("Failed to delete internet gateway %s: %v", safeString(igw.InternetGatewayId), err)
}
}
}

Expand Down
260 changes: 260 additions & 0 deletions pkg/cleanup/cleanup_ginkgo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -783,6 +783,266 @@ var _ = Describe("Cleanup Package", func() {
})
})

// Covers internet-gateway cleanup when the gateway no longer exists: both the
// detach and the delete mocks fail with InvalidInternetGatewayID.NotFound and
// DeleteVPCResources is still expected to return no error.
Describe("deleteInternetGateways NotFound handling", func() {
// Stub the non-IGW describe calls and DeleteVpc with empty, error-free
// responses; the IGW mocks themselves are installed by the spec below.
BeforeEach(func() {
mockEC.DescribeInstancesFunc = func(ctx context.Context,
params *ec2.DescribeInstancesInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeInstancesOutput, error) {
return &ec2.DescribeInstancesOutput{}, nil
}
mockEC.DescribeSecurityGroupsFunc = func(ctx context.Context,
params *ec2.DescribeSecurityGroupsInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeSecurityGroupsOutput, error) {
return &ec2.DescribeSecurityGroupsOutput{}, nil
}
mockEC.DescribeSubnetsFunc = func(ctx context.Context,
params *ec2.DescribeSubnetsInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeSubnetsOutput, error) {
return &ec2.DescribeSubnetsOutput{}, nil
}
mockEC.DescribeRouteTablesFunc = func(ctx context.Context,
params *ec2.DescribeRouteTablesInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) {
return &ec2.DescribeRouteTablesOutput{}, nil
}
mockEC.DeleteVpcFunc = func(ctx context.Context,
params *ec2.DeleteVpcInput,
optFns ...func(*ec2.Options)) (*ec2.DeleteVpcOutput, error) {
return &ec2.DeleteVpcOutput{}, nil
}
})

It("should complete successfully when IGW detach/delete return NotFound", func() {
// Counters record how many times each IGW mock is invoked.
detachCalls := 0
deleteCalls := 0

// Report a single gateway so the cleaner has exactly one IGW to process.
mockEC.DescribeInternetGatewaysFunc = func(ctx context.Context,
params *ec2.DescribeInternetGatewaysInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeInternetGatewaysOutput, error) {
return &ec2.DescribeInternetGatewaysOutput{
InternetGateways: []types.InternetGateway{
{InternetGatewayId: aws.String("igw-gone")},
},
}, nil
}
// Detach fails as if the gateway had already been deleted.
mockEC.DetachInternetGatewayFunc = func(ctx context.Context,
params *ec2.DetachInternetGatewayInput,
optFns ...func(*ec2.Options)) (*ec2.DetachInternetGatewayOutput, error) {
detachCalls++
return nil, fmt.Errorf("InvalidInternetGatewayID.NotFound: igw-gone does not exist")
}
// Delete fails with the same NotFound error text.
mockEC.DeleteInternetGatewayFunc = func(ctx context.Context,
params *ec2.DeleteInternetGatewayInput,
optFns ...func(*ec2.Options)) (*ec2.DeleteInternetGatewayOutput, error) {
deleteCalls++
return nil, fmt.Errorf("InvalidInternetGatewayID.NotFound: igw-gone does not exist")
}

cleaner, err := New(log, "us-west-2", WithEC2Client(mockEC))
Expect(err).NotTo(HaveOccurred())

err = cleaner.DeleteVPCResources(context.Background(), "vpc-12345")
Expect(err).NotTo(HaveOccurred())
// NotFound errors are silently ignored — detach and delete still called
Expect(detachCalls).To(Equal(1))
Expect(deleteCalls).To(Equal(1))
})
})

// Covers security-group cleanup when DeleteSecurityGroup fails with
// InvalidGroup.NotFound: DeleteVPCResources is still expected to return no
// error.
Describe("deleteSecurityGroups NotFound handling", func() {
// Stub the non-security-group describe calls and DeleteVpc with empty,
// error-free responses; the security-group mocks are installed by the spec.
BeforeEach(func() {
mockEC.DescribeInstancesFunc = func(ctx context.Context,
params *ec2.DescribeInstancesInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeInstancesOutput, error) {
return &ec2.DescribeInstancesOutput{}, nil
}
mockEC.DescribeSubnetsFunc = func(ctx context.Context,
params *ec2.DescribeSubnetsInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeSubnetsOutput, error) {
return &ec2.DescribeSubnetsOutput{}, nil
}
mockEC.DescribeRouteTablesFunc = func(ctx context.Context,
params *ec2.DescribeRouteTablesInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) {
return &ec2.DescribeRouteTablesOutput{}, nil
}
mockEC.DescribeInternetGatewaysFunc = func(ctx context.Context,
params *ec2.DescribeInternetGatewaysInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeInternetGatewaysOutput, error) {
return &ec2.DescribeInternetGatewaysOutput{}, nil
}
mockEC.DeleteVpcFunc = func(ctx context.Context,
params *ec2.DeleteVpcInput,
optFns ...func(*ec2.Options)) (*ec2.DeleteVpcOutput, error) {
return &ec2.DeleteVpcOutput{}, nil
}
})

It("should complete successfully when SG delete returns InvalidGroup.NotFound", func() {
// Counts invocations of the DeleteSecurityGroup mock.
deleteCalls := 0

// Two groups are reported: one named "default" and one custom group.
mockEC.DescribeSecurityGroupsFunc = func(ctx context.Context,
params *ec2.DescribeSecurityGroupsInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeSecurityGroupsOutput, error) {
return &ec2.DescribeSecurityGroupsOutput{
SecurityGroups: []types.SecurityGroup{
{GroupId: aws.String("sg-default"), GroupName: aws.String("default")},
{GroupId: aws.String("sg-gone"), GroupName: aws.String("holodeck-sg")},
},
}, nil
}
// No network interfaces are reported.
// NOTE(review): presumably the cleaner queries ENIs before deleting a
// security group — confirm against deleteSecurityGroups.
mockEC.DescribeNetworkInterfacesFunc = func(ctx context.Context,
params *ec2.DescribeNetworkInterfacesInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeNetworkInterfacesOutput, error) {
return &ec2.DescribeNetworkInterfacesOutput{}, nil
}
// Every delete attempt fails with a NotFound error naming the requested group.
mockEC.DeleteSecurityGroupFunc = func(ctx context.Context,
params *ec2.DeleteSecurityGroupInput,
optFns ...func(*ec2.Options)) (*ec2.DeleteSecurityGroupOutput, error) {
deleteCalls++
return nil, fmt.Errorf("InvalidGroup.NotFound: The security group '%s' does not exist", *params.GroupId)
}

cleaner, err := New(log, "us-west-2", WithEC2Client(mockEC))
Expect(err).NotTo(HaveOccurred())

err = cleaner.DeleteVPCResources(context.Background(), "vpc-12345")
Expect(err).NotTo(HaveOccurred())
// Exactly one delete is expected although two groups were listed.
// NOTE(review): presumably the group named "default" is skipped by the
// cleaner — confirm against deleteSecurityGroups.
Expect(deleteCalls).To(Equal(1))
})
})

// Covers subnet cleanup when DeleteSubnet fails with InvalidSubnetID.NotFound:
// DeleteVPCResources is still expected to return no error.
Describe("deleteSubnets NotFound handling", func() {
// Stub the non-subnet describe calls and DeleteVpc with empty, error-free
// responses; the subnet mocks are installed by the spec below.
BeforeEach(func() {
mockEC.DescribeInstancesFunc = func(ctx context.Context,
params *ec2.DescribeInstancesInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeInstancesOutput, error) {
return &ec2.DescribeInstancesOutput{}, nil
}
mockEC.DescribeSecurityGroupsFunc = func(ctx context.Context,
params *ec2.DescribeSecurityGroupsInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeSecurityGroupsOutput, error) {
return &ec2.DescribeSecurityGroupsOutput{}, nil
}
mockEC.DescribeRouteTablesFunc = func(ctx context.Context,
params *ec2.DescribeRouteTablesInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) {
return &ec2.DescribeRouteTablesOutput{}, nil
}
mockEC.DescribeInternetGatewaysFunc = func(ctx context.Context,
params *ec2.DescribeInternetGatewaysInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeInternetGatewaysOutput, error) {
return &ec2.DescribeInternetGatewaysOutput{}, nil
}
mockEC.DeleteVpcFunc = func(ctx context.Context,
params *ec2.DeleteVpcInput,
optFns ...func(*ec2.Options)) (*ec2.DeleteVpcOutput, error) {
return &ec2.DeleteVpcOutput{}, nil
}
})

It("should complete successfully when subnet delete returns InvalidSubnetID.NotFound", func() {
// Counts invocations of the DeleteSubnet mock.
deleteCalls := 0

// Report a single subnet so the cleaner has exactly one to delete.
mockEC.DescribeSubnetsFunc = func(ctx context.Context,
params *ec2.DescribeSubnetsInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeSubnetsOutput, error) {
return &ec2.DescribeSubnetsOutput{
Subnets: []types.Subnet{
{SubnetId: aws.String("subnet-gone")},
},
}, nil
}
// The delete fails with a NotFound error naming the requested subnet.
mockEC.DeleteSubnetFunc = func(ctx context.Context,
params *ec2.DeleteSubnetInput,
optFns ...func(*ec2.Options)) (*ec2.DeleteSubnetOutput, error) {
deleteCalls++
return nil, fmt.Errorf("InvalidSubnetID.NotFound: The subnet ID '%s' does not exist", *params.SubnetId)
}

cleaner, err := New(log, "us-west-2", WithEC2Client(mockEC))
Expect(err).NotTo(HaveOccurred())

err = cleaner.DeleteVPCResources(context.Background(), "vpc-12345")
Expect(err).NotTo(HaveOccurred())
// The delete is attempted once; its NotFound error does not fail the cleanup.
Expect(deleteCalls).To(Equal(1))
})
})

// Covers route-table cleanup when DeleteRouteTable fails with
// InvalidRouteTableID.NotFound: DeleteVPCResources is still expected to return
// no error.
Describe("deleteRouteTables NotFound handling", func() {
// Stub the non-route-table describe calls and DeleteVpc with empty,
// error-free responses; the route-table mocks are installed by the spec.
BeforeEach(func() {
mockEC.DescribeInstancesFunc = func(ctx context.Context,
params *ec2.DescribeInstancesInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeInstancesOutput, error) {
return &ec2.DescribeInstancesOutput{}, nil
}
mockEC.DescribeSecurityGroupsFunc = func(ctx context.Context,
params *ec2.DescribeSecurityGroupsInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeSecurityGroupsOutput, error) {
return &ec2.DescribeSecurityGroupsOutput{}, nil
}
mockEC.DescribeSubnetsFunc = func(ctx context.Context,
params *ec2.DescribeSubnetsInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeSubnetsOutput, error) {
return &ec2.DescribeSubnetsOutput{}, nil
}
mockEC.DescribeInternetGatewaysFunc = func(ctx context.Context,
params *ec2.DescribeInternetGatewaysInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeInternetGatewaysOutput, error) {
return &ec2.DescribeInternetGatewaysOutput{}, nil
}
mockEC.DeleteVpcFunc = func(ctx context.Context,
params *ec2.DeleteVpcInput,
optFns ...func(*ec2.Options)) (*ec2.DeleteVpcOutput, error) {
return &ec2.DeleteVpcOutput{}, nil
}
})

It("should complete successfully when route table delete returns InvalidRouteTableID.NotFound", func() {
// Counts invocations of the DeleteRouteTable mock.
deleteCalls := 0
// Addressable true value for the Main field of the first association.
mainRT := true

// Two route tables are reported: one whose association is flagged Main
// and one with an ordinary association.
mockEC.DescribeRouteTablesFunc = func(ctx context.Context,
params *ec2.DescribeRouteTablesInput,
optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) {
return &ec2.DescribeRouteTablesOutput{
RouteTables: []types.RouteTable{
{
RouteTableId: aws.String("rtb-main"),
Associations: []types.RouteTableAssociation{
{RouteTableAssociationId: aws.String("rtbassoc-main"), Main: &mainRT},
},
},
{
RouteTableId: aws.String("rtb-gone"),
Associations: []types.RouteTableAssociation{
{RouteTableAssociationId: aws.String("rtbassoc-gone")},
},
},
},
}, nil
}
// Re-association always succeeds.
// NOTE(review): presumably called for non-main associations before the
// table is deleted — confirm against deleteRouteTables.
mockEC.ReplaceRouteTableAssociationFunc = func(ctx context.Context,
params *ec2.ReplaceRouteTableAssociationInput,
optFns ...func(*ec2.Options)) (*ec2.ReplaceRouteTableAssociationOutput, error) {
return &ec2.ReplaceRouteTableAssociationOutput{}, nil
}
// Every delete attempt fails with a NotFound error naming the requested table.
mockEC.DeleteRouteTableFunc = func(ctx context.Context,
params *ec2.DeleteRouteTableInput,
optFns ...func(*ec2.Options)) (*ec2.DeleteRouteTableOutput, error) {
deleteCalls++
return nil, fmt.Errorf("InvalidRouteTableID.NotFound: The routeTable ID '%s' does not exist", *params.RouteTableId)
}

cleaner, err := New(log, "us-west-2", WithEC2Client(mockEC))
Expect(err).NotTo(HaveOccurred())

err = cleaner.DeleteVPCResources(context.Background(), "vpc-12345")
Expect(err).NotTo(HaveOccurred())
// Exactly one delete is expected although two tables were listed.
// NOTE(review): presumably the main route table is skipped by the
// cleaner — confirm against deleteRouteTables.
Expect(deleteCalls).To(Equal(1))
})
})

Describe("deleteRouteTables", func() {
BeforeEach(func() {
mockEC.DescribeInstancesFunc = func(ctx context.Context,
Expand Down
Loading
Loading