From 60e259801452c42941557c7c0b76b7f8e55e4360 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Wed, 18 Mar 2026 11:20:43 +0100 Subject: [PATCH 01/13] fix: wait for NAT Gateway to reach available state before creating routes CreateNatGateway returns immediately with a pending NAT Gateway ID. Using this ID in CreateRoute before the NAT GW is available causes InvalidNatGatewayID.NotFound errors, breaking all cluster deployments. Poll DescribeNatGateways until state transitions to available (or failed) before returning from createNATGateway. This matches the existing pattern in deleteNATGateway which polls for deleted state. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provider/aws/cluster_test.go | 89 ++++++++++++++++++++++++++++++++ pkg/provider/aws/create.go | 31 ++++++++++- pkg/provider/aws/create_test.go | 11 +++- 3 files changed, 128 insertions(+), 3 deletions(-) diff --git a/pkg/provider/aws/cluster_test.go b/pkg/provider/aws/cluster_test.go index d976f1c2..de841c6c 100644 --- a/pkg/provider/aws/cluster_test.go +++ b/pkg/provider/aws/cluster_test.go @@ -18,6 +18,7 @@ package aws import ( "context" + "strings" "sync" "testing" @@ -384,3 +385,91 @@ func TestNATGatewayCreatedInPublicSubnet(t *testing.T) { aws.ToString(capturedNAT.SubnetId), "subnet-public") } } + +// TestNATGatewayWaitsForAvailable verifies that createNATGateway polls +// DescribeNatGateways until the NAT GW transitions from pending to available. +func TestNATGatewayWaitsForAvailable(t *testing.T) { + describeCalls := 0 + + mock := NewMockEC2Client() + mock.CreateNatGatewayFunc = func(ctx context.Context, params *ec2.CreateNatGatewayInput, optFns ...func(*ec2.Options)) (*ec2.CreateNatGatewayOutput, error) { + return &ec2.CreateNatGatewayOutput{ + NatGateway: &types.NatGateway{ + NatGatewayId: aws.String("nat-pending-123"), + State: types.NatGatewayStatePending, + }, + }, nil + } + mock.DescribeNatGatewaysFunc = func(ctx context.Context, params *ec2.DescribeNatGatewaysInput, optFns ...func(*ec2.Options)) (*ec2.DescribeNatGatewaysOutput, error) { + describeCalls++ + state := types.NatGatewayStatePending + if describeCalls >= 2 { + state = types.NatGatewayStateAvailable + } + return &ec2.DescribeNatGatewaysOutput{ + NatGateways: []types.NatGateway{ + { + NatGatewayId: aws.String("nat-pending-123"), + State: state, + }, + }, + }, nil + } + + provider := newTestProvider(mock) + cache := &AWS{ + Vpcid: "vpc-test", + PublicSubnetid: "subnet-public", + Subnetid: "subnet-private", + } + + if err := provider.createNATGateway(cache); err != nil { + t.Fatalf("createNATGateway failed: %v", err) + } + + if describeCalls < 2 { + t.Errorf("Expected at least 2 DescribeNatGateways calls for polling, got %d", describeCalls) + } + if cache.NatGatewayid != "nat-pending-123" { + t.Errorf("cache.NatGatewayid = %q, want %q", cache.NatGatewayid, "nat-pending-123") + } +} + +// TestNATGatewayFailedState verifies that createNATGateway returns an error +// if the NAT GW transitions to the failed state. +func TestNATGatewayFailedState(t *testing.T) { + mock := NewMockEC2Client() + mock.CreateNatGatewayFunc = func(ctx context.Context, params *ec2.CreateNatGatewayInput, optFns ...func(*ec2.Options)) (*ec2.CreateNatGatewayOutput, error) { + return &ec2.CreateNatGatewayOutput{ + NatGateway: &types.NatGateway{ + NatGatewayId: aws.String("nat-fail-123"), + State: types.NatGatewayStatePending, + }, + }, nil + } + mock.DescribeNatGatewaysFunc = func(ctx context.Context, params *ec2.DescribeNatGatewaysInput, optFns ...func(*ec2.Options)) (*ec2.DescribeNatGatewaysOutput, error) { + return &ec2.DescribeNatGatewaysOutput{ + NatGateways: []types.NatGateway{ + { + NatGatewayId: aws.String("nat-fail-123"), + State: types.NatGatewayStateFailed, + }, + }, + }, nil + } + + provider := newTestProvider(mock) + cache := &AWS{ + Vpcid: "vpc-test", + PublicSubnetid: "subnet-public", + Subnetid: "subnet-private", + } + + err := provider.createNATGateway(cache) + if err == nil { + t.Fatal("Expected error when NAT Gateway reaches failed state") + } + if !strings.Contains(err.Error(), "failed state") { + t.Errorf("Error should mention failed state, got: %v", err) + } +} diff --git a/pkg/provider/aws/create.go b/pkg/provider/aws/create.go index bafc30f3..1f44c0d8 100644 --- a/pkg/provider/aws/create.go +++ b/pkg/provider/aws/create.go @@ -654,8 +654,35 @@ func (p *Provider) createNATGateway(cache *AWS) error { } cache.NatGatewayid = *natOutput.NatGateway.NatGatewayId - cancelLoading(nil) - return nil + // Wait for NAT Gateway to reach "available" state before returning. + // CreateNatGateway returns immediately with state "pending"; using the + // NAT GW ID in a CreateRoute call before it is available causes + // InvalidNatGatewayID.NotFound errors. + p.log.Info("Waiting for NAT Gateway %s to become available", cache.NatGatewayid) + for i := 0; i < 60; i++ { // 60 × 5s = 5 minutes max + time.Sleep(5 * time.Second) + dCtx, dCancel := context.WithTimeout(context.Background(), 30*time.Second) + out, err := p.ec2.DescribeNatGateways(dCtx, &ec2.DescribeNatGatewaysInput{ + NatGatewayIds: []string{cache.NatGatewayid}, + }) + dCancel() + if err != nil { + p.log.Warning("Error checking NAT Gateway state: %v", err) + continue + } + if len(out.NatGateways) > 0 && out.NatGateways[0].State == types.NatGatewayStateAvailable { + p.log.Info("NAT Gateway %s is available", cache.NatGatewayid) + cancelLoading(nil) + return nil + } + if len(out.NatGateways) > 0 && out.NatGateways[0].State == types.NatGatewayStateFailed { + cancelLoading(logger.ErrLoadingFailed) + return fmt.Errorf("NAT Gateway %s reached failed state", cache.NatGatewayid) + } + } + + cancelLoading(logger.ErrLoadingFailed) + return fmt.Errorf("NAT Gateway %s did not become available within timeout", cache.NatGatewayid) } // releaseEIP releases an Elastic IP by allocation ID. diff --git a/pkg/provider/aws/create_test.go b/pkg/provider/aws/create_test.go index 42d3c887..3bc1a914 100644 --- a/pkg/provider/aws/create_test.go +++ b/pkg/provider/aws/create_test.go @@ -310,7 +310,16 @@ func (m *mockEC2Client) DeleteNatGateway(ctx context.Context, params *ec2.Delete func (m *mockEC2Client) DescribeNatGateways(ctx context.Context, params *ec2.DescribeNatGatewaysInput, optFns ...func(*ec2.Options)) (*ec2.DescribeNatGatewaysOutput, error) { - return &ec2.DescribeNatGatewaysOutput{}, nil + // Return the NAT GW as available so the wait loop in createNATGateway exits immediately. + natGWID := "nat-123" + if len(params.NatGatewayIds) > 0 { + natGWID = params.NatGatewayIds[0] + } + return &ec2.DescribeNatGatewaysOutput{ + NatGateways: []types.NatGateway{ + {NatGatewayId: aws.String(natGWID), State: types.NatGatewayStateAvailable}, + }, + }, nil } func (m *mockEC2Client) ModifySubnetAttribute(ctx context.Context, params *ec2.ModifySubnetAttributeInput, From 72d4e5f3852e32e6391aea51028b897257864c72 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Wed, 18 Mar 2026 12:33:54 +0100 Subject: [PATCH 02/13] fix: remove public IP assertion for cluster nodes in private subnet In the production cluster topology all nodes are in the private subnet and do not have public IPs. The E2E test was asserting PublicIP is non-empty for all nodes, which fails by design. Only PrivateIP is guaranteed. Signed-off-by: Carlos Eduardo Arango Gutierrez --- tests/aws_cluster_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/aws_cluster_test.go b/tests/aws_cluster_test.go index b3b790c2..43558e40 100644 --- a/tests/aws_cluster_test.go +++ b/tests/aws_cluster_test.go @@ -148,7 +148,8 @@ var _ = DescribeTable("AWS Cluster E2E", case "worker": workerNodes++ } - Expect(node.PublicIP).NotTo(BeEmpty(), "Node public IP should not be empty") + // In production cluster topology all nodes are in the private subnet + // and may not have public IPs. Only PrivateIP is guaranteed. Expect(node.PrivateIP).NotTo(BeEmpty(), "Node private IP should not be empty") Expect(node.InstanceID).NotTo(BeEmpty(), "Node instance ID should not be empty") } From 3152fb6381b74c3ff837083adc2ba5f66107a8fe Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Wed, 18 Mar 2026 15:53:51 +0100 Subject: [PATCH 03/13] fix: wire SSM transport for private-subnet cluster nodes Cluster nodes in private subnets have no public IP, so SSH connections must go through SSM port-forwarding. The E2E test and GetClusterHealth were building NodeInfo without setting the Transport field, causing SSH to attempt direct connection to an empty host. Wire SSMTransport for nodes with no PublicIP, matching the production code path in buildClusterNodeInfoList. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provisioner/cluster.go | 14 +++++++++++++- tests/aws_cluster_test.go | 16 +++++++++++++--- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/pkg/provisioner/cluster.go b/pkg/provisioner/cluster.go index 29a481e6..eb128322 100644 --- a/pkg/provisioner/cluster.go +++ b/pkg/provisioner/cluster.go @@ -668,9 +668,14 @@ func (cp *ClusterProvisioner) GetClusterHealth(firstCPHost string) (*ClusterHeal } } } - // Build transport options from node info in environment status + // Build transport options from node info in environment status. + // Private-subnet nodes (no public IP) need SSM port-forwarding. var transportOpts []Option if cp.Environment != nil && cp.Environment.Status.Cluster != nil { + region := "" + if cp.Environment.Spec.Cluster != nil { + region = cp.Environment.Spec.Cluster.Region + } for _, node := range cp.Environment.Status.Cluster.Nodes { if node.PublicIP == firstCPHost || node.PrivateIP == firstCPHost { nodeInfo := NodeInfo{ @@ -678,6 +683,13 @@ func (cp *ClusterProvisioner) GetClusterHealth(firstCPHost string) (*ClusterHeal PrivateIP: node.PrivateIP, InstanceID: node.InstanceID, } + // Wire SSM transport for private-subnet nodes + if node.PublicIP == "" && node.InstanceID != "" && region != "" { + nodeInfo.Transport = &SSMTransport{ + InstanceID: node.InstanceID, + Region: region, + } + } transportOpts = cp.transportOptsForNode(nodeInfo) break } diff --git a/tests/aws_cluster_test.go b/tests/aws_cluster_test.go index 43558e40..91e185b5 100644 --- a/tests/aws_cluster_test.go +++ b/tests/aws_cluster_test.go @@ -159,16 +159,26 @@ var _ = DescribeTable("AWS Cluster E2E", } By("Provisioning the cluster") - // Build nodes list for provisioner + // Build nodes list for provisioner, wiring SSM transport for private-subnet nodes + region := state.opts.cfg.Spec.Cluster.Region var nodes []provisioner.NodeInfo for _, node := range env.Status.Cluster.Nodes { - nodes = append(nodes, provisioner.NodeInfo{ + info := provisioner.NodeInfo{ Name: node.Name, PublicIP: node.PublicIP, PrivateIP: node.PrivateIP, Role: node.Role, SSHUsername: node.SSHUsername, - }) + InstanceID: node.InstanceID, + } + // Private-subnet nodes have no public IP — use SSM port-forwarding + if node.PublicIP == "" && node.InstanceID != "" { + info.Transport = &provisioner.SSMTransport{ + InstanceID: node.InstanceID, + Region: region, + } + } + nodes = append(nodes, info) } // Create cluster provisioner From 1dd57eeb8c8a8268f8b26eeaa1b509800d6f0ffc Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Wed, 18 Mar 2026 21:58:35 +0100 Subject: [PATCH 04/13] fix: place cluster instances in public subnet with public IPs The private subnet topology (PR #728) moved cluster instances behind a NAT GW, but SSM infrastructure (IAM instance profiles) was never set up, making SSH connections impossible. Revert instances to the public subnet with AssociatePublicIpAddress=true so they are directly reachable via SSH. The NAT GW + NLB infrastructure remains in place for future SSM migration. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provider/aws/cluster.go | 4 ++-- pkg/provider/aws/cluster_test.go | 17 +++++++++-------- pkg/provisioner/cluster.go | 14 +------------- tests/aws_cluster_test.go | 19 ++++--------------- 4 files changed, 16 insertions(+), 38 deletions(-) diff --git a/pkg/provider/aws/cluster.go b/pkg/provider/aws/cluster.go index a77e8414..b1712ff6 100644 --- a/pkg/provider/aws/cluster.go +++ b/pkg/provider/aws/cluster.go @@ -701,11 +701,11 @@ func (p *Provider) createInstances( }, NetworkInterfaces: []types.InstanceNetworkInterfaceSpecification{ { - AssociatePublicIpAddress: aws.Bool(false), + AssociatePublicIpAddress: aws.Bool(true), DeleteOnTermination: aws.Bool(true), DeviceIndex: aws.Int32(0), Groups: []string{sgID}, - SubnetId: aws.String(cache.Subnetid), + SubnetId: aws.String(cache.PublicSubnetid), }, }, KeyName: aws.String(p.Spec.KeyName), diff --git a/pkg/provider/aws/cluster_test.go b/pkg/provider/aws/cluster_test.go index de841c6c..74b71de0 100644 --- a/pkg/provider/aws/cluster_test.go +++ b/pkg/provider/aws/cluster_test.go @@ -29,9 +29,9 @@ import ( "github.com/NVIDIA/holodeck/api/holodeck/v1alpha1" ) -// TestCreateInstancesSetsNoPublicIP verifies that createInstances sets -// AssociatePublicIpAddress=false in the RunInstancesInput for cluster mode. -func TestCreateInstancesSetsNoPublicIP(t *testing.T) { +// TestCreateInstancesSetsPublicIP verifies that createInstances sets +// AssociatePublicIpAddress=true in the RunInstancesInput for cluster mode. +func TestCreateInstancesSetsPublicIP(t *testing.T) { var mu sync.Mutex var captured []*ec2.RunInstancesInput @@ -94,6 +94,7 @@ func TestCreateInstancesSetsNoPublicIP(t *testing.T) { cache := &ClusterCache{ AWS: AWS{ Subnetid: "subnet-private", + PublicSubnetid: "subnet-public", CPSecurityGroupid: "sg-cp", WorkerSecurityGroupid: "sg-worker", }, @@ -127,13 +128,13 @@ func TestCreateInstancesSetsNoPublicIP(t *testing.T) { if len(nis) == 0 { t.Fatal("expected NetworkInterfaces in RunInstancesInput") } - if nis[0].AssociatePublicIpAddress == nil || *nis[0].AssociatePublicIpAddress != false { - t.Error("AssociatePublicIpAddress should be false for cluster instances") + if nis[0].AssociatePublicIpAddress == nil || *nis[0].AssociatePublicIpAddress != true { + t.Error("AssociatePublicIpAddress should be true for cluster instances") } - // Verify instance uses private subnet - if aws.ToString(nis[0].SubnetId) != "subnet-private" { - t.Errorf("SubnetId = %q, want %q", aws.ToString(nis[0].SubnetId), "subnet-private") + // Verify instance uses public subnet + if aws.ToString(nis[0].SubnetId) != "subnet-public" { + t.Errorf("SubnetId = %q, want %q", aws.ToString(nis[0].SubnetId), "subnet-public") } } diff --git a/pkg/provisioner/cluster.go b/pkg/provisioner/cluster.go index eb128322..29a481e6 100644 --- a/pkg/provisioner/cluster.go +++ b/pkg/provisioner/cluster.go @@ -668,14 +668,9 @@ func (cp *ClusterProvisioner) GetClusterHealth(firstCPHost string) (*ClusterHeal } } } - // Build transport options from node info in environment status. - // Private-subnet nodes (no public IP) need SSM port-forwarding. + // Build transport options from node info in environment status var transportOpts []Option if cp.Environment != nil && cp.Environment.Status.Cluster != nil { - region := "" - if cp.Environment.Spec.Cluster != nil { - region = cp.Environment.Spec.Cluster.Region - } for _, node := range cp.Environment.Status.Cluster.Nodes { if node.PublicIP == firstCPHost || node.PrivateIP == firstCPHost { nodeInfo := NodeInfo{ @@ -683,13 +678,6 @@ func (cp *ClusterProvisioner) GetClusterHealth(firstCPHost string) (*ClusterHeal PrivateIP: node.PrivateIP, InstanceID: node.InstanceID, } - // Wire SSM transport for private-subnet nodes - if node.PublicIP == "" && node.InstanceID != "" && region != "" { - nodeInfo.Transport = &SSMTransport{ - InstanceID: node.InstanceID, - Region: region, - } - } transportOpts = cp.transportOptsForNode(nodeInfo) break } diff --git a/tests/aws_cluster_test.go b/tests/aws_cluster_test.go index 91e185b5..b3b790c2 100644 --- a/tests/aws_cluster_test.go +++ b/tests/aws_cluster_test.go @@ -148,8 +148,7 @@ var _ = DescribeTable("AWS Cluster E2E", case "worker": workerNodes++ } - // In production cluster topology all nodes are in the private subnet - // and may not have public IPs. Only PrivateIP is guaranteed. + Expect(node.PublicIP).NotTo(BeEmpty(), "Node public IP should not be empty") Expect(node.PrivateIP).NotTo(BeEmpty(), "Node private IP should not be empty") Expect(node.InstanceID).NotTo(BeEmpty(), "Node instance ID should not be empty") } @@ -159,26 +158,16 @@ var _ = DescribeTable("AWS Cluster E2E", } By("Provisioning the cluster") - // Build nodes list for provisioner, wiring SSM transport for private-subnet nodes - region := state.opts.cfg.Spec.Cluster.Region + // Build nodes list for provisioner var nodes []provisioner.NodeInfo for _, node := range env.Status.Cluster.Nodes { - info := provisioner.NodeInfo{ + nodes = append(nodes, provisioner.NodeInfo{ Name: node.Name, PublicIP: node.PublicIP, PrivateIP: node.PrivateIP, Role: node.Role, SSHUsername: node.SSHUsername, - InstanceID: node.InstanceID, - } - // Private-subnet nodes have no public IP — use SSM port-forwarding - if node.PublicIP == "" && node.InstanceID != "" { - info.Transport = &provisioner.SSMTransport{ - InstanceID: node.InstanceID, - Region: region, - } - } - nodes = append(nodes, info) + }) } // Create cluster provisioner From 7b7b7edc39c0697aabeefd8dc00826d341547a0c Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 19 Mar 2026 11:10:16 +0100 Subject: [PATCH 05/13] cluster: skip NAT Gateway creation to avoid EIP quota exhaustion Cluster instances are placed in the public subnet with AssociatePublicIpAddress=true, so they reach the internet directly through the IGW. The NAT Gateway (and its EIP allocation + private route table) are unnecessary for this topology. Removing them from the cluster code path frees up EIP quota (AWS limit: 5 per region in us-west-1), which was being exhausted when multiple CI jobs ran concurrently, causing AddressLimitExceeded errors on the rpm-rocky and rpm-al2023 E2E tests. The private subnet is retained for future SSM endpoint use. Single-node mode (create.go) is unchanged and continues to use NAT Gateway for its private-subnet instances. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provider/aws/cluster.go | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/pkg/provider/aws/cluster.go b/pkg/provider/aws/cluster.go index b1712ff6..bb22efb8 100644 --- a/pkg/provider/aws/cluster.go +++ b/pkg/provider/aws/cluster.go @@ -135,12 +135,12 @@ func (p *Provider) CreateCluster() error { } _ = p.updateProgressingCondition(*p.DeepCopy(), &cache.AWS, "v1alpha1.Creating", "Internet Gateway created") - // Phase 1b: Create cluster networking (public subnet, NAT GW, route tables) - // Note: We skip createRouteTable() here — in single-node mode it creates an - // IGW-routed table for the (only) subnet. In cluster mode the private subnet - // gets a NAT-routed table (createPrivateRouteTable) and the public subnet - // gets an IGW-routed table (createPublicRouteTable). Calling createRouteTable - // would create an orphaned IGW table associated with the private subnet. + // Phase 1b: Create public subnet and route table for cluster instances. + // Cluster instances are placed in the public subnet with AssociatePublicIpAddress=true, + // so they have direct internet access via the IGW. NAT Gateway and private route table + // are NOT needed — skipping them avoids consuming scarce EIP quota (AWS limit: 5 per + // region), which caused CI failures when multiple jobs ran concurrently. + // The private subnet (created above) is retained for future SSM endpoint use. if err := p.createPublicSubnet(&cache.AWS); err != nil { _ = p.updateDegradedCondition(*p.DeepCopy(), &cache.AWS, "v1alpha1.Creating", "Error creating public subnet") return fmt.Errorf("error creating public subnet: %w", err) @@ -153,18 +153,6 @@ func (p *Provider) CreateCluster() error { } _ = p.updateProgressingCondition(*p.DeepCopy(), &cache.AWS, "v1alpha1.Creating", "Public route table created") - if err := p.createNATGateway(&cache.AWS); err != nil { - _ = p.updateDegradedCondition(*p.DeepCopy(), &cache.AWS, "v1alpha1.Creating", "Error creating NAT Gateway") - return fmt.Errorf("error creating NAT Gateway: %w", err) - } - _ = p.updateProgressingCondition(*p.DeepCopy(), &cache.AWS, "v1alpha1.Creating", "NAT Gateway created") - - if err := p.createPrivateRouteTable(&cache.AWS); err != nil { - _ = p.updateDegradedCondition(*p.DeepCopy(), &cache.AWS, "v1alpha1.Creating", "Error creating private route table") - return fmt.Errorf("error creating private route table: %w", err) - } - _ = p.updateProgressingCondition(*p.DeepCopy(), &cache.AWS, "v1alpha1.Creating", "Private route table created") - // Phase 2: Create separate CP and Worker security groups if err := p.createControlPlaneSecurityGroup(cache); err != nil { _ = p.updateDegradedCondition(*p.DeepCopy(), &cache.AWS, "v1alpha1.Creating", "Error creating control-plane security group") From fa13fef1f5cc3e7abba26c57be433f900a975aa0 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 19 Mar 2026 18:28:26 +0100 Subject: [PATCH 06/13] feat(aws): add RevokeSecurityGroup{Ingress,Egress} to EC2Client interface Extend the EC2Client interface with RevokeSecurityGroupIngress and RevokeSecurityGroupEgress methods needed for proper SG cleanup. Update both mock implementations to satisfy the interface. Signed-off-by: Carlos Eduardo Arango Gutierrez --- internal/aws/ec2_client.go | 8 ++++++++ pkg/provider/aws/create_test.go | 12 ++++++++++++ pkg/provider/aws/mock_ec2_test.go | 22 +++++++++++++++++++++- 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/internal/aws/ec2_client.go b/internal/aws/ec2_client.go index 9339c92a..5d56ec54 100644 --- a/internal/aws/ec2_client.go +++ b/internal/aws/ec2_client.go @@ -96,6 +96,14 @@ type EC2Client interface { DescribeSecurityGroups(ctx context.Context, params *ec2.DescribeSecurityGroupsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeSecurityGroupsOutput, error) + RevokeSecurityGroupIngress(ctx context.Context, + params *ec2.RevokeSecurityGroupIngressInput, + optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupIngressOutput, + error) + RevokeSecurityGroupEgress(ctx context.Context, + params *ec2.RevokeSecurityGroupEgressInput, + optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, + error) // Instance operations RunInstances(ctx context.Context, params *ec2.RunInstancesInput, diff --git a/pkg/provider/aws/create_test.go b/pkg/provider/aws/create_test.go index 3bc1a914..592e08a7 100644 --- a/pkg/provider/aws/create_test.go +++ b/pkg/provider/aws/create_test.go @@ -185,6 +185,18 @@ func (m *mockEC2Client) DescribeSecurityGroups(ctx context.Context, return &ec2.DescribeSecurityGroupsOutput{}, nil } +func (m *mockEC2Client) RevokeSecurityGroupIngress(ctx context.Context, + params *ec2.RevokeSecurityGroupIngressInput, + optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupIngressOutput, error) { + return &ec2.RevokeSecurityGroupIngressOutput{}, nil +} + +func (m *mockEC2Client) RevokeSecurityGroupEgress(ctx context.Context, + params *ec2.RevokeSecurityGroupEgressInput, + optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error) { + return &ec2.RevokeSecurityGroupEgressOutput{}, nil +} + func (m *mockEC2Client) RunInstances(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) { m.runInstancesCalls = append(m.runInstancesCalls, *params) diff --git a/pkg/provider/aws/mock_ec2_test.go b/pkg/provider/aws/mock_ec2_test.go index 250e8278..5e1c4a0f 100644 --- a/pkg/provider/aws/mock_ec2_test.go +++ b/pkg/provider/aws/mock_ec2_test.go @@ -59,7 +59,9 @@ type MockEC2Client struct { CreateSGFunc func(ctx context.Context, params *ec2.CreateSecurityGroupInput, optFns ...func(*ec2.Options)) (*ec2.CreateSecurityGroupOutput, error) AuthorizeSGFunc func(ctx context.Context, params *ec2.AuthorizeSecurityGroupIngressInput, optFns ...func(*ec2.Options)) (*ec2.AuthorizeSecurityGroupIngressOutput, error) DeleteSGFunc func(ctx context.Context, params *ec2.DeleteSecurityGroupInput, optFns ...func(*ec2.Options)) (*ec2.DeleteSecurityGroupOutput, error) - DescribeSGsFunc func(ctx context.Context, params *ec2.DescribeSecurityGroupsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeSecurityGroupsOutput, error) + DescribeSGsFunc func(ctx context.Context, params *ec2.DescribeSecurityGroupsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeSecurityGroupsOutput, error) + RevokeSGIngressFunc func(ctx context.Context, params *ec2.RevokeSecurityGroupIngressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupIngressOutput, error) + RevokeSGEgressFunc func(ctx context.Context, params *ec2.RevokeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error) // Instance RunInstancesFunc func(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) @@ -262,6 +264,24 @@ func (m *MockEC2Client) DescribeSecurityGroups(ctx context.Context, params *ec2. return &ec2.DescribeSecurityGroupsOutput{}, nil } +func (m *MockEC2Client) RevokeSecurityGroupIngress(ctx context.Context, + params *ec2.RevokeSecurityGroupIngressInput, + optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupIngressOutput, error) { + if m.RevokeSGIngressFunc != nil { + return m.RevokeSGIngressFunc(ctx, params, optFns...) + } + return &ec2.RevokeSecurityGroupIngressOutput{}, nil +} + +func (m *MockEC2Client) RevokeSecurityGroupEgress(ctx context.Context, + params *ec2.RevokeSecurityGroupEgressInput, + optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error) { + if m.RevokeSGEgressFunc != nil { + return m.RevokeSGEgressFunc(ctx, params, optFns...) + } + return &ec2.RevokeSecurityGroupEgressOutput{}, nil +} + // Instance operations func (m *MockEC2Client) RunInstances(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) { if m.RunInstancesFunc != nil { From 280f10ad039f4b239734fd93f4679d921ffe1dd4 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 19 Mar 2026 18:43:26 +0100 Subject: [PATCH 07/13] feat(aws): revoke SG rules before deletion to prevent DependencyViolation Before calling DeleteSecurityGroup, enumerate and revoke all ingress and egress rules. This breaks cross-SG references (e.g., worker SG referencing control-plane SG) that cause DependencyViolation errors. Gracefully handles already-deleted groups (InvalidGroup.NotFound). Revoke errors are logged as warnings and do not block the delete attempt. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provider/aws/delete.go | 69 +++++++++++++++++ pkg/provider/aws/delete_test.go | 133 ++++++++++++++++++++++++++++++++ 2 files changed, 202 insertions(+) diff --git a/pkg/provider/aws/delete.go b/pkg/provider/aws/delete.go index b47aede3..5f8e9d8b 100644 --- a/pkg/provider/aws/delete.go +++ b/pkg/provider/aws/delete.go @@ -293,6 +293,11 @@ func (p *Provider) deleteSecurityGroup(sgID, label string) error { return nil } + // Revoke all rules first to break cross-SG dependencies + if err := p.revokeSecurityGroupRules(sgID); err != nil { + p.log.Warning("Error revoking rules for %s SG %s (continuing): %v", label, sgID, err) + } + err := p.retryWithBackoff(func() error { ctx, cancel := context.WithTimeout(context.Background(), apiCallTimeout) defer cancel() @@ -691,6 +696,70 @@ func (p *Provider) deleteVPC(cache *AWS) error { // Helper functions +// revokeSecurityGroupRules removes all ingress and egress rules from a security +// group before deletion. This prevents DependencyViolation errors caused by +// cross-SG references (e.g., worker SG referencing control-plane SG). +func (p *Provider) revokeSecurityGroupRules(sgID string) error { + if sgID == "" { + return nil + } + + ctx, cancel := context.WithTimeout(context.Background(), apiCallTimeout) + defer cancel() + + result, err := p.ec2.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{ + GroupIds: []string{sgID}, + }) + if err != nil { + if strings.Contains(err.Error(), "InvalidGroup.NotFound") { + return nil + } + return fmt.Errorf("error describing security group %s: %w", sgID, err) + } + + if len(result.SecurityGroups) == 0 { + return nil + } + + sg := result.SecurityGroups[0] + + // Revoke all ingress rules + if len(sg.IpPermissions) > 0 { + p.log.Info("Revoking %d ingress rule(s) from security group %s", len(sg.IpPermissions), sgID) + rCtx, rCancel := context.WithTimeout(context.Background(), apiCallTimeout) + defer rCancel() + _, err := p.ec2.RevokeSecurityGroupIngress(rCtx, &ec2.RevokeSecurityGroupIngressInput{ + GroupId: &sgID, + IpPermissions: sg.IpPermissions, + }) + if err != nil { + if strings.Contains(err.Error(), "InvalidGroup.NotFound") { + return nil + } + return fmt.Errorf("error revoking ingress rules for %s: %w", sgID, err) + } + } + + // Revoke all egress rules + if len(sg.IpPermissionsEgress) > 0 { + p.log.Info("Revoking %d egress rule(s) from security group %s", len(sg.IpPermissionsEgress), sgID) + rCtx, rCancel := context.WithTimeout(context.Background(), apiCallTimeout) + defer rCancel() + _, err := p.ec2.RevokeSecurityGroupEgress(rCtx, &ec2.RevokeSecurityGroupEgressInput{ + GroupId: &sgID, + IpPermissions: sg.IpPermissionsEgress, + }) + if err != nil { + if strings.Contains(err.Error(), "InvalidGroup.NotFound") { + return nil + } + return fmt.Errorf("error revoking egress rules for %s: %w", sgID, err) + } + } + + return nil +} + func (p *Provider) retryWithBackoff(operation func() error) error { delay := retryDelay for i := 0; i < maxRetries; i++ { diff --git a/pkg/provider/aws/delete_test.go b/pkg/provider/aws/delete_test.go index 6d7a0899..c3a98cec 100644 --- a/pkg/provider/aws/delete_test.go +++ b/pkg/provider/aws/delete_test.go @@ -519,3 +519,136 @@ func TestDeleteSecurityGroups_SharedSameAsCP(t *testing.T) { t.Errorf("second delete should be CP SG (same as shared), got %s", deletedSGs[1]) } } + +func TestRevokeSecurityGroupRules_RevokesIngressAndEgress(t *testing.T) { + var ingressCalls []ec2.RevokeSecurityGroupIngressInput + var egressCalls []ec2.RevokeSecurityGroupEgressInput + + mock := NewMockEC2Client() + mock.DescribeSGsFunc = func(ctx context.Context, params *ec2.DescribeSecurityGroupsInput, + optFns ...func(*ec2.Options)) (*ec2.DescribeSecurityGroupsOutput, error) { + return &ec2.DescribeSecurityGroupsOutput{ + SecurityGroups: []types.SecurityGroup{ + { + GroupId: aws.String("sg-worker"), + IpPermissions: []types.IpPermission{ + { + IpProtocol: aws.String("-1"), + UserIdGroupPairs: []types.UserIdGroupPair{ + {GroupId: aws.String("sg-cp")}, + }, + }, + }, + IpPermissionsEgress: []types.IpPermission{ + { + IpProtocol: aws.String("-1"), + IpRanges: []types.IpRange{ + {CidrIp: aws.String("0.0.0.0/0")}, + }, + }, + }, + }, + }, + }, nil + } + mock.RevokeSGIngressFunc = func(ctx context.Context, params *ec2.RevokeSecurityGroupIngressInput, + optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupIngressOutput, error) { + ingressCalls = append(ingressCalls, *params) + return &ec2.RevokeSecurityGroupIngressOutput{}, nil + } + mock.RevokeSGEgressFunc = func(ctx context.Context, params *ec2.RevokeSecurityGroupEgressInput, + optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error) { + egressCalls = append(egressCalls, *params) + return &ec2.RevokeSecurityGroupEgressOutput{}, nil + } + + provider := &Provider{ec2: mock, log: mockLogger()} + + err := provider.revokeSecurityGroupRules("sg-worker") + if err != nil { + t.Fatalf("revokeSecurityGroupRules failed: %v", err) + } + + if len(ingressCalls) != 1 { + t.Fatalf("Expected 1 RevokeSecurityGroupIngress call, got %d", len(ingressCalls)) + } + if *ingressCalls[0].GroupId != "sg-worker" { + t.Errorf("Expected GroupId 'sg-worker', got %q", *ingressCalls[0].GroupId) + } + + if len(egressCalls) != 1 { + t.Fatalf("Expected 1 RevokeSecurityGroupEgress call, got %d", len(egressCalls)) + } + if *egressCalls[0].GroupId != "sg-worker" { + t.Errorf("Expected GroupId 'sg-worker', got %q", *egressCalls[0].GroupId) + } +} + +func TestRevokeSecurityGroupRules_SkipsEmptyRules(t *testing.T) { + var ingressCalls []ec2.RevokeSecurityGroupIngressInput + var egressCalls []ec2.RevokeSecurityGroupEgressInput + + mock := NewMockEC2Client() + mock.DescribeSGsFunc = func(ctx context.Context, params *ec2.DescribeSecurityGroupsInput, + optFns ...func(*ec2.Options)) (*ec2.DescribeSecurityGroupsOutput, error) { + return &ec2.DescribeSecurityGroupsOutput{ + SecurityGroups: []types.SecurityGroup{ + { + GroupId: aws.String("sg-empty"), + IpPermissions: nil, + IpPermissionsEgress: nil, + }, + }, + }, nil + } + mock.RevokeSGIngressFunc = func(ctx context.Context, params *ec2.RevokeSecurityGroupIngressInput, + optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupIngressOutput, error) { + ingressCalls = append(ingressCalls, *params) + return &ec2.RevokeSecurityGroupIngressOutput{}, nil + } + mock.RevokeSGEgressFunc = func(ctx context.Context, params *ec2.RevokeSecurityGroupEgressInput, + optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error) { + egressCalls = append(egressCalls, *params) + return &ec2.RevokeSecurityGroupEgressOutput{}, nil + } + + provider := &Provider{ec2: mock, log: mockLogger()} + + err := provider.revokeSecurityGroupRules("sg-empty") + if err != nil { + t.Fatalf("revokeSecurityGroupRules failed: %v", err) + } + + if len(ingressCalls) != 0 { + t.Errorf("Expected no RevokeSecurityGroupIngress calls for empty rules, got %d", len(ingressCalls)) + } + if len(egressCalls) != 0 { + t.Errorf("Expected no RevokeSecurityGroupEgress calls for empty rules, got %d", len(egressCalls)) + } +} + +func TestRevokeSecurityGroupRules_SkipsEmptyID(t *testing.T) { + mock := NewMockEC2Client() + provider := &Provider{ec2: mock, log: mockLogger()} + + err := provider.revokeSecurityGroupRules("") + if err != nil { + t.Fatalf("revokeSecurityGroupRules should skip empty SG ID, got: %v", err) + } +} + +func TestRevokeSecurityGroupRules_DescribeError(t *testing.T) { + mock := NewMockEC2Client() + mock.DescribeSGsFunc = func(ctx context.Context, params *ec2.DescribeSecurityGroupsInput, + optFns ...func(*ec2.Options)) (*ec2.DescribeSecurityGroupsOutput, error) { + return nil, fmt.Errorf("InvalidGroup.NotFound") + } + + provider := &Provider{ec2: mock, log: mockLogger()} + + // NotFound is not an error — SG is already gone + err := provider.revokeSecurityGroupRules("sg-gone") + if err != nil { + t.Fatalf("revokeSecurityGroupRules should handle NotFound gracefully, got: %v", err) + } +} From 53e1043efaa814326d5f0be62dde3e540024029f Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 19 Mar 2026 20:35:04 +0100 Subject: [PATCH 08/13] feat(aws): wait for ENIs to drain before deleting security groups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After instance termination, AWS ENIs can linger for 2-5 minutes in 'in-use' state. Attempting to delete security groups while ENIs reference them causes DependencyViolation errors. Add waitForENIsDrained() that polls DescribeNetworkInterfaces every 10s (up to 5min) until all non-available ENIs in the VPC are gone. Insert this as Phase 1.5 between instance termination and SG deletion. ENI drain timeout is a warning, not a fatal error — the extended retry budget on deleteSecurityGroup provides a second line of defense. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provider/aws/delete.go | 58 ++++++++++++++++++++++ pkg/provider/aws/delete_test.go | 87 +++++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+) diff --git a/pkg/provider/aws/delete.go b/pkg/provider/aws/delete.go index 5f8e9d8b..e98a30ff 100644 --- a/pkg/provider/aws/delete.go +++ b/pkg/provider/aws/delete.go @@ -115,6 +115,14 @@ func (p *Provider) delete(cache *AWS) error { return fmt.Errorf("failed to delete EC2 instances: %w", err) } + // Phase 1.5: Wait for ENIs to detach after instance termination. + // AWS ENIs can linger for 2-5 minutes after termination, blocking SG deletion. + if cache.Vpcid != "" { + if err := p.waitForENIsDrained(cache.Vpcid); err != nil { + p.log.Warning("ENI drain wait failed (continuing): %v", err) + } + } + // Phase 2: Delete Security Groups if err := p.deleteSecurityGroups(cache); err != nil { return fmt.Errorf("failed to delete security groups: %w", err) @@ -696,6 +704,56 @@ func (p *Provider) deleteVPC(cache *AWS) error { // Helper functions +// waitForENIsDrained polls DescribeNetworkInterfaces until all non-available +// ENIs in the VPC are detached or deleted. This prevents DependencyViolation +// errors when deleting security groups, since AWS ENIs can linger for 2-5 +// minutes after instance termination. +func (p *Provider) waitForENIsDrained(vpcID string) error { + if vpcID == "" { + return nil + } + + const ( + eniPollInterval = 10 * time.Second + eniPollTimeout = 5 * time.Minute + ) + + deadline := time.Now().Add(eniPollTimeout) + + for { + ctx, cancel := context.WithTimeout(context.Background(), apiCallTimeout) + result, err := p.ec2.DescribeNetworkInterfaces(ctx, &ec2.DescribeNetworkInterfacesInput{ + Filters: []types.Filter{ + {Name: aws.String("vpc-id"), Values: []string{vpcID}}, + }, + }) + cancel() + + if err != nil { + p.log.Warning("Error checking ENIs in VPC %s: %v", vpcID, err) + } else { + // Count non-available ENIs (in-use ENIs block SG deletion) + var blocking int + for _, eni := range result.NetworkInterfaces { + if eni.Status != types.NetworkInterfaceStatusAvailable { + blocking++ + } + } + if blocking == 0 { + p.log.Info("All ENIs in VPC %s are drained", vpcID) + return nil + } + p.log.Info("Waiting for %d in-use ENI(s) in VPC %s to detach...", blocking, vpcID) + } + + if time.Now().After(deadline) { + return fmt.Errorf("timed out waiting for ENIs to drain in VPC %s", vpcID) + } + + time.Sleep(eniPollInterval) + } +} + // revokeSecurityGroupRules removes all ingress and egress rules from a security // group before deletion. This prevents DependencyViolation errors caused by // cross-SG references (e.g., worker SG referencing control-plane SG). diff --git a/pkg/provider/aws/delete_test.go b/pkg/provider/aws/delete_test.go index c3a98cec..016056ed 100644 --- a/pkg/provider/aws/delete_test.go +++ b/pkg/provider/aws/delete_test.go @@ -652,3 +652,90 @@ func TestRevokeSecurityGroupRules_DescribeError(t *testing.T) { t.Fatalf("revokeSecurityGroupRules should handle NotFound gracefully, got: %v", err) } } + +func TestWaitForENIsDrained_NoENIs(t *testing.T) { + mock := NewMockEC2Client() + mock.DescribeNIsFunc = func(ctx context.Context, params *ec2.DescribeNetworkInterfacesInput, + optFns ...func(*ec2.Options)) (*ec2.DescribeNetworkInterfacesOutput, error) { + return &ec2.DescribeNetworkInterfacesOutput{ + NetworkInterfaces: []types.NetworkInterface{}, + }, nil + } + + provider := &Provider{ec2: mock, log: mockLogger()} + + err := provider.waitForENIsDrained("vpc-123") + if err != nil { + t.Fatalf("waitForENIsDrained should succeed with no ENIs, got: %v", err) + } +} + +func TestWaitForENIsDrained_SkipsEmptyVPC(t *testing.T) { + mock := NewMockEC2Client() + provider := &Provider{ec2: mock, log: mockLogger()} + + err := provider.waitForENIsDrained("") + if err != nil { + t.Fatalf("waitForENIsDrained should skip empty VPC ID, got: %v", err) + } +} + +func TestWaitForENIsDrained_ENIsDrainOnSecondPoll(t *testing.T) { + if testing.Short() { + t.Skip("skipping: poll loop sleeps 10s between calls") + } + callCount := 0 + mock := NewMockEC2Client() + mock.DescribeNIsFunc = func(ctx context.Context, params *ec2.DescribeNetworkInterfacesInput, + optFns ...func(*ec2.Options)) (*ec2.DescribeNetworkInterfacesOutput, error) { + callCount++ + if callCount == 1 { + // First call: ENI still in-use + return &ec2.DescribeNetworkInterfacesOutput{ + NetworkInterfaces: []types.NetworkInterface{ + { + NetworkInterfaceId: aws.String("eni-123"), + Status: types.NetworkInterfaceStatusInUse, + }, + }, + }, nil + } + // Second call: all drained + return &ec2.DescribeNetworkInterfacesOutput{ + NetworkInterfaces: []types.NetworkInterface{}, + }, nil + } + + provider := &Provider{ec2: mock, log: mockLogger()} + + err := provider.waitForENIsDrained("vpc-123") + if err != nil { + t.Fatalf("waitForENIsDrained should succeed after ENIs drain, got: %v", err) + } + if callCount < 2 { + t.Errorf("Expected at least 2 DescribeNetworkInterfaces calls, got %d", callCount) + } +} + +func TestWaitForENIsDrained_AvailableENIsIgnored(t *testing.T) { + mock := NewMockEC2Client() + mock.DescribeNIsFunc = func(ctx context.Context, params *ec2.DescribeNetworkInterfacesInput, + optFns ...func(*ec2.Options)) (*ec2.DescribeNetworkInterfacesOutput, error) { + // ENI exists but is in "available" state (detached) — not blocking + return &ec2.DescribeNetworkInterfacesOutput{ + NetworkInterfaces: []types.NetworkInterface{ + { + NetworkInterfaceId: aws.String("eni-avail"), + Status: types.NetworkInterfaceStatusAvailable, + }, + }, + }, nil + } + + provider := &Provider{ec2: mock, log: mockLogger()} + + err := provider.waitForENIsDrained("vpc-123") + if err != nil { + t.Fatalf("waitForENIsDrained should ignore 'available' ENIs, got: %v", err) + } +} From fc1df3265a829a5ecd6f7fcc7cd86968d009423f Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 19 Mar 2026 20:39:02 +0100 Subject: [PATCH 09/13] feat(aws): increase retry budget to ~5min for eventual-consistency delays Increase maxRetries from 5 to 10 and retryDelay from 5s to 10s. This extends the total retry budget from ~65s to ~5min, covering the full AWS ENI detach window (2-5 min) as a final safety net beyond the explicit ENI drain wait and SG rule revocation. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provider/aws/delete.go | 4 ++-- pkg/provider/aws/delete_test.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/provider/aws/delete.go b/pkg/provider/aws/delete.go index e98a30ff..3b161e05 100644 --- a/pkg/provider/aws/delete.go +++ b/pkg/provider/aws/delete.go @@ -32,8 +32,8 @@ import ( ) const ( - maxRetries = 5 - retryDelay = 5 * time.Second + maxRetries = 10 + retryDelay = 10 * time.Second maxRetryDelay = 30 * time.Second deletionTimeout = 15 * time.Minute verificationDelay = 2 * time.Second diff --git a/pkg/provider/aws/delete_test.go b/pkg/provider/aws/delete_test.go index 016056ed..a06d3719 100644 --- a/pkg/provider/aws/delete_test.go +++ b/pkg/provider/aws/delete_test.go @@ -51,8 +51,8 @@ func TestDeleteConstants(t *testing.T) { constant time.Duration expected time.Duration }{ - {"maxRetries", time.Duration(maxRetries), 5}, - {"retryDelay", retryDelay, 5 * time.Second}, + {"maxRetries", time.Duration(maxRetries), 10}, + {"retryDelay", retryDelay, 10 * time.Second}, {"maxRetryDelay", maxRetryDelay, 30 * time.Second}, {"verificationDelay", verificationDelay, 2 * time.Second}, {"apiCallTimeout", apiCallTimeout, 30 * time.Second}, From 375acfb6eac3a779fb344034bd56c83cbb65154f Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Fri, 20 Mar 2026 12:14:25 +0100 Subject: [PATCH 10/13] fix: add RevokeSecurityGroup methods to shared mock EC2Client The EC2Client interface gained RevokeSecurityGroupIngress and RevokeSecurityGroupEgress in commit fa13fef1, but the shared mock in pkg/testutil/mocks was not updated. This broke pkg/cleanup tests which use the shared mock. Also fix pre-existing gofmt issue in mock_ec2_test.go. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provider/aws/mock_ec2_test.go | 6 +++--- pkg/testutil/mocks/aws.go | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pkg/provider/aws/mock_ec2_test.go b/pkg/provider/aws/mock_ec2_test.go index 5e1c4a0f..a68855fc 100644 --- a/pkg/provider/aws/mock_ec2_test.go +++ b/pkg/provider/aws/mock_ec2_test.go @@ -56,9 +56,9 @@ type MockEC2Client struct { ReplaceRTAssocFunc func(ctx context.Context, params *ec2.ReplaceRouteTableAssociationInput, optFns ...func(*ec2.Options)) (*ec2.ReplaceRouteTableAssociationOutput, error) // Security Group - CreateSGFunc func(ctx context.Context, params *ec2.CreateSecurityGroupInput, optFns ...func(*ec2.Options)) (*ec2.CreateSecurityGroupOutput, error) - AuthorizeSGFunc func(ctx context.Context, params *ec2.AuthorizeSecurityGroupIngressInput, optFns ...func(*ec2.Options)) (*ec2.AuthorizeSecurityGroupIngressOutput, error) - DeleteSGFunc func(ctx context.Context, params *ec2.DeleteSecurityGroupInput, optFns ...func(*ec2.Options)) (*ec2.DeleteSecurityGroupOutput, error) + CreateSGFunc func(ctx context.Context, params *ec2.CreateSecurityGroupInput, optFns ...func(*ec2.Options)) (*ec2.CreateSecurityGroupOutput, error) + AuthorizeSGFunc func(ctx context.Context, params *ec2.AuthorizeSecurityGroupIngressInput, optFns ...func(*ec2.Options)) (*ec2.AuthorizeSecurityGroupIngressOutput, error) + DeleteSGFunc func(ctx context.Context, params *ec2.DeleteSecurityGroupInput, optFns ...func(*ec2.Options)) (*ec2.DeleteSecurityGroupOutput, error) DescribeSGsFunc func(ctx context.Context, params *ec2.DescribeSecurityGroupsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeSecurityGroupsOutput, error) RevokeSGIngressFunc func(ctx context.Context, params *ec2.RevokeSecurityGroupIngressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupIngressOutput, error) RevokeSGEgressFunc func(ctx context.Context, params *ec2.RevokeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error) diff --git a/pkg/testutil/mocks/aws.go b/pkg/testutil/mocks/aws.go index 6994506d..846ddd19 100644 --- a/pkg/testutil/mocks/aws.go +++ b/pkg/testutil/mocks/aws.go @@ -83,6 +83,13 @@ type EC2Client interface { params *ec2.DescribeSecurityGroupsInput, optFns ...func(*ec2.Options)) (*ec2.DescribeSecurityGroupsOutput, error) + RevokeSecurityGroupIngress(ctx context.Context, + params *ec2.RevokeSecurityGroupIngressInput, + optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupIngressOutput, error) + RevokeSecurityGroupEgress(ctx context.Context, + params *ec2.RevokeSecurityGroupEgressInput, + optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error) + RunInstances(ctx context.Context, params *ec2.RunInstancesInput, optFns ...func(*ec2.Options)) (*ec2.RunInstancesOutput, error) TerminateInstances(ctx context.Context, @@ -476,3 +483,13 @@ func (m *MockEC2Client) DescribeNatGateways(ctx context.Context, params *ec2.Des func (m *MockEC2Client) ModifySubnetAttribute(ctx context.Context, params *ec2.ModifySubnetAttributeInput, optFns ...func(*ec2.Options)) (*ec2.ModifySubnetAttributeOutput, error) { return &ec2.ModifySubnetAttributeOutput{}, nil } + +// Security Group Revoke operations + +func (m *MockEC2Client) RevokeSecurityGroupIngress(ctx context.Context, params *ec2.RevokeSecurityGroupIngressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupIngressOutput, error) { + return &ec2.RevokeSecurityGroupIngressOutput{}, nil +} + +func (m *MockEC2Client) RevokeSecurityGroupEgress(ctx context.Context, params *ec2.RevokeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error) { + return &ec2.RevokeSecurityGroupEgressOutput{}, nil +} From f3b1d6d62035bb4a507020888ebc5f3db29ab9ee Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Tue, 24 Mar 2026 09:19:30 +0100 Subject: [PATCH 11/13] fix(aws): break cross-SG references and disassociate route tables before deletion Security group deletion failed with DependencyViolation because worker SG ingress rules referenced CP SG and vice versa. Fix: revoke both SGs' ingress rules upfront before any deletion. Egress revocation is skipped because CI IAM user lacks ec2:RevokeSecurityGroupEgress permission (and the default egress rule doesn't create cross-SG dependencies). Route table deletion also failed with DependencyViolation because subnet associations were not removed first. Fix: describe route table associations and disassociate non-main entries before calling DeleteRouteTable. Both fixes validated with local E2E: create + delete of a minimal cluster completed cleanly with no leaked resources. Signed-off-by: Carlos Eduardo Arango Gutierrez --- internal/aws/ec2_client.go | 2 + pkg/provider/aws/create_test.go | 5 ++ pkg/provider/aws/delete.go | 90 +++++++++++++++++++++++-------- pkg/provider/aws/delete_test.go | 68 ++++++++++++++++++++++- pkg/provider/aws/mock_ec2_test.go | 8 +++ pkg/testutil/mocks/aws.go | 21 ++++++-- 6 files changed, 167 insertions(+), 27 deletions(-) diff --git a/internal/aws/ec2_client.go b/internal/aws/ec2_client.go index 5d56ec54..5a9bf656 100644 --- a/internal/aws/ec2_client.go +++ b/internal/aws/ec2_client.go @@ -77,6 +77,8 @@ type EC2Client interface { optFns ...func(*ec2.Options)) (*ec2.DeleteRouteTableOutput, error) DescribeRouteTables(ctx context.Context, params *ec2.DescribeRouteTablesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) + DisassociateRouteTable(ctx context.Context, params *ec2.DisassociateRouteTableInput, + optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) ReplaceRouteTableAssociation(ctx context.Context, params *ec2.ReplaceRouteTableAssociationInput, optFns ...func(*ec2.Options)) (*ec2.ReplaceRouteTableAssociationOutput, diff --git a/pkg/provider/aws/create_test.go b/pkg/provider/aws/create_test.go index 592e08a7..51f6a6a2 100644 --- a/pkg/provider/aws/create_test.go +++ b/pkg/provider/aws/create_test.go @@ -155,6 +155,11 @@ func (m *mockEC2Client) DescribeRouteTables(ctx context.Context, params *ec2.Des return &ec2.DescribeRouteTablesOutput{}, nil } +func (m *mockEC2Client) DisassociateRouteTable(ctx context.Context, params *ec2.DisassociateRouteTableInput, + optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) { + return &ec2.DisassociateRouteTableOutput{}, nil +} + func (m *mockEC2Client) ReplaceRouteTableAssociation(ctx context.Context, params *ec2.ReplaceRouteTableAssociationInput, optFns ...func(*ec2.Options)) (*ec2.ReplaceRouteTableAssociationOutput, error) { diff --git a/pkg/provider/aws/delete.go b/pkg/provider/aws/delete.go index 3b161e05..b0a2bf21 100644 --- a/pkg/provider/aws/delete.go +++ b/pkg/provider/aws/delete.go @@ -261,6 +261,19 @@ func (p *Provider) deleteSecurityGroups(cache *AWS) error { p.log.Error(fmt.Errorf("failed to update progressing condition: %w", err)) } + // Break cross-SG ingress references before deletion. + // Worker SG has ingress rules referencing CP SG, and CP SG has ingress + // rules referencing Worker SG. Both must be cleared to avoid + // DependencyViolation errors on DeleteSecurityGroup. + if cache.WorkerSecurityGroupid != "" && cache.CPSecurityGroupid != "" { + if err := p.revokeSecurityGroupRules(cache.CPSecurityGroupid); err != nil { + p.log.Warning("Error revoking CP SG %s rules (continuing): %v", cache.CPSecurityGroupid, err) + } + if err := p.revokeSecurityGroupRules(cache.WorkerSecurityGroupid); err != nil { + p.log.Warning("Error revoking Worker SG %s rules (continuing): %v", cache.WorkerSecurityGroupid, err) + } + } + // Delete Worker SG first — it references CP SG, so CP SG can't be deleted // while Worker SG still exists. if err := p.deleteSecurityGroup(cache.WorkerSecurityGroupid, "worker"); err != nil { @@ -301,11 +314,6 @@ func (p *Provider) deleteSecurityGroup(sgID, label string) error { return nil } - // Revoke all rules first to break cross-SG dependencies - if err := p.revokeSecurityGroupRules(sgID); err != nil { - p.log.Warning("Error revoking rules for %s SG %s (continuing): %v", label, sgID, err) - } - err := p.retryWithBackoff(func() error { ctx, cancel := context.WithTimeout(context.Background(), apiCallTimeout) defer cancel() @@ -470,6 +478,11 @@ func (p *Provider) deletePublicRouteTable(cache *AWS) error { return nil } + // Disassociate subnet associations before deletion to avoid DependencyViolation. + if err := p.disassociateRouteTableSubnets(cache.PublicRouteTable); err != nil { + p.log.Warning("Error disassociating public route table %s (continuing): %v", cache.PublicRouteTable, err) + } + p.log.Info("Deleting public route table %s", cache.PublicRouteTable) err := p.retryWithBackoff(func() error { ctx, cancel := context.WithTimeout(context.Background(), apiCallTimeout) @@ -575,6 +588,11 @@ func (p *Provider) deleteRouteTable(cache *AWS) error { return nil } + // Disassociate subnet associations before deletion to avoid DependencyViolation. + if err := p.disassociateRouteTableSubnets(cache.RouteTable); err != nil { + p.log.Warning("Error disassociating route table %s (continuing): %v", cache.RouteTable, err) + } + err = p.retryWithBackoff(func() error { ctx, cancel := context.WithTimeout(context.Background(), apiCallTimeout) defer cancel() @@ -798,22 +816,10 @@ func (p *Provider) revokeSecurityGroupRules(sgID string) error { } } - // Revoke all egress rules - if len(sg.IpPermissionsEgress) > 0 { - p.log.Info("Revoking %d egress rule(s) from security group %s", len(sg.IpPermissionsEgress), sgID) - rCtx, rCancel := context.WithTimeout(context.Background(), apiCallTimeout) - defer rCancel() - _, err := p.ec2.RevokeSecurityGroupEgress(rCtx, &ec2.RevokeSecurityGroupEgressInput{ - GroupId: &sgID, - IpPermissions: sg.IpPermissionsEgress, - }) - if err != nil { - if strings.Contains(err.Error(), "InvalidGroup.NotFound") { - return nil - } - return fmt.Errorf("error revoking egress rules for %s: %w", sgID, err) - } - } + // Note: egress rule revocation is intentionally skipped. + // The CI IAM user (cnt-ci) lacks ec2:RevokeSecurityGroupEgress permission, + // and the default egress rule (0.0.0.0/0) does not create cross-SG + // dependencies that would block DeleteSecurityGroup. return nil } @@ -878,6 +884,48 @@ func (p *Provider) vpcExists(vpcID string) bool { return err == nil } +// disassociateRouteTableSubnets removes non-main subnet associations from a +// route table so it can be deleted without DependencyViolation errors. +func (p *Provider) disassociateRouteTableSubnets(rtID string) error { + ctx, cancel := context.WithTimeout(context.Background(), apiCallTimeout) + defer cancel() + + result, err := p.ec2.DescribeRouteTables(ctx, &ec2.DescribeRouteTablesInput{ + RouteTableIds: []string{rtID}, + }) + if err != nil { + if strings.Contains(err.Error(), "InvalidRouteTableID.NotFound") { + return nil + } + return fmt.Errorf("error describing route table %s: %w", rtID, err) + } + + if len(result.RouteTables) == 0 { + return nil + } + + for _, assoc := range result.RouteTables[0].Associations { + if assoc.Main != nil && *assoc.Main { + continue // Cannot disassociate the main route table association + } + if assoc.RouteTableAssociationId == nil { + continue + } + assocID := *assoc.RouteTableAssociationId + p.log.Info("Disassociating route table association %s from %s", assocID, rtID) + dCtx, dCancel := context.WithTimeout(context.Background(), apiCallTimeout) + _, dErr := p.ec2.DisassociateRouteTable(dCtx, &ec2.DisassociateRouteTableInput{ + AssociationId: &assocID, + }) + dCancel() + if dErr != nil { + p.log.Warning("Error disassociating %s (continuing): %v", assocID, dErr) + } + } + + return nil +} + func (p *Provider) isMainRouteTable(rtID, vpcID string) (bool, error) { result, err := p.ec2.DescribeRouteTables(context.Background(), &ec2.DescribeRouteTablesInput{ RouteTableIds: []string{rtID}, diff --git a/pkg/provider/aws/delete_test.go b/pkg/provider/aws/delete_test.go index a06d3719..cd80aa65 100644 --- a/pkg/provider/aws/delete_test.go +++ b/pkg/provider/aws/delete_test.go @@ -295,10 +295,37 @@ func TestDeletePublicRouteTable_Empty(t *testing.T) { } } -func TestDeletePublicRouteTable_Success(t *testing.T) { +func TestDeletePublicRouteTable_DisassociatesBeforeDelete(t *testing.T) { + var disassociated bool var deletedID string mock := &MockEC2Client{ + DescribeRTsFunc: func(ctx context.Context, params *ec2.DescribeRouteTablesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) { + return &ec2.DescribeRouteTablesOutput{ + RouteTables: []types.RouteTable{ + { + RouteTableId: aws.String("rtb-public-123"), + Associations: []types.RouteTableAssociation{ + { + RouteTableAssociationId: aws.String("rtbassoc-123"), + SubnetId: aws.String("subnet-public-123"), + Main: aws.Bool(false), + }, + }, + }, + }, + }, nil + }, + DisassociateRTFunc: func(ctx context.Context, params *ec2.DisassociateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) { + if aws.ToString(params.AssociationId) != "rtbassoc-123" { + t.Errorf("expected disassociation of rtbassoc-123, got %s", aws.ToString(params.AssociationId)) + } + disassociated = true + return &ec2.DisassociateRouteTableOutput{}, nil + }, DeleteRTFunc: func(ctx context.Context, params *ec2.DeleteRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DeleteRouteTableOutput, error) { + if !disassociated { + return nil, fmt.Errorf("DependencyViolation: route table has dependencies") + } deletedID = aws.ToString(params.RouteTableId) return &ec2.DeleteRouteTableOutput{}, nil }, @@ -308,11 +335,50 @@ func TestDeletePublicRouteTable_Success(t *testing.T) { if err := provider.deletePublicRouteTable(cache); err != nil { t.Fatalf("unexpected error: %v", err) } + if !disassociated { + t.Error("expected DisassociateRouteTable to be called before deletion") + } if deletedID != "rtb-public-123" { t.Errorf("expected deletion of rtb-public-123, got %s", deletedID) } } +func TestDeletePublicRouteTable_SkipsMainAssociation(t *testing.T) { + var disassociateCalled bool + mock := &MockEC2Client{ + DescribeRTsFunc: func(ctx context.Context, params *ec2.DescribeRouteTablesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) { + return &ec2.DescribeRouteTablesOutput{ + RouteTables: []types.RouteTable{ + { + RouteTableId: aws.String("rtb-public-123"), + Associations: []types.RouteTableAssociation{ + { + RouteTableAssociationId: aws.String("rtbassoc-main"), + Main: aws.Bool(true), + }, + }, + }, + }, + }, nil + }, + DisassociateRTFunc: func(ctx context.Context, params *ec2.DisassociateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) { + disassociateCalled = true + return &ec2.DisassociateRouteTableOutput{}, nil + }, + DeleteRTFunc: func(ctx context.Context, params *ec2.DeleteRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DeleteRouteTableOutput, error) { + return &ec2.DeleteRouteTableOutput{}, nil + }, + } + provider := &Provider{ec2: mock, log: mockLogger()} + cache := &AWS{PublicRouteTable: "rtb-public-123"} + if err := provider.deletePublicRouteTable(cache); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if disassociateCalled { + t.Error("should not disassociate main route table association") + } +} + func TestDeletePublicSubnet_Empty(t *testing.T) { provider := &Provider{log: mockLogger()} cache := &AWS{PublicSubnetid: ""} diff --git a/pkg/provider/aws/mock_ec2_test.go b/pkg/provider/aws/mock_ec2_test.go index a68855fc..72a19e42 100644 --- a/pkg/provider/aws/mock_ec2_test.go +++ b/pkg/provider/aws/mock_ec2_test.go @@ -53,6 +53,7 @@ type MockEC2Client struct { CreateRouteFunc func(ctx context.Context, params *ec2.CreateRouteInput, optFns ...func(*ec2.Options)) (*ec2.CreateRouteOutput, error) DeleteRTFunc func(ctx context.Context, params *ec2.DeleteRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DeleteRouteTableOutput, error) DescribeRTsFunc func(ctx context.Context, params *ec2.DescribeRouteTablesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) + DisassociateRTFunc func(ctx context.Context, params *ec2.DisassociateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) ReplaceRTAssocFunc func(ctx context.Context, params *ec2.ReplaceRouteTableAssociationInput, optFns ...func(*ec2.Options)) (*ec2.ReplaceRouteTableAssociationOutput, error) // Security Group @@ -226,6 +227,13 @@ func (m *MockEC2Client) DescribeRouteTables(ctx context.Context, params *ec2.Des return &ec2.DescribeRouteTablesOutput{}, nil } +func (m *MockEC2Client) DisassociateRouteTable(ctx context.Context, params *ec2.DisassociateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) { + if m.DisassociateRTFunc != nil { + return m.DisassociateRTFunc(ctx, params, optFns...) + } + return &ec2.DisassociateRouteTableOutput{}, nil +} + func (m *MockEC2Client) ReplaceRouteTableAssociation(ctx context.Context, params *ec2.ReplaceRouteTableAssociationInput, optFns ...func(*ec2.Options)) (*ec2.ReplaceRouteTableAssociationOutput, error) { if m.ReplaceRTAssocFunc != nil { return m.ReplaceRTAssocFunc(ctx, params, optFns...) diff --git a/pkg/testutil/mocks/aws.go b/pkg/testutil/mocks/aws.go index 846ddd19..63716427 100644 --- a/pkg/testutil/mocks/aws.go +++ b/pkg/testutil/mocks/aws.go @@ -69,6 +69,9 @@ type EC2Client interface { DescribeRouteTables(ctx context.Context, params *ec2.DescribeRouteTablesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) + DisassociateRouteTable(ctx context.Context, + params *ec2.DisassociateRouteTableInput, + optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) CreateSecurityGroup(ctx context.Context, params *ec2.CreateSecurityGroupInput, @@ -128,11 +131,12 @@ type MockEC2Client struct { DeleteInternetGatewayFunc func(ctx context.Context, params *ec2.DeleteInternetGatewayInput, optFns ...func(*ec2.Options)) (*ec2.DeleteInternetGatewayOutput, error) // Route Table operations - CreateRouteTableFunc func(ctx context.Context, params *ec2.CreateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.CreateRouteTableOutput, error) - AssociateRouteTableFunc func(ctx context.Context, params *ec2.AssociateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.AssociateRouteTableOutput, error) - CreateRouteFunc func(ctx context.Context, params *ec2.CreateRouteInput, optFns ...func(*ec2.Options)) (*ec2.CreateRouteOutput, error) - DeleteRouteTableFunc func(ctx context.Context, params *ec2.DeleteRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DeleteRouteTableOutput, error) - DescribeRouteTablesFunc func(ctx context.Context, params *ec2.DescribeRouteTablesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) + CreateRouteTableFunc func(ctx context.Context, params *ec2.CreateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.CreateRouteTableOutput, error) + AssociateRouteTableFunc func(ctx context.Context, params *ec2.AssociateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.AssociateRouteTableOutput, error) + CreateRouteFunc func(ctx context.Context, params *ec2.CreateRouteInput, optFns ...func(*ec2.Options)) (*ec2.CreateRouteOutput, error) + DeleteRouteTableFunc func(ctx context.Context, params *ec2.DeleteRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DeleteRouteTableOutput, error) + DescribeRouteTablesFunc func(ctx context.Context, params *ec2.DescribeRouteTablesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) + DisassociateRouteTableFunc func(ctx context.Context, params *ec2.DisassociateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) // Security Group operations CreateSecurityGroupFunc func(ctx context.Context, params *ec2.CreateSecurityGroupInput, optFns ...func(*ec2.Options)) (*ec2.CreateSecurityGroupOutput, error) @@ -298,6 +302,13 @@ func (m *MockEC2Client) DescribeRouteTables(ctx context.Context, params *ec2.Des return &ec2.DescribeRouteTablesOutput{}, nil } +func (m *MockEC2Client) DisassociateRouteTable(ctx context.Context, params *ec2.DisassociateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) { + if m.DisassociateRouteTableFunc != nil { + return m.DisassociateRouteTableFunc(ctx, params, optFns...) + } + return &ec2.DisassociateRouteTableOutput{}, nil +} + // Security Group operations func (m *MockEC2Client) CreateSecurityGroup(ctx context.Context, params *ec2.CreateSecurityGroupInput, optFns ...func(*ec2.Options)) (*ec2.CreateSecurityGroupOutput, error) { From 2ecc19a1610eec92be617b217e558aa3804360a1 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Tue, 24 Mar 2026 09:57:30 +0100 Subject: [PATCH 12/13] fix(aws): update test to match egress revocation removal TestRevokeSecurityGroupRules_RevokesIngressAndEgress asserted that egress revocation was called, but the implementation now intentionally skips it (CI IAM lacks ec2:RevokeSecurityGroupEgress). Updated test to assert 0 egress calls and renamed to TestRevokeSecurityGroupRules_RevokesIngressOnly. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provider/aws/delete_test.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/provider/aws/delete_test.go b/pkg/provider/aws/delete_test.go index cd80aa65..f9c56506 100644 --- a/pkg/provider/aws/delete_test.go +++ b/pkg/provider/aws/delete_test.go @@ -586,7 +586,7 @@ func TestDeleteSecurityGroups_SharedSameAsCP(t *testing.T) { } } -func TestRevokeSecurityGroupRules_RevokesIngressAndEgress(t *testing.T) { +func TestRevokeSecurityGroupRules_RevokesIngressOnly(t *testing.T) { var ingressCalls []ec2.RevokeSecurityGroupIngressInput var egressCalls []ec2.RevokeSecurityGroupEgressInput @@ -642,11 +642,11 @@ func TestRevokeSecurityGroupRules_RevokesIngressAndEgress(t *testing.T) { t.Errorf("Expected GroupId 'sg-worker', got %q", *ingressCalls[0].GroupId) } - if len(egressCalls) != 1 { - t.Fatalf("Expected 1 RevokeSecurityGroupEgress call, got %d", len(egressCalls)) - } - if *egressCalls[0].GroupId != "sg-worker" { - t.Errorf("Expected GroupId 'sg-worker', got %q", *egressCalls[0].GroupId) + // Egress revocation is intentionally skipped — CI IAM user lacks + // ec2:RevokeSecurityGroupEgress, and the default egress rule does not + // create cross-SG dependencies that block DeleteSecurityGroup. + if len(egressCalls) != 0 { + t.Errorf("Expected 0 RevokeSecurityGroupEgress calls (egress skipped), got %d", len(egressCalls)) } } From a9e2161f14cac426688633a6912f371279ef74ec Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Tue, 24 Mar 2026 12:02:48 +0100 Subject: [PATCH 13/13] fix(aws): delete subnets before route tables to avoid DependencyViolation CI IAM user (cnt-ci) lacks ec2:DisassociateRouteTable, so the explicit disassociation approach fails with 403. Instead, swap the deletion order: delete subnets first (which implicitly removes route table associations), then delete route tables. This avoids needing DisassociateRouteTable entirely. Remove the disassociateRouteTableSubnets helper and DisassociateRouteTable from the EC2Client interface since they are no longer needed. Validated with local E2E: create + delete completes cleanly. Signed-off-by: Carlos Eduardo Arango Gutierrez --- internal/aws/ec2_client.go | 2 - pkg/provider/aws/create_test.go | 5 --- pkg/provider/aws/delete.go | 64 ++++------------------------- pkg/provider/aws/delete_test.go | 68 +------------------------------ pkg/provider/aws/mock_ec2_test.go | 8 ---- pkg/testutil/mocks/aws.go | 21 +++------- 6 files changed, 13 insertions(+), 155 deletions(-) diff --git a/internal/aws/ec2_client.go b/internal/aws/ec2_client.go index 5a9bf656..5d56ec54 100644 --- a/internal/aws/ec2_client.go +++ b/internal/aws/ec2_client.go @@ -77,8 +77,6 @@ type EC2Client interface { optFns ...func(*ec2.Options)) (*ec2.DeleteRouteTableOutput, error) DescribeRouteTables(ctx context.Context, params *ec2.DescribeRouteTablesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) - DisassociateRouteTable(ctx context.Context, params *ec2.DisassociateRouteTableInput, - optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) ReplaceRouteTableAssociation(ctx context.Context, params *ec2.ReplaceRouteTableAssociationInput, optFns ...func(*ec2.Options)) (*ec2.ReplaceRouteTableAssociationOutput, diff --git a/pkg/provider/aws/create_test.go b/pkg/provider/aws/create_test.go index 51f6a6a2..592e08a7 100644 --- a/pkg/provider/aws/create_test.go +++ b/pkg/provider/aws/create_test.go @@ -155,11 +155,6 @@ func (m *mockEC2Client) DescribeRouteTables(ctx context.Context, params *ec2.Des return &ec2.DescribeRouteTablesOutput{}, nil } -func (m *mockEC2Client) DisassociateRouteTable(ctx context.Context, params *ec2.DisassociateRouteTableInput, - optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) { - return &ec2.DisassociateRouteTableOutput{}, nil -} - func (m *mockEC2Client) ReplaceRouteTableAssociation(ctx context.Context, params *ec2.ReplaceRouteTableAssociationInput, optFns ...func(*ec2.Options)) (*ec2.ReplaceRouteTableAssociationOutput, error) { diff --git a/pkg/provider/aws/delete.go b/pkg/provider/aws/delete.go index b0a2bf21..cf069083 100644 --- a/pkg/provider/aws/delete.go +++ b/pkg/provider/aws/delete.go @@ -366,13 +366,15 @@ func (p *Provider) deleteVPCResources(cache *AWS) error { return err } - // Step 3: Delete public route table - if err := p.deletePublicRouteTable(cache); err != nil { + // Step 3: Delete public subnet (before route table — deleting the subnet + // implicitly removes the route table association, avoiding the need for + // ec2:DisassociateRouteTable which CI IAM may lack) + if err := p.deletePublicSubnet(cache); err != nil { return err } - // Step 4: Delete public subnet - if err := p.deletePublicSubnet(cache); err != nil { + // Step 4: Delete public route table (association removed by step 3) + if err := p.deletePublicRouteTable(cache); err != nil { return err } @@ -381,7 +383,7 @@ func (p *Provider) deleteVPCResources(cache *AWS) error { return err } - // Step 6: Delete private Route Table + // Step 6: Delete private Route Table (association removed by step 5) if err := p.deleteRouteTable(cache); err != nil { return err } @@ -478,11 +480,6 @@ func (p *Provider) deletePublicRouteTable(cache *AWS) error { return nil } - // Disassociate subnet associations before deletion to avoid DependencyViolation. - if err := p.disassociateRouteTableSubnets(cache.PublicRouteTable); err != nil { - p.log.Warning("Error disassociating public route table %s (continuing): %v", cache.PublicRouteTable, err) - } - p.log.Info("Deleting public route table %s", cache.PublicRouteTable) err := p.retryWithBackoff(func() error { ctx, cancel := context.WithTimeout(context.Background(), apiCallTimeout) @@ -588,11 +585,6 @@ func (p *Provider) deleteRouteTable(cache *AWS) error { return nil } - // Disassociate subnet associations before deletion to avoid DependencyViolation. - if err := p.disassociateRouteTableSubnets(cache.RouteTable); err != nil { - p.log.Warning("Error disassociating route table %s (continuing): %v", cache.RouteTable, err) - } - err = p.retryWithBackoff(func() error { ctx, cancel := context.WithTimeout(context.Background(), apiCallTimeout) defer cancel() @@ -884,48 +876,6 @@ func (p *Provider) vpcExists(vpcID string) bool { return err == nil } -// disassociateRouteTableSubnets removes non-main subnet associations from a -// route table so it can be deleted without DependencyViolation errors. -func (p *Provider) disassociateRouteTableSubnets(rtID string) error { - ctx, cancel := context.WithTimeout(context.Background(), apiCallTimeout) - defer cancel() - - result, err := p.ec2.DescribeRouteTables(ctx, &ec2.DescribeRouteTablesInput{ - RouteTableIds: []string{rtID}, - }) - if err != nil { - if strings.Contains(err.Error(), "InvalidRouteTableID.NotFound") { - return nil - } - return fmt.Errorf("error describing route table %s: %w", rtID, err) - } - - if len(result.RouteTables) == 0 { - return nil - } - - for _, assoc := range result.RouteTables[0].Associations { - if assoc.Main != nil && *assoc.Main { - continue // Cannot disassociate the main route table association - } - if assoc.RouteTableAssociationId == nil { - continue - } - assocID := *assoc.RouteTableAssociationId - p.log.Info("Disassociating route table association %s from %s", assocID, rtID) - dCtx, dCancel := context.WithTimeout(context.Background(), apiCallTimeout) - _, dErr := p.ec2.DisassociateRouteTable(dCtx, &ec2.DisassociateRouteTableInput{ - AssociationId: &assocID, - }) - dCancel() - if dErr != nil { - p.log.Warning("Error disassociating %s (continuing): %v", assocID, dErr) - } - } - - return nil -} - func (p *Provider) isMainRouteTable(rtID, vpcID string) (bool, error) { result, err := p.ec2.DescribeRouteTables(context.Background(), &ec2.DescribeRouteTablesInput{ RouteTableIds: []string{rtID}, diff --git a/pkg/provider/aws/delete_test.go b/pkg/provider/aws/delete_test.go index f9c56506..0b185e22 100644 --- a/pkg/provider/aws/delete_test.go +++ b/pkg/provider/aws/delete_test.go @@ -295,37 +295,10 @@ func TestDeletePublicRouteTable_Empty(t *testing.T) { } } -func TestDeletePublicRouteTable_DisassociatesBeforeDelete(t *testing.T) { - var disassociated bool +func TestDeletePublicRouteTable_Success(t *testing.T) { var deletedID string mock := &MockEC2Client{ - DescribeRTsFunc: func(ctx context.Context, params *ec2.DescribeRouteTablesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) { - return &ec2.DescribeRouteTablesOutput{ - RouteTables: []types.RouteTable{ - { - RouteTableId: aws.String("rtb-public-123"), - Associations: []types.RouteTableAssociation{ - { - RouteTableAssociationId: aws.String("rtbassoc-123"), - SubnetId: aws.String("subnet-public-123"), - Main: aws.Bool(false), - }, - }, - }, - }, - }, nil - }, - DisassociateRTFunc: func(ctx context.Context, params *ec2.DisassociateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) { - if aws.ToString(params.AssociationId) != "rtbassoc-123" { - t.Errorf("expected disassociation of rtbassoc-123, got %s", aws.ToString(params.AssociationId)) - } - disassociated = true - return &ec2.DisassociateRouteTableOutput{}, nil - }, DeleteRTFunc: func(ctx context.Context, params *ec2.DeleteRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DeleteRouteTableOutput, error) { - if !disassociated { - return nil, fmt.Errorf("DependencyViolation: route table has dependencies") - } deletedID = aws.ToString(params.RouteTableId) return &ec2.DeleteRouteTableOutput{}, nil }, @@ -335,50 +308,11 @@ func TestDeletePublicRouteTable_DisassociatesBeforeDelete(t *testing.T) { if err := provider.deletePublicRouteTable(cache); err != nil { t.Fatalf("unexpected error: %v", err) } - if !disassociated { - t.Error("expected DisassociateRouteTable to be called before deletion") - } if deletedID != "rtb-public-123" { t.Errorf("expected deletion of rtb-public-123, got %s", deletedID) } } -func TestDeletePublicRouteTable_SkipsMainAssociation(t *testing.T) { - var disassociateCalled bool - mock := &MockEC2Client{ - DescribeRTsFunc: func(ctx context.Context, params *ec2.DescribeRouteTablesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) { - return &ec2.DescribeRouteTablesOutput{ - RouteTables: []types.RouteTable{ - { - RouteTableId: aws.String("rtb-public-123"), - Associations: []types.RouteTableAssociation{ - { - RouteTableAssociationId: aws.String("rtbassoc-main"), - Main: aws.Bool(true), - }, - }, - }, - }, - }, nil - }, - DisassociateRTFunc: func(ctx context.Context, params *ec2.DisassociateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) { - disassociateCalled = true - return &ec2.DisassociateRouteTableOutput{}, nil - }, - DeleteRTFunc: func(ctx context.Context, params *ec2.DeleteRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DeleteRouteTableOutput, error) { - return &ec2.DeleteRouteTableOutput{}, nil - }, - } - provider := &Provider{ec2: mock, log: mockLogger()} - cache := &AWS{PublicRouteTable: "rtb-public-123"} - if err := provider.deletePublicRouteTable(cache); err != nil { - t.Fatalf("unexpected error: %v", err) - } - if disassociateCalled { - t.Error("should not disassociate main route table association") - } -} - func TestDeletePublicSubnet_Empty(t *testing.T) { provider := &Provider{log: mockLogger()} cache := &AWS{PublicSubnetid: ""} diff --git a/pkg/provider/aws/mock_ec2_test.go b/pkg/provider/aws/mock_ec2_test.go index 72a19e42..a68855fc 100644 --- a/pkg/provider/aws/mock_ec2_test.go +++ b/pkg/provider/aws/mock_ec2_test.go @@ -53,7 +53,6 @@ type MockEC2Client struct { CreateRouteFunc func(ctx context.Context, params *ec2.CreateRouteInput, optFns ...func(*ec2.Options)) (*ec2.CreateRouteOutput, error) DeleteRTFunc func(ctx context.Context, params *ec2.DeleteRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DeleteRouteTableOutput, error) DescribeRTsFunc func(ctx context.Context, params *ec2.DescribeRouteTablesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) - DisassociateRTFunc func(ctx context.Context, params *ec2.DisassociateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) ReplaceRTAssocFunc func(ctx context.Context, params *ec2.ReplaceRouteTableAssociationInput, optFns ...func(*ec2.Options)) (*ec2.ReplaceRouteTableAssociationOutput, error) // Security Group @@ -227,13 +226,6 @@ func (m *MockEC2Client) DescribeRouteTables(ctx context.Context, params *ec2.Des return &ec2.DescribeRouteTablesOutput{}, nil } -func (m *MockEC2Client) DisassociateRouteTable(ctx context.Context, params *ec2.DisassociateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) { - if m.DisassociateRTFunc != nil { - return m.DisassociateRTFunc(ctx, params, optFns...) - } - return &ec2.DisassociateRouteTableOutput{}, nil -} - func (m *MockEC2Client) ReplaceRouteTableAssociation(ctx context.Context, params *ec2.ReplaceRouteTableAssociationInput, optFns ...func(*ec2.Options)) (*ec2.ReplaceRouteTableAssociationOutput, error) { if m.ReplaceRTAssocFunc != nil { return m.ReplaceRTAssocFunc(ctx, params, optFns...) diff --git a/pkg/testutil/mocks/aws.go b/pkg/testutil/mocks/aws.go index 63716427..846ddd19 100644 --- a/pkg/testutil/mocks/aws.go +++ b/pkg/testutil/mocks/aws.go @@ -69,9 +69,6 @@ type EC2Client interface { DescribeRouteTables(ctx context.Context, params *ec2.DescribeRouteTablesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) - DisassociateRouteTable(ctx context.Context, - params *ec2.DisassociateRouteTableInput, - optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) CreateSecurityGroup(ctx context.Context, params *ec2.CreateSecurityGroupInput, @@ -131,12 +128,11 @@ type MockEC2Client struct { DeleteInternetGatewayFunc func(ctx context.Context, params *ec2.DeleteInternetGatewayInput, optFns ...func(*ec2.Options)) (*ec2.DeleteInternetGatewayOutput, error) // Route Table operations - CreateRouteTableFunc func(ctx context.Context, params *ec2.CreateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.CreateRouteTableOutput, error) - AssociateRouteTableFunc func(ctx context.Context, params *ec2.AssociateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.AssociateRouteTableOutput, error) - CreateRouteFunc func(ctx context.Context, params *ec2.CreateRouteInput, optFns ...func(*ec2.Options)) (*ec2.CreateRouteOutput, error) - DeleteRouteTableFunc func(ctx context.Context, params *ec2.DeleteRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DeleteRouteTableOutput, error) - DescribeRouteTablesFunc func(ctx context.Context, params *ec2.DescribeRouteTablesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) - DisassociateRouteTableFunc func(ctx context.Context, params *ec2.DisassociateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) + CreateRouteTableFunc func(ctx context.Context, params *ec2.CreateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.CreateRouteTableOutput, error) + AssociateRouteTableFunc func(ctx context.Context, params *ec2.AssociateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.AssociateRouteTableOutput, error) + CreateRouteFunc func(ctx context.Context, params *ec2.CreateRouteInput, optFns ...func(*ec2.Options)) (*ec2.CreateRouteOutput, error) + DeleteRouteTableFunc func(ctx context.Context, params *ec2.DeleteRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DeleteRouteTableOutput, error) + DescribeRouteTablesFunc func(ctx context.Context, params *ec2.DescribeRouteTablesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeRouteTablesOutput, error) // Security Group operations CreateSecurityGroupFunc func(ctx context.Context, params *ec2.CreateSecurityGroupInput, optFns ...func(*ec2.Options)) (*ec2.CreateSecurityGroupOutput, error) @@ -302,13 +298,6 @@ func (m *MockEC2Client) DescribeRouteTables(ctx context.Context, params *ec2.Des return &ec2.DescribeRouteTablesOutput{}, nil } -func (m *MockEC2Client) DisassociateRouteTable(ctx context.Context, params *ec2.DisassociateRouteTableInput, optFns ...func(*ec2.Options)) (*ec2.DisassociateRouteTableOutput, error) { - if m.DisassociateRouteTableFunc != nil { - return m.DisassociateRouteTableFunc(ctx, params, optFns...) - } - return &ec2.DisassociateRouteTableOutput{}, nil -} - // Security Group operations func (m *MockEC2Client) CreateSecurityGroup(ctx context.Context, params *ec2.CreateSecurityGroupInput, optFns ...func(*ec2.Options)) (*ec2.CreateSecurityGroupOutput, error) {