From f4f54762ba33b1401aab8ddddeafe8f678bdd9ef Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Tue, 31 Mar 2026 22:46:57 +0200 Subject: [PATCH] fix: revoke cross-referencing SG rules before deletion in cleanup Security groups that reference each other (e.g., CP SG allows traffic from Worker SG and vice versa) cannot be deleted due to AWS DependencyViolation errors. Revoke all ingress/egress rules before attempting deletion to break the circular dependency. This fixes the periodic cleanup failing on 2 remaining VPCs in us-west-1 that had cross-referencing CP/Worker security groups. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/cleanup/cleanup.go | 29 +++++++++++ pkg/cleanup/cleanup_ginkgo_test.go | 84 ++++++++++++++++++++++++++++++ pkg/testutil/mocks/aws.go | 10 ++++ 3 files changed, 123 insertions(+) diff --git a/pkg/cleanup/cleanup.go b/pkg/cleanup/cleanup.go index d221e7ab6..eabdef63c 100644 --- a/pkg/cleanup/cleanup.go +++ b/pkg/cleanup/cleanup.go @@ -442,6 +442,35 @@ func (c *Cleaner) deleteSecurityGroups(ctx context.Context, vpcID string) error } } + // Revoke all ingress/egress rules to break cross-SG references. + // Without this, SGs that reference each other (e.g., CP allows traffic + // from Worker and vice versa) cannot be deleted due to DependencyViolation. + for _, sg := range nonDefaultSGs { + if err := ctx.Err(); err != nil { + return fmt.Errorf("context cancelled during SG rule revocation: %w", err) + } + + if len(sg.IpPermissions) > 0 { + _, err := c.ec2.RevokeSecurityGroupIngress(ctx, &ec2.RevokeSecurityGroupIngressInput{ + GroupId: sg.GroupId, + IpPermissions: sg.IpPermissions, + }) + if err != nil { + c.log.Warning("Failed to revoke ingress rules for %s: %v", safeString(sg.GroupId), err) + } + } + + if len(sg.IpPermissionsEgress) > 0 { + _, err := c.ec2.RevokeSecurityGroupEgress(ctx, &ec2.RevokeSecurityGroupEgressInput{ + GroupId: sg.GroupId, + IpPermissions: sg.IpPermissionsEgress, + }) + if err != nil { + c.log.Warning("Failed to revoke egress rules for %s: %v", safeString(sg.GroupId), err) + } + } + } + // Delete non-default security groups for _, sg := range nonDefaultSGs { deleteInput := &ec2.DeleteSecurityGroupInput{ diff --git a/pkg/cleanup/cleanup_ginkgo_test.go b/pkg/cleanup/cleanup_ginkgo_test.go index c03806755..6178441cf 100644 --- a/pkg/cleanup/cleanup_ginkgo_test.go +++ b/pkg/cleanup/cleanup_ginkgo_test.go @@ -530,6 +530,90 @@ var _ = Describe("Cleanup Package", func() { Expect(deletedSGs).To(ConsistOf("sg-custom")) }) + It("should revoke cross-referencing SG rules before deleting", func() { + revokedIngress := []string{} + revokedEgress := []string{} + deletedSGs := []string{} + + mockEC.DescribeSecurityGroupsFunc = func(ctx context.Context, + params *ec2.DescribeSecurityGroupsInput, + optFns ...func(*ec2.Options)) (*ec2.DescribeSecurityGroupsOutput, error) { + return &ec2.DescribeSecurityGroupsOutput{ + SecurityGroups: []types.SecurityGroup{ + { + GroupId: aws.String("sg-default"), + GroupName: aws.String("default"), + }, + { + GroupId: aws.String("sg-cp"), + GroupName: aws.String("holodeck-cp"), + IpPermissions: []types.IpPermission{ + { + IpProtocol: aws.String("-1"), + UserIdGroupPairs: []types.UserIdGroupPair{{GroupId: aws.String("sg-worker")}}, + }, + }, + IpPermissionsEgress: []types.IpPermission{ + { + IpProtocol: aws.String("-1"), + IpRanges: []types.IpRange{{CidrIp: aws.String("0.0.0.0/0")}}, + }, + }, + }, + { + GroupId: aws.String("sg-worker"), + GroupName: aws.String("holodeck-worker"), + IpPermissions: []types.IpPermission{ + { + IpProtocol: aws.String("-1"), + UserIdGroupPairs: []types.UserIdGroupPair{{GroupId: aws.String("sg-cp")}}, + }, + }, + IpPermissionsEgress: []types.IpPermission{ + { + IpProtocol: aws.String("-1"), + IpRanges: []types.IpRange{{CidrIp: aws.String("0.0.0.0/0")}}, + }, + }, + }, + }, + }, nil + } + mockEC.DescribeNetworkInterfacesFunc = func(ctx context.Context, + params *ec2.DescribeNetworkInterfacesInput, + optFns ...func(*ec2.Options)) (*ec2.DescribeNetworkInterfacesOutput, error) { + return &ec2.DescribeNetworkInterfacesOutput{}, nil + } + mockEC.RevokeSecurityGroupIngressFunc = func(ctx context.Context, + params *ec2.RevokeSecurityGroupIngressInput, + optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupIngressOutput, error) { + revokedIngress = append(revokedIngress, *params.GroupId) + return &ec2.RevokeSecurityGroupIngressOutput{}, nil + } + mockEC.RevokeSecurityGroupEgressFunc = func(ctx context.Context, + params *ec2.RevokeSecurityGroupEgressInput, + optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error) { + revokedEgress = append(revokedEgress, *params.GroupId) + return &ec2.RevokeSecurityGroupEgressOutput{}, nil + } + mockEC.DeleteSecurityGroupFunc = func(ctx context.Context, + params *ec2.DeleteSecurityGroupInput, + optFns ...func(*ec2.Options)) (*ec2.DeleteSecurityGroupOutput, error) { + deletedSGs = append(deletedSGs, *params.GroupId) + return &ec2.DeleteSecurityGroupOutput{}, nil + } + + cleaner, err := New(log, "us-west-2", WithEC2Client(mockEC)) + Expect(err).NotTo(HaveOccurred()) + + err = cleaner.DeleteVPCResources(context.Background(), "vpc-12345") + Expect(err).NotTo(HaveOccurred()) + + Expect(revokedIngress).To(ConsistOf("sg-cp", "sg-worker")) + Expect(revokedEgress).To(ConsistOf("sg-cp", "sg-worker")) + Expect(deletedSGs).To(ConsistOf("sg-cp", "sg-worker")) + }) + It("should handle VPC deletion failure with retries", func() { deleteAttempts := 0 mockEC.DeleteVpcFunc = func(ctx context.Context, diff --git a/pkg/testutil/mocks/aws.go b/pkg/testutil/mocks/aws.go index 846ddd190..406f3d027 100644 --- a/pkg/testutil/mocks/aws.go +++ b/pkg/testutil/mocks/aws.go @@ -156,6 +156,10 @@ type MockEC2Client struct { DescribeInternetGatewaysFunc func(ctx context.Context, params *ec2.DescribeInternetGatewaysInput, optFns ...func(*ec2.Options)) (*ec2.DescribeInternetGatewaysOutput, error) DescribeInstanceTypesFunc func(ctx context.Context, params *ec2.DescribeInstanceTypesInput, optFns ...func(*ec2.Options)) (*ec2.DescribeInstanceTypesOutput, error) ReplaceRouteTableAssociationFunc func(ctx context.Context, params *ec2.ReplaceRouteTableAssociationInput, optFns ...func(*ec2.Options)) (*ec2.ReplaceRouteTableAssociationOutput, error) + + // Security Group Revoke operations + RevokeSecurityGroupIngressFunc func(ctx context.Context, params *ec2.RevokeSecurityGroupIngressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupIngressOutput, error) + RevokeSecurityGroupEgressFunc func(ctx context.Context, params *ec2.RevokeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error) } // VPC operations @@ -487,9 +491,15 @@ func (m *MockEC2Client) ModifySubnetAttribute(ctx context.Context, params *ec2.M // Security Group Revoke operations func (m *MockEC2Client) RevokeSecurityGroupIngress(ctx context.Context, params *ec2.RevokeSecurityGroupIngressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupIngressOutput, error) { + if m.RevokeSecurityGroupIngressFunc != nil { + return m.RevokeSecurityGroupIngressFunc(ctx, params, optFns...) + } return &ec2.RevokeSecurityGroupIngressOutput{}, nil } func (m *MockEC2Client) RevokeSecurityGroupEgress(ctx context.Context, params *ec2.RevokeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error) { + if m.RevokeSecurityGroupEgressFunc != nil { + return m.RevokeSecurityGroupEgressFunc(ctx, params, optFns...) + } return &ec2.RevokeSecurityGroupEgressOutput{}, nil }