diff --git a/pkg/client/sdk.go b/pkg/client/sdk.go index c4694c30..2277fdd6 100644 --- a/pkg/client/sdk.go +++ b/pkg/client/sdk.go @@ -206,12 +206,7 @@ func (c *SdkStackitClient) CreateServer(ctx context.Context, projectID, region s } // Convert SDK server to our Server type - server := &Server{ - ID: sdkServer.GetId(), - Name: sdkServer.GetName(), - Status: sdkServer.GetStatus(), - Labels: convertLabelsFromSDK(sdkServer.Labels), - } + server := convertSDKServerToServer(sdkServer) return server, nil } @@ -228,12 +223,7 @@ func (c *SdkStackitClient) GetServer(ctx context.Context, projectID, region, ser } // Convert SDK server to our Server type - server := &Server{ - ID: sdkServer.GetId(), - Name: sdkServer.GetName(), - Status: sdkServer.GetStatus(), - Labels: convertLabelsFromSDK(sdkServer.Labels), - } + server := convertSDKServerToServer(sdkServer) return server, nil } @@ -282,13 +272,7 @@ func (c *SdkStackitClient) ListServers(ctx context.Context, projectID, region st if sdkResponse.Items != nil { for i := range *sdkResponse.Items { sdkServer := &(*sdkResponse.Items)[i] - - server := &Server{ - ID: sdkServer.GetId(), - Name: sdkServer.GetName(), - Status: sdkServer.GetStatus(), - Labels: convertLabelsFromSDK(sdkServer.Labels), - } + server := convertSDKServerToServer(sdkServer) servers = append(servers, server) } } @@ -358,6 +342,16 @@ func convertSDKNICtoNIC(nic *iaas.NIC) *NIC { } } +func convertSDKServerToServer(sdkServer *iaas.Server) *Server { + return &Server{ + ID: sdkServer.GetId(), + Name: sdkServer.GetName(), + Status: sdkServer.GetStatus(), + ErrorMessage: sdkServer.GetErrorMessage(), + Labels: convertLabelsFromSDK(sdkServer.Labels), + } +} + // isNotFoundError checks if an error is a 404 Not Found error from the SDK func isNotFoundError(err error) bool { if err == nil { diff --git a/pkg/client/stackit.go b/pkg/client/stackit.go index ad83be08..0d0fe053 100644 --- a/pkg/client/stackit.go +++ b/pkg/client/stackit.go @@ -81,10 +81,11 @@ type AgentRequest struct { // Server represents a STACKIT server response type Server struct { - ID string `json:"id"` - Name string `json:"name"` - Status string `json:"status"` - Labels map[string]string `json:"labels,omitempty"` + ID string `json:"id"` + Name string `json:"name"` + Status string `json:"status"` + ErrorMessage string `json:"errorMessage,omitempty"` + Labels map[string]string `json:"labels,omitempty"` } // NIC represents a STACKIT network interface diff --git a/pkg/provider/create.go b/pkg/provider/create.go index ba02db9b..0c13e5af 100644 --- a/pkg/provider/create.go +++ b/pkg/provider/create.go @@ -13,6 +13,7 @@ import ( "github.com/stackitcloud/machine-controller-manager-provider-stackit/pkg/client" api "github.com/stackitcloud/machine-controller-manager-provider-stackit/pkg/provider/apis" "github.com/stackitcloud/machine-controller-manager-provider-stackit/pkg/provider/apis/validation" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" ) @@ -72,10 +73,21 @@ func (p *Provider) CreateMachine(ctx context.Context, req *driver.CreateMachineR server, err = p.client.CreateServer(ctx, projectID, providerSpec.Region, p.createServerRequest(req, providerSpec)) if err != nil { klog.Errorf("Failed to create server for machine %q: %v", req.Machine.Name, err) + if isResourceExhaustedError(err) { + return nil, status.Error(codes.ResourceExhausted, fmt.Sprintf("failed to create server: %v", err)) + } return nil, status.Error(codes.Unavailable, fmt.Sprintf("failed to create server: %v", err)) } } + if err := p.WaitUntilServerRunning(ctx, projectID, providerSpec.Region, server.ID); err != nil { + klog.Errorf("Failed waiting for server %q to reach ACTIVE state: %v", req.Machine.Name, err) + if isResourceExhaustedError(err) { + return nil, status.Error(codes.ResourceExhausted, fmt.Sprintf("failed waiting for server to be ACTIVE: %v", err)) + } + return nil, status.Error(codes.DeadlineExceeded, fmt.Sprintf("failed waiting for server to be ACTIVE: %v", err)) + } + if err := p.patchNetworkInterface(ctx, projectID, server.ID, providerSpec); err != nil { klog.Errorf("Failed to patch network interface for server %q: %v", req.Machine.Name, err) return nil, status.Error(codes.Unavailable, fmt.Sprintf("failed to patch network interface for server: %v", err)) @@ -265,3 +277,22 @@ func (p *Provider) patchNetworkInterface(ctx context.Context, projectID, serverI return nil } + +func (p *Provider) WaitUntilServerRunning(ctx context.Context, projectID, region, serverID string) error { + return wait.PollUntilContextTimeout(ctx, p.pollingInterval, p.pollingTimeout, true, func(ctx context.Context) (bool, error) { + server, err := p.client.GetServer(ctx, projectID, region, serverID) + if err != nil { + return false, err + } + + switch server.Status { + case "ACTIVE": + klog.V(2).Infof("Server %q reached ACTIVE state", serverID) + return true, nil + case "ERROR": + return false, fmt.Errorf("server in ERROR state: %q", server.ErrorMessage) + } + + return false, nil + }) +} diff --git a/pkg/provider/create_basic_test.go b/pkg/provider/create_basic_test.go index 6fced3fe..ce1b86b5 100644 --- a/pkg/provider/create_basic_test.go +++ b/pkg/provider/create_basic_test.go @@ -3,6 +3,7 @@ package provider import ( "context" "fmt" + "time" "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" "github.com/gardener/machine-controller-manager/pkg/util/provider/driver" @@ -33,7 +34,9 @@ var _ = Describe("CreateMachine", func() { ctx = context.Background() mockClient = &mock.StackitClient{} provider = &Provider{ - client: mockClient, + client: mockClient, + pollingInterval: 10 * time.Millisecond, + pollingTimeout: 5 * time.Second, } secret = &corev1.Secret{ @@ -114,6 +117,41 @@ var _ = Describe("CreateMachine", func() { Expect(capturedReq.MachineType).To(Equal("c2i.2")) Expect(capturedReq.ImageID).To(Equal("12345678-1234-1234-1234-123456789abc")) }) + + It("should poll GetServer until server is ACTIVE", func() { + getServerCallCount := 0 + + mockClient.CreateServerFunc = func(_ context.Context, _, _ string, req *client.CreateServerRequest) (*client.Server, error) { + return &client.Server{ + ID: "550e8400-e29b-41d4-a716-446655440000", + Name: req.Name, + Status: "CREATING", + }, nil + } + mockClient.GetServerFunc = func(_ context.Context, _, _, _ string) (*client.Server, error) { + getServerCallCount++ + // First call returns CREATING, second call returns ACTIVE + if getServerCallCount == 1 { + return &client.Server{ + ID: "550e8400-e29b-41d4-a716-446655440000", + Name: "test-machine", + Status: "CREATING", + }, nil + } + return &client.Server{ + ID: "550e8400-e29b-41d4-a716-446655440000", + Name: "test-machine", + Status: "ACTIVE", + }, nil + } + + resp, err := provider.CreateMachine(ctx, req) + + Expect(err).NotTo(HaveOccurred()) + Expect(resp).NotTo(BeNil()) + Expect(resp.ProviderID).To(Equal("stackit://11111111-2222-3333-4444-555555555555/550e8400-e29b-41d4-a716-446655440000")) + Expect(getServerCallCount).To(BeNumerically(">=", 2)) + }) }) Context("with invalid ProviderSpec", func() { @@ -185,5 +223,31 @@ var _ = Describe("CreateMachine", func() { Expect(ok).To(BeTrue()) Expect(statusErr.Code()).To(Equal(codes.Unavailable)) }) + + It("should return ResourceExhausted when server enters ERROR state with 'no valid host'", func() { + mockClient.CreateServerFunc = func(_ context.Context, _, _ string, req *client.CreateServerRequest) (*client.Server, error) { + return &client.Server{ + ID: "550e8400-e29b-41d4-a716-446655440000", + Name: req.Name, + Status: "CREATING", + }, nil + } + mockClient.GetServerFunc = func(_ context.Context, _, _, _ string) (*client.Server, error) { + return &client.Server{ + ID: "550e8400-e29b-41d4-a716-446655440000", + Name: "test-machine", + Status: "ERROR", + ErrorMessage: "No valid host was found. There are not enough hosts available.", + }, nil + } + + _, err := provider.CreateMachine(ctx, req) + + Expect(err).To(HaveOccurred()) + statusErr, ok := status.FromError(err) + Expect(ok).To(BeTrue()) + Expect(statusErr.Code()).To(Equal(codes.ResourceExhausted)) + Expect(err.Error()).To(ContainSubstring("No valid host")) + }) }) }) diff --git a/pkg/provider/delete.go b/pkg/provider/delete.go index 90f78cc5..ff84e93c 100644 --- a/pkg/provider/delete.go +++ b/pkg/provider/delete.go @@ -10,6 +10,7 @@ import ( "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes" "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/status" "github.com/stackitcloud/machine-controller-manager-provider-stackit/pkg/client" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" ) @@ -31,7 +32,7 @@ func (p *Provider) DeleteMachine(ctx context.Context, req *driver.DeleteMachineR // Initialize client on first use (lazy initialization) if err := p.ensureClient(serviceAccountKey); err != nil { - return nil, status.Error(codes.Internal, fmt.Sprintf("failed to initialize STACKIT client: %v", err)) + return nil, status.Error(codes.Unauthenticated, fmt.Sprintf("failed to initialize STACKIT client: %v", err)) } var projectID, serverID string @@ -87,7 +88,25 @@ func (p *Provider) DeleteMachine(ctx context.Context, req *driver.DeleteMachineR return nil, status.Error(codes.Internal, fmt.Sprintf("failed to delete server: %v", err)) } - klog.V(2).Infof("Successfully deleted server %q for machine %q", serverID, req.Machine.Name) + if err := p.WaitUntilServerDeleted(ctx, projectID, providerSpec.Region, serverID); err != nil { + klog.Errorf("Failed waiting for server %q to be deleted for machine %q: %v", serverID, req.Machine.Name, err) + return nil, status.Error(codes.DeadlineExceeded, fmt.Sprintf("failed waiting for server to be deleted: %v", err)) + } return &driver.DeleteMachineResponse{}, nil } + +func (p *Provider) WaitUntilServerDeleted(ctx context.Context, projectID, region, serverID string) error { + return wait.PollUntilContextTimeout(ctx, p.pollingInterval, p.pollingTimeout, true, func(ctx context.Context) (bool, error) { + _, err := p.client.GetServer(ctx, projectID, region, serverID) + if err != nil { + // Server is deleted if we get a not found error + if errors.Is(err, client.ErrServerNotFound) { + klog.V(2).Infof("Server %q has been deleted", serverID) + return true, nil + } + } + + return false, err + }) +} diff --git a/pkg/provider/delete_test.go b/pkg/provider/delete_test.go index 746f76fb..a30dae1a 100644 --- a/pkg/provider/delete_test.go +++ b/pkg/provider/delete_test.go @@ -3,6 +3,7 @@ package provider import ( "context" "fmt" + "time" "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" "github.com/gardener/machine-controller-manager/pkg/util/provider/driver" @@ -33,7 +34,9 @@ var _ = Describe("DeleteMachine", func() { ctx = context.Background() mockClient = &mock.StackitClient{} provider = &Provider{ - client: mockClient, + client: mockClient, + pollingInterval: 10 * time.Millisecond, + pollingTimeout: 5 * time.Second, } // Create secret with projectId @@ -87,6 +90,9 @@ var _ = Describe("DeleteMachine", func() { mockClient.DeleteServerFunc = func(_ context.Context, _, _, _ string) error { return nil } + mockClient.GetServerFunc = func(_ context.Context, _, _, _ string) (*client.Server, error) { + return nil, fmt.Errorf("%w: status 404", client.ErrServerNotFound) + } resp, err := provider.DeleteMachine(ctx, req) @@ -103,6 +109,9 @@ var _ = Describe("DeleteMachine", func() { capturedServerID = serverID return nil } + mockClient.GetServerFunc = func(_ context.Context, _, _, _ string) (*client.Server, error) { + return nil, fmt.Errorf("%w: status 404", client.ErrServerNotFound) + } _, err := provider.DeleteMachine(ctx, req) @@ -110,12 +119,48 @@ var _ = Describe("DeleteMachine", func() { Expect(capturedProjectID).To(Equal("11111111-2222-3333-4444-555555555555")) Expect(capturedServerID).To(Equal("550e8400-e29b-41d4-a716-446655440000")) }) + + It("should poll GetServer until server is deleted", func() { + getServerCallCount := 0 + + mockClient.DeleteServerFunc = func(_ context.Context, _, _, _ string) error { + return nil + } + mockClient.GetServerFunc = func(_ context.Context, _, _, _ string) (*client.Server, error) { + getServerCallCount++ + // First call returns server still exists, second call returns not found + if getServerCallCount == 1 { + return &client.Server{ + ID: "550e8400-e29b-41d4-a716-446655440000", + Name: "test-machine", + Status: "SHUTTING_DOWN", + }, nil + } + return nil, fmt.Errorf("%w: status 404", client.ErrServerNotFound) + } + + resp, err := provider.DeleteMachine(ctx, req) + + Expect(err).NotTo(HaveOccurred()) + Expect(resp).NotTo(BeNil()) + Expect(getServerCallCount).To(BeNumerically(">=", 2)) + }) }) Context("with missing or invalid ProviderID", func() { It("should still delete the machine when ProviderID is missing", func() { machine.Spec.ProviderID = "" + mockClient.GetServerFunc = func(_ context.Context, _, _, _ string) (*client.Server, error) { + return &client.Server{ + ID: "550e8400-e29b-41d4-a716-446655440000", + Name: "test-machine", + }, nil + } + mockClient.DeleteServerFunc = func(_ context.Context, _, _, _ string) error { + return nil + } + _, err := provider.DeleteMachine(ctx, req) Expect(err).ToNot(HaveOccurred()) diff --git a/pkg/provider/helpers.go b/pkg/provider/helpers.go index 37c31e4f..dcbb2c06 100644 --- a/pkg/provider/helpers.go +++ b/pkg/provider/helpers.go @@ -58,3 +58,8 @@ func extractSecretCredentials(secretData map[string][]byte) (projectID, serviceA serviceAccountKey = string(secretData[validation.StackitServiceAccountKey]) return projectID, serviceAccountKey } + +func isResourceExhaustedError(err error) bool { + errMsg := strings.ToLower(err.Error()) + return strings.Contains(errMsg, "no valid host") || strings.Contains(errMsg, "quota exceeded") +} diff --git a/pkg/provider/provider.go b/pkg/provider/provider.go index 992d9496..684523c3 100644 --- a/pkg/provider/provider.go +++ b/pkg/provider/provider.go @@ -3,6 +3,7 @@ package provider import ( "fmt" "sync" + "time" "github.com/gardener/machine-controller-manager/pkg/util/provider/driver" client2 "github.com/stackitcloud/machine-controller-manager-provider-stackit/pkg/client" @@ -24,12 +25,17 @@ type Provider struct { clientOnce sync.Once // Ensures client is initialized exactly once clientErr error // Stores initialization error if any capturedCredentials string // Service account key used for initialization (for defensive checks) + // intervals need to be configurable to speed up tests + pollingInterval time.Duration // Interval between polling attempts + pollingTimeout time.Duration // Maximum time to wait during polling } // NewProvider returns an empty provider object func NewProvider(i spi.SessionProviderInterface) driver.Driver { return &Provider{ - SPI: i, + SPI: i, + pollingInterval: 5 * time.Second, + pollingTimeout: 10 * time.Minute, } }