From 5a4ba42d4bd2b7f7edace07e45e48d7de6e9cee5 Mon Sep 17 00:00:00 2001 From: Pierre Prinetti Date: Fri, 23 Jul 2021 10:29:24 +0200 Subject: [PATCH] Bug 1985015: Eliminate instanceCreate volume leak InstanceCreate is a method that creates a server. If the server is set to boot from volume, and an image ID is passed, the method creates the root volume prior to creating the server. The root volume is set to be destroyed when the associated server is destroyed. However, if the server fails to create (for example, because of quota issues), the volume is never associated to a server and the automatic deletion is never triggered. At every round of retry, a new volume will be created, possibly until volume quota is reached (or server creation is successful). This results in a leakage of unused volumes. With this change, a newly created root volume is explicitly deleted in the event of an early failure of InstanceCreate. That is to say: if InstanceCreate creates a volume, that volume will be deleted if any of the next steps fail and cause the server never to be created. Note that this patch leaves unmodified the lifespan of a volume associated to a server, regardless if the server ever reaches an ACTIVE state. Co-Authored-By: Matthew Booth --- pkg/cloud/openstack/clients/machineservice.go | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/pkg/cloud/openstack/clients/machineservice.go b/pkg/cloud/openstack/clients/machineservice.go index d0c05a78f2..d557f27d57 100644 --- a/pkg/cloud/openstack/clients/machineservice.go +++ b/pkg/cloud/openstack/clients/machineservice.go @@ -462,6 +462,17 @@ func GetSecurityGroups(is *InstanceService, sg_param []openstackconfigv1.Securit // If ServerGroupName is nonempty and no server group exists with that name, // then InstanceCreate creates a server group with that name. func (is *InstanceService) InstanceCreate(clusterName string, name string, clusterSpec *openstackconfigv1.OpenstackClusterProviderSpec, config *openstackconfigv1.OpenstackProviderSpec, cmd string, keyName string, configClient configclient.ConfigV1Interface) (instance *Instance, err error) { + // server is only non-nil in case of successful server creation. + // + // There are multiple preparation steps in this method, some of which + // create resources (e.g. a volume in case of boot-from-volume with + // "image" source), and all of which have a chance to fail. + // + // This variable is guaranteed to remain nil until server creation is + // successful. Deferred cleanup functions can restore the initial state + // in case of failure, by only acting if "server" is nil. + var server *servers.Server + if config == nil { return nil, fmt.Errorf("create Options need be specified to create instace") } @@ -732,15 +743,21 @@ func (is *InstanceService) InstanceCreate(clusterName string, name string, clust volumeID = volume.ID - err = volumes.WaitForStatus(is.volumeClient, volumeID, "available", 300) - if err != nil { - klog.Infof("Bootable volume %v creation failed. Removing...", volumeID) - err = volumes.Delete(is.volumeClient, volumeID, volumes.DeleteOpts{}).ExtractErr() - if err != nil { - return nil, fmt.Errorf("Bootable volume deletion err: %v", err) + defer func() { + // If the server is created in Nova, the lifetime of the associated volume will be + // bound to it. This deferred cleanup function tackles the case where a volume was created + // but never attached to a server. In that case, the volume must be removed manually. + if server == nil { + if err := volumes.Delete(is.volumeClient, volumeID, nil).ExtractErr(); err != nil { + klog.Infof("Failed to delete stale volume %q", volumeID) + } else { + klog.Infof("Deleted stale volume %q", volumeID) + } } + }() - return nil, fmt.Errorf("Bootable volume %v is not available err: %v", volumeID, err) + if err := volumes.WaitForStatus(is.volumeClient, volumeID, "available", 300); err != nil { + return nil, fmt.Errorf("bootable volume is not available: %v", err) } klog.Infof("Bootable volume %v was created successfully.", volumeID) @@ -815,11 +832,14 @@ func (is *InstanceService) InstanceCreate(clusterName string, name string, clust } } - server, err := servers.Create(is.computeClient, keypairs.CreateOptsExt{ + server, err = servers.Create(is.computeClient, keypairs.CreateOptsExt{ CreateOptsBuilder: serverCreateOpts, KeyName: keyName, }).Extract() if err != nil { + // Gophercloud does not guarantee "server" to be nil when "err" + // is not nil. Resetting to nil to trigger cleanup functions. + server = nil return nil, fmt.Errorf("Create new server err: %v", err) }