From 507b4746329d8812d2391a252ae46647a4d3f9e4 Mon Sep 17 00:00:00 2001
From: "W. Trevor King"
Date: Sun, 3 Apr 2022 22:52:58 -0700
Subject: [PATCH] pkg/cvo/updatepayload: Prune previous payload downloads

Avoid [1]:

  mv: inter-device move failed: '/manifests' to '/etc/cvo/updatepayloads/HbO7IDc7tyIg9utw3sd_tg/manifests/manifests'; unable to remove target: Directory not empty

by pruning all scratch directories before downloading a new payload.
Also shift the job pruning up here, so it's all collected together.
For jobs, pending and active jobs are now added to the deletion queue
too, and we pivot to not retaining any old jobs.  Either the older
jobs were for other targets, in which case we no longer care about
them, or they are looking at the same directory we're about to target
with the job we're about to launch, in which case they will likely
step on each other's toes.  If we want to return to keeping more jobs
around, we'd need to pivot to target directory names that are more
unique than just "pullspec hash", e.g. by including a job-launch
timestamp.

The directory pruning keeps subsequent download jobs from getting
stuck behind a corrupted download attempt (e.g. if something
terminates a download job part way through its 'mv' call, subsequent
download jobs fail with the "Directory not empty" error quoted
above).  Reproducing this issue locally, with /tmp on a different
filesystem:

  $ mkdir -p a /tmp/a/a
  $ /bin/mv a /tmp
  /bin/mv: inter-device move failed: 'a' to '/tmp/a'; unable to remove target: Directory not empty
  $ /bin/mv --version
  mv (GNU coreutils) 8.31
  ...

The directory pruning will also avoid leaking previous sets of
manifests.  Before this commit, the /etc/cvo/updatepayloads
directories on the control plane nodes would gradually accumulate
more and more release manifest sets as new targets were downloaded
for inspection.  The manifest sets aren't that big:

  $ oc adm release extract --to manifests quay.io/openshift-release-dev/ocp-release:4.10.6-x86_64
  Extracted release payload from digest sha256:88b394e633e09dc23aa1f1a61ededd8e52478edf34b51a7dbbb21d9abde2511a created at 2022-03-23T07:34:52Z
  $ du -hs manifests
  6.5M	manifests

But still, better to have the pruning than leak forever.

[1]: https://bugzilla.redhat.com/show_bug.cgi?id=2070805#c0
---
 pkg/cvo/updatepayload.go | 41 ++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/pkg/cvo/updatepayload.go b/pkg/cvo/updatepayload.go
index 52c69925c1..90fa9ce3f9 100644
--- a/pkg/cvo/updatepayload.go
+++ b/pkg/cvo/updatepayload.go
@@ -135,14 +135,28 @@ func (r *payloadRetriever) targetUpdatePayloadDir(ctx context.Context, update co
 	hash := md5.New()
 	hash.Write([]byte(update.Image))
 	payloadHash := base64.RawURLEncoding.EncodeToString(hash.Sum(nil))
-	tdir := filepath.Join(r.workingDir, payloadHash)
-	err := payload.ValidateDirectory(tdir)
-	if os.IsNotExist(err) {
-		// the dirs don't exist, try fetching the payload to tdir.
-		err = r.fetchUpdatePayloadToDir(ctx, tdir, update)
+	tdir := filepath.Join(r.workingDir, payloadHash)
+	// Prune older jobs and directories while gracefully handling errors.
+ if err := r.pruneJobs(ctx, 0); err != nil { + klog.Warningf("failed to prune jobs: %v", err) } - if err != nil { + if files, err := os.ReadDir(r.workingDir); err != nil { + klog.Warningf("failed to list update payload directory: %v", err) + } else { + for _, file := range files { + if err := os.RemoveAll(filepath.Join(r.workingDir, file.Name())); err != nil { + klog.Warningf("failed to prune update payload directory: %v", err) + } + } + } + + if err := payload.ValidateDirectory(tdir); os.IsNotExist(err) { + // the dirs don't exist, try fetching the payload to tdir. + if err := r.fetchUpdatePayloadToDir(ctx, tdir, update); err != nil { + return "", err + } + } else if err != nil { return "", err } @@ -217,14 +231,7 @@ func (r *payloadRetriever) fetchUpdatePayloadToDir(ctx context.Context, dir stri }, } - // Prune older jobs while gracefully handling errors. - err := r.pruneJobs(ctx, 2) - if err != nil { - klog.Warningf("failed to prune jobs: %v", err) - } - - _, err = r.kubeClient.BatchV1().Jobs(job.Namespace).Create(ctx, job, metav1.CreateOptions{}) - if err != nil { + if _, err := r.kubeClient.BatchV1().Jobs(job.Namespace).Create(ctx, job, metav1.CreateOptions{}); err != nil { return err } return resourcebuilder.WaitForJobCompletion(ctx, r.kubeClient.BatchV1(), job) @@ -248,12 +255,6 @@ func (r *payloadRetriever) pruneJobs(ctx context.Context, retain int) error { // Ignore jobs not beginning with operatorName case !strings.HasPrefix(job.Name, r.operatorName+"-"): break - // Ignore jobs that have not yet started - case job.Status.StartTime == nil: - break - // Ignore jobs that are still active - case job.Status.Active == 1: - break default: deleteJobs = append(deleteJobs, job) }
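A note for anyone tracing the 'HbO7IDc7tyIg9utw3sd_tg' name in the mv error
back to the code: the scratch-directory name is the md5 of the target
pullspec, raw-URL base64 encoded, as in the context lines of the first hunk
above.  Here is a minimal standalone sketch of that derivation; the pullspec
is the 4.10.6 example from the commit message, and the printed value is
illustrative rather than the exact directory name from the bug report.

package main

import (
	"crypto/md5"
	"encoding/base64"
	"fmt"
)

func main() {
	// Same derivation as targetUpdatePayloadDir: md5 of the release image
	// pullspec, raw-URL base64 encoded, yields a 22-character scratch
	// directory name of the same shape as HbO7IDc7tyIg9utw3sd_tg above.
	image := "quay.io/openshift-release-dev/ocp-release:4.10.6-x86_64"
	hash := md5.New()
	hash.Write([]byte(image))
	fmt.Println(base64.RawURLEncoding.EncodeToString(hash.Sum(nil)))
}

Because the name depends only on the pullspec, a retried download of the same
target always reuses the same directory, which is why a partially completed
earlier 'mv' leaves a non-empty target behind, and why this commit prunes the
scratch directories (or, alternatively, why directory names would need
something more unique, like a job-launch timestamp).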