From a5af89d34e7d032722bc02147172cf0a7e7a9c32 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Mon, 18 Apr 2022 12:56:47 -0700 Subject: [PATCH 1/2] pkg/cvo/updatepayload: Shift previous-download removal into the job I'd attempted to add this cleanup on the CVO side in 507b474632 (pkg/cvo/updatepayload: Prune previous payload downloads, 2022-04-03, #760). But Evgeni pointed out that that doesn't work because the CVO's volume mount is 'readOnly: true': 2022-04-11T20:42:32.056428784Z W0411 20:42:32.056387 1 updatepayload.go:149] failed to prune update payload directory: unlinkat /etc/cvo/updatepayloads/brRTeZnZSIkZ2J2YMdQozg: read-only file system This commit shifts removal into the job. To avoid issues with loading half-populated directories, I've also adjusted the job to populate {baseDir}/{targetName}-{randomSuffix}, and then rename that populated result to {baseDir}/{targetName}. That way, future targetUpdatePayloadDir calls using the ValidateDirectory precheck will successfully distinguish "previous job got everything for this release" (where we don't want a new download job) from "previous job got some bits over, but not everything" (where we do want a new download job). POSIX 2018 requires mv to use rename [1]. While rename discusses atomicity in the informative Rationale section, there's nothing in the normative description that mentions atomicity [2]. Still, "atomic renames" are a common dance on Linux, especially when renaming within a directory, where it is a single directory table being edited. [3] talks through the pattern, and points out the need for a pre-rename fsync or similar if you want to avoid: 1. Open a/file, write to it, and close the file descriptor. 2. Kernel holding file contents in memory, not yet on disk. 3. Atomically move a/ to b/. 4. Crash, losing any kernel-memory caches. 5. b/file exists, but it may be empty or partial. I think the risk of kernel-crashes leaving us with currupted manifests is small enough that we can ignore it, to save the cost of calling fsync. [1]: https://pubs.opengroup.org/onlinepubs/9699919799/utilities/mv.html [2]: https://pubs.opengroup.org/onlinepubs/9699919799/functions/rename.html [3]: https://lwn.net/Articles/789600/ --- pkg/cvo/updatepayload.go | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/pkg/cvo/updatepayload.go b/pkg/cvo/updatepayload.go index 90fa9ce3f9..0a48463192 100644 --- a/pkg/cvo/updatepayload.go +++ b/pkg/cvo/updatepayload.go @@ -141,15 +141,6 @@ func (r *payloadRetriever) targetUpdatePayloadDir(ctx context.Context, update co if err := r.pruneJobs(ctx, 0); err != nil { klog.Warningf("failed to prune jobs: %v", err) } - if files, err := os.ReadDir(r.workingDir); err != nil { - klog.Warningf("failed to list update payload directory: %v", err) - } else { - for _, file := range files { - if err := os.RemoveAll(filepath.Join(r.workingDir, file.Name())); err != nil { - klog.Warningf("failed to prune update payload directory: %v", err) - } - } - } if err := payload.ValidateDirectory(tdir); os.IsNotExist(err) { // the dirs don't exist, try fetching the payload to tdir. @@ -290,20 +281,24 @@ func (r *payloadRetriever) pruneJobs(ctx context.Context, retain int) error { // copyPayloadCmd returns a shell command that copies CVO and release manifests from the default location // to the target dir. -// It is made up of 2 commands: -// `mkdir -p && mv ` -// `mkdir -p && mv ` func copyPayloadCmd(tdir string) string { - var ( - fromCVOPath = filepath.Join(payload.DefaultPayloadDir, payload.CVOManifestDir) - toCVOPath = filepath.Join(tdir, payload.CVOManifestDir) - cvoCmd = fmt.Sprintf("mkdir -p %s && mv %s %s", tdir, fromCVOPath, toCVOPath) + baseDir, targetName := filepath.Split(tdir) + tmpDir := filepath.Join(baseDir, fmt.Sprintf("%s-%s", targetName, randutil.String(5))) - fromReleasePath = filepath.Join(payload.DefaultPayloadDir, payload.ReleaseManifestDir) - toReleasePath = filepath.Join(tdir, payload.ReleaseManifestDir) - releaseCmd = fmt.Sprintf("mkdir -p %s && mv %s %s", tdir, fromReleasePath, toReleasePath) - ) - return fmt.Sprintf("%s && %s", cvoCmd, releaseCmd) + cleanupCmd := fmt.Sprintf("rm -fR %s", filepath.Join(baseDir, "*")) + + tmpDirCmd := fmt.Sprintf("mkdir %s", tmpDir) + + fromCVOPath := filepath.Join(payload.DefaultPayloadDir, payload.CVOManifestDir) + toCVOPath := filepath.Join(tmpDir, payload.CVOManifestDir) + cvoCmd := fmt.Sprintf("mv %s %s", fromCVOPath, toCVOPath) + + fromReleasePath := filepath.Join(payload.DefaultPayloadDir, payload.ReleaseManifestDir) + toReleasePath := filepath.Join(tmpDir, payload.ReleaseManifestDir) + releaseCmd := fmt.Sprintf("mv %s %s", fromReleasePath, toReleasePath) + + moveInPlaceCmd := fmt.Sprintf("mv %s %s", tmpDir, tdir) + return strings.Join([]string{cleanupCmd, tmpDirCmd, cvoCmd, releaseCmd, moveInPlaceCmd}, " && ") } // findUpdateFromConfig identifies a desired update from user input or returns false. It will From c45a98130202152762323bf145ffc9c3297d5bd4 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Wed, 20 Apr 2022 21:05:47 -0700 Subject: [PATCH 2/2] pkg/cvo/updatepayload: Use initContainers instead of shell &&-chains All of the input strings were internally controlled, so there wasn't a risk of them containing a shell-sensitive character. But that safety was not immediately obvious when reading the code. By converting to initContainers, which run sequentially to completion before the next container is run [1], we have the same effect without involving the shell. As a side benefit, we also get clearer status logging showing exactly which steps have succeeded or failed. [1]: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/#understanding-init-containers --- pkg/cvo/updatepayload.go | 98 ++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 44 deletions(-) diff --git a/pkg/cvo/updatepayload.go b/pkg/cvo/updatepayload.go index 0a48463192..46bfb0bb84 100644 --- a/pkg/cvo/updatepayload.go +++ b/pkg/cvo/updatepayload.go @@ -161,16 +161,36 @@ func (r *payloadRetriever) targetUpdatePayloadDir(ctx context.Context, update co func (r *payloadRetriever) fetchUpdatePayloadToDir(ctx context.Context, dir string, update configv1.Update) error { var ( version = update.Version - payload = update.Image + image = update.Image name = fmt.Sprintf("%s-%s-%s", r.operatorName, version, randutil.String(5)) namespace = r.namespace deadline = pointer.Int64Ptr(2 * 60) nodeSelectorKey = "node-role.kubernetes.io/master" nodename = r.nodeName - cmd = []string{"/bin/sh"} - args = []string{"-c", copyPayloadCmd(dir)} ) + baseDir, targetName := filepath.Split(dir) + tmpDir := filepath.Join(baseDir, fmt.Sprintf("%s-%s", targetName, randutil.String(5))) + + setContainerDefaults := func(container corev1.Container) corev1.Container { + container.Image = image + container.VolumeMounts = []corev1.VolumeMount{{ + MountPath: targetUpdatePayloadsDir, + Name: "payloads", + }} + container.SecurityContext = &corev1.SecurityContext{ + Privileged: pointer.BoolPtr(true), + } + container.Resources = corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("10m"), + corev1.ResourceMemory: resource.MustParse("50Mi"), + corev1.ResourceEphemeralStorage: resource.MustParse("2Mi"), + }, + } + return container + } + job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: name, @@ -180,26 +200,38 @@ func (r *payloadRetriever) fetchUpdatePayloadToDir(ctx context.Context, dir stri ActiveDeadlineSeconds: deadline, Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ - Containers: []corev1.Container{{ - Name: "payload", - Image: payload, - Command: cmd, - Args: args, - VolumeMounts: []corev1.VolumeMount{{ - MountPath: targetUpdatePayloadsDir, - Name: "payloads", - }}, - SecurityContext: &corev1.SecurityContext{ - Privileged: pointer.BoolPtr(true), - }, - Resources: corev1.ResourceRequirements{ - Requests: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("10m"), - corev1.ResourceMemory: resource.MustParse("50Mi"), - corev1.ResourceEphemeralStorage: resource.MustParse("2Mi"), + InitContainers: []corev1.Container{ + setContainerDefaults(corev1.Container{ + Name: "cleanup", + Command: []string{"rm", "-fR", filepath.Join(baseDir, "*")}, + }), + setContainerDefaults(corev1.Container{ + Name: "make-temporary-directory", + Command: []string{"mkdir", tmpDir}, + }), + setContainerDefaults(corev1.Container{ + Name: "move-operator-manifests-to-temporary-directory", + Command: []string{ + "mv", + filepath.Join(payload.DefaultPayloadDir, payload.CVOManifestDir), + filepath.Join(tmpDir, payload.CVOManifestDir), }, - }, - }}, + }), + setContainerDefaults(corev1.Container{ + Name: "move-release-manifests-to-temporary-directory", + Command: []string{ + "mv", + filepath.Join(payload.DefaultPayloadDir, payload.ReleaseManifestDir), + filepath.Join(tmpDir, payload.ReleaseManifestDir), + }, + }), + }, + Containers: []corev1.Container{ + setContainerDefaults(corev1.Container{ + Name: "rename-to-final-location", + Command: []string{"mv", tmpDir, dir}, + }), + }, Volumes: []corev1.Volume{{ Name: "payloads", VolumeSource: corev1.VolumeSource{ @@ -279,28 +311,6 @@ func (r *payloadRetriever) pruneJobs(ctx context.Context, retain int) error { return nil } -// copyPayloadCmd returns a shell command that copies CVO and release manifests from the default location -// to the target dir. -func copyPayloadCmd(tdir string) string { - baseDir, targetName := filepath.Split(tdir) - tmpDir := filepath.Join(baseDir, fmt.Sprintf("%s-%s", targetName, randutil.String(5))) - - cleanupCmd := fmt.Sprintf("rm -fR %s", filepath.Join(baseDir, "*")) - - tmpDirCmd := fmt.Sprintf("mkdir %s", tmpDir) - - fromCVOPath := filepath.Join(payload.DefaultPayloadDir, payload.CVOManifestDir) - toCVOPath := filepath.Join(tmpDir, payload.CVOManifestDir) - cvoCmd := fmt.Sprintf("mv %s %s", fromCVOPath, toCVOPath) - - fromReleasePath := filepath.Join(payload.DefaultPayloadDir, payload.ReleaseManifestDir) - toReleasePath := filepath.Join(tmpDir, payload.ReleaseManifestDir) - releaseCmd := fmt.Sprintf("mv %s %s", fromReleasePath, toReleasePath) - - moveInPlaceCmd := fmt.Sprintf("mv %s %s", tmpDir, tdir) - return strings.Join([]string{cleanupCmd, tmpDirCmd, cvoCmd, releaseCmd, moveInPlaceCmd}, " && ") -} - // findUpdateFromConfig identifies a desired update from user input or returns false. It will // resolve payload if the user specifies a version and a matching available update. func findUpdateFromConfig(config *configv1.ClusterVersion) (configv1.Update, bool) {