From 2c38d4bce8c0d69a9be242e14b3707ada66daf29 Mon Sep 17 00:00:00 2001 From: petr-muller Date: Mon, 7 Feb 2022 20:00:39 +0100 Subject: [PATCH 1/3] ci-operator: retry infra-failed builds immediately `ci-operator` was already able to recognize infrastructure-failed builds from previous runs and retry them. This is an attempt to reuse that code to retry such failed builds immediately, with two attempts in an exponential backoff. The backoff has an intentionally long starting delay of 1 minute to give the infrastructure problem a chance to go away. The way the code is structured makes it less optimal for the case where we are retrying infra failures from the previous executions: it will eat one of the backoff iterations, but such cases should be rare because ci-op runs should not result in failures caused by infrastructure failures anymore (because they are retried immediately). --- pkg/steps/source.go | 75 ++++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 32 deletions(-) diff --git a/pkg/steps/source.go b/pkg/steps/source.go index b57d072df2e..2ae0e3ec815 100644 --- a/pkg/steps/source.go +++ b/pkg/steps/source.go @@ -369,46 +369,57 @@ func isBuildPhaseTerminated(phase buildapi.BuildPhase) bool { } func handleBuild(ctx context.Context, buildClient BuildClient, build *buildapi.Build) error { - if err := buildClient.Create(ctx, build); err != nil { - if !kerrors.IsAlreadyExists(err) { - return fmt.Errorf("could not create build %s: %w", build.Name, err) + var buildErr error + attempts := 5 + if boErr := wait.ExponentialBackoff(wait.Backoff{Duration: time.Minute, Factor: 1.5, Steps: attempts}, func() (bool, error) { + if err := buildClient.Create(ctx, build); err != nil && !kerrors.IsAlreadyExists(err) { + return false, fmt.Errorf("could not create build %s: %w", build.Name, err) } + + buildErr = waitForBuildOrTimeout(ctx, buildClient, build.Namespace, build.Name) + if buildErr == nil { + if err := gatherSuccessfulBuildLog(buildClient, build.Namespace, build.Name); err != nil { + // log error but do not fail successful build + logrus.WithError(err).Warnf("Failed gathering successful build %s logs into artifacts.", build.Name) + } + return true, nil + } + b := &buildapi.Build{} if err := buildClient.Get(ctx, ctrlruntimeclient.ObjectKey{Namespace: build.Namespace, Name: build.Name}, b); err != nil { - return fmt.Errorf("could not get build %s: %w", build.Name, err) + return false, fmt.Errorf("could not get build %s: %w", build.Name, err) } - if isBuildPhaseTerminated(b.Status.Phase) && - (isInfraReason(b.Status.Reason) || hintsAtInfraReason(b.Status.LogSnippet)) { - logrus.Infof("Build %s previously failed from an infrastructure error (%s), retrying...", b.Name, b.Status.Reason) - zero := int64(0) - foreground := metav1.DeletePropagationForeground - opts := metav1.DeleteOptions{ - GracePeriodSeconds: &zero, - Preconditions: &metav1.Preconditions{UID: &b.UID}, - PropagationPolicy: &foreground, - } - if err := buildClient.Delete(ctx, build, &ctrlruntimeclient.DeleteOptions{Raw: &opts}); err != nil && !kerrors.IsNotFound(err) && !kerrors.IsConflict(err) { - return fmt.Errorf("could not delete build %s: %w", build.Name, err) - } - if err := waitForBuildDeletion(ctx, buildClient, build.Namespace, build.Name); err != nil { - return fmt.Errorf("could not wait for build %s to be deleted: %w", build.Name, err) - } - if err := buildClient.Create(ctx, build); err != nil && !kerrors.IsAlreadyExists(err) { - return fmt.Errorf("could not recreate build %s: %w", build.Name, err) - } + if !isBuildPhaseTerminated(b.Status.Phase) { + return false, buildErr } - } - err := waitForBuildOrTimeout(ctx, buildClient, build.Namespace, build.Name) - if err == nil { - if err := gatherSuccessfulBuildLog(buildClient, build.Namespace, build.Name); err != nil { - // log error but do not fail successful build - logrus.WithError(err).Warnf("Failed gathering successful build %s logs into artifacts.", build.Name) + + if !(isInfraReason(b.Status.Reason) || hintsAtInfraReason(b.Status.LogSnippet)) { + return false, buildErr } - } - // this will still be the err from waitForBuild - return err + logrus.Infof("Build %s previously failed from an infrastructure error (%s), retrying...", b.Name, b.Status.Reason) + zero := int64(0) + foreground := metav1.DeletePropagationForeground + opts := metav1.DeleteOptions{ + GracePeriodSeconds: &zero, + Preconditions: &metav1.Preconditions{UID: &b.UID}, + PropagationPolicy: &foreground, + } + if err := buildClient.Delete(ctx, build, &ctrlruntimeclient.DeleteOptions{Raw: &opts}); err != nil && !kerrors.IsNotFound(err) && !kerrors.IsConflict(err) { + return false, fmt.Errorf("could not delete build %s: %w", build.Name, err) + } + if err := waitForBuildDeletion(ctx, buildClient, build.Namespace, build.Name); err != nil { + return false, fmt.Errorf("could not wait for build %s to be deleted: %w", build.Name, err) + } + return false, nil + }); boErr != nil { + if boErr == wait.ErrWaitTimeout { + return fmt.Errorf("build not successful after %d attempts, last error: %w", attempts, buildErr) + } + return boErr + } + return nil } func waitForBuildDeletion(ctx context.Context, client ctrlruntimeclient.Client, ns, name string) error { From 85b816f2415517b5557960959cca4e2818a2d665 Mon Sep 17 00:00:00 2001 From: petr-muller Date: Tue, 8 Feb 2022 18:25:57 +0100 Subject: [PATCH 2/3] ci-operator: record aggregate error instead last one --- pkg/steps/source.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pkg/steps/source.go b/pkg/steps/source.go index 2ae0e3ec815..6db8abbe4d8 100644 --- a/pkg/steps/source.go +++ b/pkg/steps/source.go @@ -16,6 +16,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/wait" prowv1 "k8s.io/test-infra/prow/apis/prowjobs/v1" "k8s.io/test-infra/prow/clonerefs" @@ -369,14 +370,14 @@ func isBuildPhaseTerminated(phase buildapi.BuildPhase) bool { } func handleBuild(ctx context.Context, buildClient BuildClient, build *buildapi.Build) error { - var buildErr error + var buildErrs []error attempts := 5 if boErr := wait.ExponentialBackoff(wait.Backoff{Duration: time.Minute, Factor: 1.5, Steps: attempts}, func() (bool, error) { if err := buildClient.Create(ctx, build); err != nil && !kerrors.IsAlreadyExists(err) { return false, fmt.Errorf("could not create build %s: %w", build.Name, err) } - buildErr = waitForBuildOrTimeout(ctx, buildClient, build.Namespace, build.Name) + buildErr := waitForBuildOrTimeout(ctx, buildClient, build.Namespace, build.Name) if buildErr == nil { if err := gatherSuccessfulBuildLog(buildClient, build.Namespace, build.Name); err != nil { // log error but do not fail successful build @@ -384,6 +385,7 @@ func handleBuild(ctx context.Context, buildClient BuildClient, build *buildapi.B } return true, nil } + buildErrs = append(buildErrs, buildErr) b := &buildapi.Build{} if err := buildClient.Get(ctx, ctrlruntimeclient.ObjectKey{Namespace: build.Namespace, Name: build.Name}, b); err != nil { @@ -415,7 +417,7 @@ func handleBuild(ctx context.Context, buildClient BuildClient, build *buildapi.B return false, nil }); boErr != nil { if boErr == wait.ErrWaitTimeout { - return fmt.Errorf("build not successful after %d attempts, last error: %w", attempts, buildErr) + return fmt.Errorf("build not successful after %d attempts: %w", attempts, errors.NewAggregate(buildErrs)) } return boErr } From b85f52523ec1dadd5c589fd8a9078f8d1762639c Mon Sep 17 00:00:00 2001 From: petr-muller Date: Wed, 9 Feb 2022 14:27:49 +0100 Subject: [PATCH 3/3] ci-operator: fix infra-failed build retries `Create` populates the `Build`s metadata with resource version, UID etc of the created object, so the same object cannot be subsequently used in a subsequent `Create` when we need to retry. Make local deep copies of the input (desired) `Build` and create them instead of the original. The `Build` is passed into the method as a pointer, which made me wonder whether some callsite actually depends on the side effect on the object, but it does not. To prevent future confusion like that, I changed `handleBuild` to accept an instance copy, not a pointer. If something in the future needs the resulting created `Build` object, the method should return it as a return value. --- pkg/steps/bundle_source.go | 2 +- pkg/steps/git_source.go | 2 +- pkg/steps/index_generator.go | 2 +- pkg/steps/pipeline_image_cache.go | 2 +- pkg/steps/project_image.go | 2 +- pkg/steps/rpm_injection.go | 2 +- pkg/steps/source.go | 28 +++++++++++++++------------- 7 files changed, 21 insertions(+), 19 deletions(-) diff --git a/pkg/steps/bundle_source.go b/pkg/steps/bundle_source.go index f396f55e0d4..9629a1d7029 100644 --- a/pkg/steps/bundle_source.go +++ b/pkg/steps/bundle_source.go @@ -75,7 +75,7 @@ func (s *bundleSourceStep) run(ctx context.Context) error { s.pullSecret, nil, ) - return handleBuild(ctx, s.client, build) + return handleBuild(ctx, s.client, *build) } func replaceCommand(pullSpec, with string) string { diff --git a/pkg/steps/git_source.go b/pkg/steps/git_source.go index 7d5ffa66c1b..85a60e7b07d 100644 --- a/pkg/steps/git_source.go +++ b/pkg/steps/git_source.go @@ -42,7 +42,7 @@ func (s *gitSourceStep) run(ctx context.Context) error { secretName = s.cloneAuthConfig.Secret.Name } - return handleBuild(ctx, s.buildClient, buildFromSource(s.jobSpec, "", api.PipelineImageStreamTagReferenceRoot, buildapi.BuildSource{ + return handleBuild(ctx, s.buildClient, *buildFromSource(s.jobSpec, "", api.PipelineImageStreamTagReferenceRoot, buildapi.BuildSource{ Type: buildapi.BuildSourceGit, Dockerfile: s.config.DockerfileLiteral, ContextDir: s.config.ContextDir, diff --git a/pkg/steps/index_generator.go b/pkg/steps/index_generator.go index 5b6536607f4..a748746a19f 100644 --- a/pkg/steps/index_generator.go +++ b/pkg/steps/index_generator.go @@ -79,7 +79,7 @@ func (s *indexGeneratorStep) run(ctx context.Context) error { s.pullSecret, nil, ) - err = handleBuild(ctx, s.client, build) + err = handleBuild(ctx, s.client, *build) if err != nil && strings.Contains(err.Error(), "error checking provided apis") { return results.ForReason("generating_index").WithError(err).Errorf("failed to generate operator index due to invalid bundle info: %v", err) } diff --git a/pkg/steps/pipeline_image_cache.go b/pkg/steps/pipeline_image_cache.go index ed5cea9ca64..cb4598c8b8b 100644 --- a/pkg/steps/pipeline_image_cache.go +++ b/pkg/steps/pipeline_image_cache.go @@ -44,7 +44,7 @@ func (s *pipelineImageCacheStep) run(ctx context.Context) error { if err != nil { return err } - return handleBuild(ctx, s.client, buildFromSource( + return handleBuild(ctx, s.client, *buildFromSource( s.jobSpec, s.config.From, s.config.To, buildapi.BuildSource{ Type: buildapi.BuildSourceDockerfile, diff --git a/pkg/steps/project_image.go b/pkg/steps/project_image.go index e142c75fae1..0e4a0446bfa 100644 --- a/pkg/steps/project_image.go +++ b/pkg/steps/project_image.go @@ -61,7 +61,7 @@ func (s *projectDirectoryImageBuildStep) run(ctx context.Context) error { s.pullSecret, s.config.BuildArgs, ) - return handleBuild(ctx, s.client, build) + return handleBuild(ctx, s.client, *build) } type workingDir func(tag string) (string, error) diff --git a/pkg/steps/rpm_injection.go b/pkg/steps/rpm_injection.go index f77f4bed384..bb002311085 100644 --- a/pkg/steps/rpm_injection.go +++ b/pkg/steps/rpm_injection.go @@ -48,7 +48,7 @@ func (s *rpmImageInjectionStep) run(ctx context.Context) error { if err != nil { return err } - return handleBuild(ctx, s.client, buildFromSource( + return handleBuild(ctx, s.client, *buildFromSource( s.jobSpec, s.config.From, s.config.To, buildapi.BuildSource{ Type: buildapi.BuildSourceDockerfile, diff --git a/pkg/steps/source.go b/pkg/steps/source.go index 6db8abbe4d8..56ca34814d0 100644 --- a/pkg/steps/source.go +++ b/pkg/steps/source.go @@ -159,7 +159,7 @@ func (s *sourceStep) run(ctx context.Context) error { if err != nil { return err } - return handleBuild(ctx, s.client, createBuild(s.config, s.jobSpec, clonerefsRef, s.resources, s.cloneAuthConfig, s.pullSecret, fromDigest)) + return handleBuild(ctx, s.client, *createBuild(s.config, s.jobSpec, clonerefsRef, s.resources, s.cloneAuthConfig, s.pullSecret, fromDigest)) } func createBuild(config api.SourceStepConfiguration, jobSpec *api.JobSpec, clonerefsRef corev1.ObjectReference, resources api.ResourceConfiguration, cloneAuthConfig *CloneAuthConfig, pullSecret *corev1.Secret, fromDigest string) *buildapi.Build { @@ -369,27 +369,29 @@ func isBuildPhaseTerminated(phase buildapi.BuildPhase) bool { return true } -func handleBuild(ctx context.Context, buildClient BuildClient, build *buildapi.Build) error { +func handleBuild(ctx context.Context, buildClient BuildClient, build buildapi.Build) error { var buildErrs []error attempts := 5 if boErr := wait.ExponentialBackoff(wait.Backoff{Duration: time.Minute, Factor: 1.5, Steps: attempts}, func() (bool, error) { - if err := buildClient.Create(ctx, build); err != nil && !kerrors.IsAlreadyExists(err) { - return false, fmt.Errorf("could not create build %s: %w", build.Name, err) + var buildAttempt buildapi.Build + build.DeepCopyInto(&buildAttempt) + if err := buildClient.Create(ctx, &buildAttempt); err != nil && !kerrors.IsAlreadyExists(err) { + return false, fmt.Errorf("could not create build %s: %w", buildAttempt.Name, err) } - buildErr := waitForBuildOrTimeout(ctx, buildClient, build.Namespace, build.Name) + buildErr := waitForBuildOrTimeout(ctx, buildClient, buildAttempt.Namespace, buildAttempt.Name) if buildErr == nil { - if err := gatherSuccessfulBuildLog(buildClient, build.Namespace, build.Name); err != nil { + if err := gatherSuccessfulBuildLog(buildClient, buildAttempt.Namespace, buildAttempt.Name); err != nil { // log error but do not fail successful build - logrus.WithError(err).Warnf("Failed gathering successful build %s logs into artifacts.", build.Name) + logrus.WithError(err).Warnf("Failed gathering successful build %s logs into artifacts.", buildAttempt.Name) } return true, nil } buildErrs = append(buildErrs, buildErr) b := &buildapi.Build{} - if err := buildClient.Get(ctx, ctrlruntimeclient.ObjectKey{Namespace: build.Namespace, Name: build.Name}, b); err != nil { - return false, fmt.Errorf("could not get build %s: %w", build.Name, err) + if err := buildClient.Get(ctx, ctrlruntimeclient.ObjectKey{Namespace: buildAttempt.Namespace, Name: buildAttempt.Name}, b); err != nil { + return false, fmt.Errorf("could not get build %s: %w", buildAttempt.Name, err) } if !isBuildPhaseTerminated(b.Status.Phase) { @@ -408,11 +410,11 @@ func handleBuild(ctx context.Context, buildClient BuildClient, build *buildapi.B Preconditions: &metav1.Preconditions{UID: &b.UID}, PropagationPolicy: &foreground, } - if err := buildClient.Delete(ctx, build, &ctrlruntimeclient.DeleteOptions{Raw: &opts}); err != nil && !kerrors.IsNotFound(err) && !kerrors.IsConflict(err) { - return false, fmt.Errorf("could not delete build %s: %w", build.Name, err) + if err := buildClient.Delete(ctx, &buildAttempt, &ctrlruntimeclient.DeleteOptions{Raw: &opts}); err != nil && !kerrors.IsNotFound(err) && !kerrors.IsConflict(err) { + return false, fmt.Errorf("could not delete build %s: %w", buildAttempt.Name, err) } - if err := waitForBuildDeletion(ctx, buildClient, build.Namespace, build.Name); err != nil { - return false, fmt.Errorf("could not wait for build %s to be deleted: %w", build.Name, err) + if err := waitForBuildDeletion(ctx, buildClient, buildAttempt.Namespace, buildAttempt.Name); err != nil { + return false, fmt.Errorf("could not wait for build %s to be deleted: %w", buildAttempt.Name, err) } return false, nil }); boErr != nil {