From c056a8cb1f929d952c3f08684607bd694c0b020e Mon Sep 17 00:00:00 2001 From: petr-muller Date: Mon, 7 Feb 2022 20:00:39 +0100 Subject: [PATCH] ci-operator: retry infra-failed builds immediately `ci-operator` was already able to recognize infrastructure-failed builds from previous runs and retry them. This is an attempt to reuse that code to retry such failed builds immediately, with up to five attempts in an exponential backoff. The backoff has an intentionally long starting delay of 1 minute to give the infrastructure problem a chance to go away. The way the code is structured makes it suboptimal for the case where we are retrying infra failures from previous executions: it will eat one of the backoff iterations, but such cases should be rare because ci-op runs should not result in failures caused by infrastructure problems anymore (because they are retried immediately). --- pkg/steps/source.go | 75 ++++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 32 deletions(-) diff --git a/pkg/steps/source.go b/pkg/steps/source.go index b57d072df2e..2ae0e3ec815 100644 --- a/pkg/steps/source.go +++ b/pkg/steps/source.go @@ -369,46 +369,57 @@ func isBuildPhaseTerminated(phase buildapi.BuildPhase) bool { } func handleBuild(ctx context.Context, buildClient BuildClient, build *buildapi.Build) error { - if err := buildClient.Create(ctx, build); err != nil { - if !kerrors.IsAlreadyExists(err) { - return fmt.Errorf("could not create build %s: %w", build.Name, err) + var buildErr error + attempts := 5 + if boErr := wait.ExponentialBackoff(wait.Backoff{Duration: time.Minute, Factor: 1.5, Steps: attempts}, func() (bool, error) { + if err := buildClient.Create(ctx, build); err != nil && !kerrors.IsAlreadyExists(err) { + return false, fmt.Errorf("could not create build %s: %w", build.Name, err) } + + buildErr = waitForBuildOrTimeout(ctx, buildClient, build.Namespace, build.Name) + if buildErr == nil { + if err := gatherSuccessfulBuildLog(buildClient, 
build.Namespace, build.Name); err != nil { + // log error but do not fail successful build + logrus.WithError(err).Warnf("Failed gathering successful build %s logs into artifacts.", build.Name) + } + return true, nil + } + b := &buildapi.Build{} if err := buildClient.Get(ctx, ctrlruntimeclient.ObjectKey{Namespace: build.Namespace, Name: build.Name}, b); err != nil { - return fmt.Errorf("could not get build %s: %w", build.Name, err) + return false, fmt.Errorf("could not get build %s: %w", build.Name, err) } - if isBuildPhaseTerminated(b.Status.Phase) && - (isInfraReason(b.Status.Reason) || hintsAtInfraReason(b.Status.LogSnippet)) { - logrus.Infof("Build %s previously failed from an infrastructure error (%s), retrying...", b.Name, b.Status.Reason) - zero := int64(0) - foreground := metav1.DeletePropagationForeground - opts := metav1.DeleteOptions{ - GracePeriodSeconds: &zero, - Preconditions: &metav1.Preconditions{UID: &b.UID}, - PropagationPolicy: &foreground, - } - if err := buildClient.Delete(ctx, build, &ctrlruntimeclient.DeleteOptions{Raw: &opts}); err != nil && !kerrors.IsNotFound(err) && !kerrors.IsConflict(err) { - return fmt.Errorf("could not delete build %s: %w", build.Name, err) - } - if err := waitForBuildDeletion(ctx, buildClient, build.Namespace, build.Name); err != nil { - return fmt.Errorf("could not wait for build %s to be deleted: %w", build.Name, err) - } - if err := buildClient.Create(ctx, build); err != nil && !kerrors.IsAlreadyExists(err) { - return fmt.Errorf("could not recreate build %s: %w", build.Name, err) - } + if !isBuildPhaseTerminated(b.Status.Phase) { + return false, buildErr } - } - err := waitForBuildOrTimeout(ctx, buildClient, build.Namespace, build.Name) - if err == nil { - if err := gatherSuccessfulBuildLog(buildClient, build.Namespace, build.Name); err != nil { - // log error but do not fail successful build - logrus.WithError(err).Warnf("Failed gathering successful build %s logs into artifacts.", build.Name) + + if 
!(isInfraReason(b.Status.Reason) || hintsAtInfraReason(b.Status.LogSnippet)) { + return false, buildErr } - } - // this will still be the err from waitForBuild - return err + logrus.Infof("Build %s previously failed from an infrastructure error (%s), retrying...", b.Name, b.Status.Reason) + zero := int64(0) + foreground := metav1.DeletePropagationForeground + opts := metav1.DeleteOptions{ + GracePeriodSeconds: &zero, + Preconditions: &metav1.Preconditions{UID: &b.UID}, + PropagationPolicy: &foreground, + } + if err := buildClient.Delete(ctx, build, &ctrlruntimeclient.DeleteOptions{Raw: &opts}); err != nil && !kerrors.IsNotFound(err) && !kerrors.IsConflict(err) { + return false, fmt.Errorf("could not delete build %s: %w", build.Name, err) + } + if err := waitForBuildDeletion(ctx, buildClient, build.Namespace, build.Name); err != nil { + return false, fmt.Errorf("could not wait for build %s to be deleted: %w", build.Name, err) + } + return false, nil + }); boErr != nil { + if boErr == wait.ErrWaitTimeout { + return fmt.Errorf("build not successful after %d attempts, last error: %w", attempts, buildErr) + } + return boErr + } + return nil } func waitForBuildDeletion(ctx context.Context, client ctrlruntimeclient.Client, ns, name string) error {