diff --git a/pkg/steps/source.go b/pkg/steps/source.go index 0cd86447d32..d524f15d71d 100644 --- a/pkg/steps/source.go +++ b/pkg/steps/source.go @@ -17,7 +17,6 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/util/errors" utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/wait" prowv1 "k8s.io/test-infra/prow/apis/prowjobs/v1" @@ -361,6 +360,39 @@ func buildInputsFromStep(inputs map[string]api.ImageBuildInputs) []buildapi.Imag return refs } +func handleFailedBuild(ctx context.Context, client BuildClient, ns, name string, err error) error { + b := &buildapi.Build{} + if err := client.Get(ctx, ctrlruntimeclient.ObjectKey{Namespace: ns, Name: name}, b); err != nil { + return fmt.Errorf("could not get build %s: %w", name, err) + } + + if !isBuildPhaseTerminated(b.Status.Phase) { + logrus.Debugf("Build %q (created at %v) still in phase %q", name, b.CreationTimestamp, b.Status.Phase) + return err + } + + if !(isInfraReason(b.Status.Reason) || hintsAtInfraReason(b.Status.LogSnippet)) { + logrus.Debugf("Build %q (created at %v) classified as legitimate failure, will not be retried", name, b.CreationTimestamp) + return err + } + + logrus.Infof("Build %s previously failed from an infrastructure error (%s), retrying...", name, b.Status.Reason) + zero := int64(0) + foreground := metav1.DeletePropagationForeground + opts := metav1.DeleteOptions{ + GracePeriodSeconds: &zero, + Preconditions: &metav1.Preconditions{UID: &b.UID}, + PropagationPolicy: &foreground, + } + if err := client.Delete(ctx, b, &ctrlruntimeclient.DeleteOptions{Raw: &opts}); err != nil && !kerrors.IsNotFound(err) && !kerrors.IsConflict(err) { + return fmt.Errorf("could not delete build %s: %w", name, err) + } + if err := waitForBuildDeletion(ctx, client, ns, name); err != nil { + return fmt.Errorf("could not wait for build %s to be deleted: %w", name, err) + } + return nil +} + func isBuildPhaseTerminated(phase buildapi.BuildPhase) bool { switch phase { case buildapi.BuildPhaseNew, @@ -417,59 +449,34 @@ func constructMultiArchBuilds(build buildapi.Build, nodeArchitectures []string) return ret } -func handleBuild(ctx context.Context, buildClient BuildClient, build buildapi.Build) error { - var buildErrs []error - attempts := 5 - if boErr := wait.ExponentialBackoff(wait.Backoff{Duration: time.Minute, Factor: 1.5, Steps: attempts}, func() (bool, error) { - var buildAttempt buildapi.Build - build.DeepCopyInto(&buildAttempt) - if err := buildClient.Create(ctx, &buildAttempt); err != nil && !kerrors.IsAlreadyExists(err) { - return false, fmt.Errorf("could not create build %s: %w", buildAttempt.Name, err) - } - - buildErr := waitForBuildOrTimeout(ctx, buildClient, buildAttempt.Namespace, buildAttempt.Name) - if buildErr == nil { - if err := gatherSuccessfulBuildLog(buildClient, buildAttempt.Namespace, buildAttempt.Name); err != nil { - // log error but do not fail successful build - logrus.WithError(err).Warnf("Failed gathering successful build %s logs into artifacts.", buildAttempt.Name) - } - return true, nil - } - buildErrs = append(buildErrs, buildErr) - - b := &buildapi.Build{} - if err := buildClient.Get(ctx, ctrlruntimeclient.ObjectKey{Namespace: buildAttempt.Namespace, Name: buildAttempt.Name}, b); err != nil { - return false, fmt.Errorf("could not get build %s: %w", buildAttempt.Name, err) - } - - if !isBuildPhaseTerminated(b.Status.Phase) { - return false, buildErr - } - - if !(isInfraReason(b.Status.Reason) || hintsAtInfraReason(b.Status.LogSnippet)) { - return false, buildErr - } - - logrus.Infof("Build %s previously failed from an infrastructure error (%s), retrying...", b.Name, b.Status.Reason) - zero := int64(0) - foreground := metav1.DeletePropagationForeground - opts := metav1.DeleteOptions{ - GracePeriodSeconds: &zero, - Preconditions: &metav1.Preconditions{UID: &b.UID}, - PropagationPolicy: &foreground, +func handleBuild(ctx context.Context, client BuildClient, build buildapi.Build) error { + const attempts = 5 + ns, name := build.Namespace, build.Name + var errs []error + if err := wait.ExponentialBackoff(wait.Backoff{Duration: time.Minute, Factor: 1.5, Steps: attempts}, func() (bool, error) { + var attempt buildapi.Build + build.DeepCopyInto(&attempt) + if err := client.Create(ctx, &attempt); err == nil { + logrus.Infof("Created build %q", name) + } else if kerrors.IsAlreadyExists(err) { + logrus.Infof("Found existing build %q", name) + } else { + return false, fmt.Errorf("could not create build %s: %w", name, err) } - if err := buildClient.Delete(ctx, &buildAttempt, &ctrlruntimeclient.DeleteOptions{Raw: &opts}); err != nil && !kerrors.IsNotFound(err) && !kerrors.IsConflict(err) { - return false, fmt.Errorf("could not delete build %s: %w", buildAttempt.Name, err) + if err := waitForBuildOrTimeout(ctx, client, ns, name); err != nil { + errs = append(errs, err) + return false, handleFailedBuild(ctx, client, ns, name, err) } - if err := waitForBuildDeletion(ctx, buildClient, buildAttempt.Namespace, buildAttempt.Name); err != nil { - return false, fmt.Errorf("could not wait for build %s to be deleted: %w", buildAttempt.Name, err) + if err := gatherSuccessfulBuildLog(client, ns, name); err != nil { + // log error but do not fail successful build + logrus.WithError(err).Warnf("Failed gathering successful build %s logs into artifacts.", name) } - return false, nil - }); boErr != nil { - if boErr == wait.ErrWaitTimeout { - return fmt.Errorf("build not successful after %d attempts: %w", attempts, errors.NewAggregate(buildErrs)) + return true, nil + }); err != nil { + if err == wait.ErrWaitTimeout { + return fmt.Errorf("build not successful after %d attempts: %w", attempts, utilerrors.NewAggregate(errs)) } - return boErr + return err } return nil } @@ -499,7 +506,7 @@ func waitForBuildDeletion(ctx context.Context, client ctrlruntimeclient.Client, func isInfraReason(reason buildapi.StatusReason) bool { infraReasons := []buildapi.StatusReason{ - buildapi.StatusReason("BuildPodEvicted"), // vendoring to get this is so hard + buildapi.StatusReasonBuildPodEvicted, buildapi.StatusReasonBuildPodDeleted, buildapi.StatusReasonBuildPodExists, buildapi.StatusReasonCannotCreateBuildPod, @@ -533,19 +540,10 @@ func hintsAtInfraReason(logSnippet string) bool { } func waitForBuildOrTimeout(ctx context.Context, buildClient BuildClient, namespace, name string) error { - return waitForBuild(ctx, buildClient, namespace, name, 0, buildDuration) + return waitForBuild(ctx, buildClient, namespace, name) } -func waitForBuild(ctx context.Context, buildClient BuildClient, namespace, name string, timeout time.Duration, buildDurationFunc func(*buildapi.Build) time.Duration) error { - isOK := func(b *buildapi.Build) bool { - return b.Status.Phase == buildapi.BuildPhaseComplete - } - isFailed := func(b *buildapi.Build) bool { - return b.Status.Phase == buildapi.BuildPhaseFailed || - b.Status.Phase == buildapi.BuildPhaseCancelled || - b.Status.Phase == buildapi.BuildPhaseError - } - +func waitForBuild(ctx context.Context, buildClient BuildClient, namespace, name string) error { logrus.WithFields(logrus.Fields{ "namespace": namespace, "name": name, @@ -554,14 +552,14 @@ func waitForBuild(ctx context.Context, buildClient BuildClient, namespace, name evaluatorFunc := func(obj runtime.Object) (bool, error) { switch build := obj.(type) { case *buildapi.Build: - if isOK(build) { - logrus.Infof("Build %s succeeded after %s", build.Name, buildDurationFunc(build).Truncate(time.Second)) + switch build.Status.Phase { + case buildapi.BuildPhaseComplete: + logrus.Infof("Build %s succeeded after %s", build.Name, buildDuration(build).Truncate(time.Second)) return true, nil - } - if isFailed(build) { + case buildapi.BuildPhaseFailed, buildapi.BuildPhaseCancelled, buildapi.BuildPhaseError: logrus.Infof("Build %s failed, printing logs:", build.Name) printBuildLogs(buildClient, build.Namespace, build.Name) - return true, util.AppendLogToError(fmt.Errorf("the build %s failed after %s with reason %s: %s", build.Name, buildDurationFunc(build).Truncate(time.Second), build.Status.Reason, build.Status.Message), build.Status.LogSnippet) + return true, util.AppendLogToError(fmt.Errorf("the build %s failed after %s with reason %s: %s", build.Name, buildDuration(build).Truncate(time.Second), build.Status.Reason, build.Status.Message), build.Status.LogSnippet) } default: return false, fmt.Errorf("build/%v ns/%v got an event that did not contain a build: %v", name, namespace, obj) @@ -569,7 +567,7 @@ func waitForBuild(ctx context.Context, buildClient BuildClient, namespace, name return false, nil } - return kubernetes.WaitForConditionOnObject(ctx, buildClient, ctrlruntimeclient.ObjectKey{Namespace: namespace, Name: name}, &buildapi.BuildList{}, &buildapi.Build{}, evaluatorFunc, timeout) + return kubernetes.WaitForConditionOnObject(ctx, buildClient, ctrlruntimeclient.ObjectKey{Namespace: namespace, Name: name}, &buildapi.BuildList{}, &buildapi.Build{}, evaluatorFunc, 0) } func buildDuration(build *buildapi.Build) time.Duration { diff --git a/pkg/steps/source_test.go b/pkg/steps/source_test.go index ed439f81a3e..7b875064d49 100644 --- a/pkg/steps/source_test.go +++ b/pkg/steps/source_test.go @@ -386,16 +386,13 @@ func init() { } func TestWaitForBuild(t *testing.T) { + now := meta.Time{Time: time.Now()} + start, end := meta.Time{Time: now.Time.Add(-3 * time.Second)}, now var testCases = []struct { name string buildClient BuildClient expected error }{ - { - name: "timeout", - buildClient: NewBuildClient(loggingclient.New(fakectrlruntimeclient.NewClientBuilder().WithRuntimeObjects().Build()), nil, nil), - expected: fmt.Errorf("timed out waiting for the condition"), - }, { name: "build succeeded", buildClient: NewBuildClient(loggingclient.New(fakectrlruntimeclient.NewClientBuilder().WithRuntimeObjects( @@ -405,7 +402,9 @@ func TestWaitForBuild(t *testing.T) { Namespace: "some-ns", }, Status: buildapi.BuildStatus{ - Phase: buildapi.BuildPhaseComplete, + Phase: buildapi.BuildPhaseComplete, + StartTimestamp: &start, + CompletionTimestamp: &end, }, }).Build()), nil, nil), }, @@ -418,10 +417,12 @@ func TestWaitForBuild(t *testing.T) { Namespace: "some-ns", }, Status: buildapi.BuildStatus{ - Phase: buildapi.BuildPhaseCancelled, - Reason: "reason", - Message: "msg", - LogSnippet: "snippet", + Phase: buildapi.BuildPhaseCancelled, + Reason: "reason", + Message: "msg", + LogSnippet: "snippet", + StartTimestamp: &start, + CompletionTimestamp: &end, }, }).Build()), "abc\n"), // the line break is for gotestsum https://github.com/gotestyourself/gotestsum/issues/141#issuecomment-1209146526 expected: fmt.Errorf("%s\n\n%s", "the build some-build failed after 3s with reason reason: msg", "snippet"), @@ -430,9 +431,7 @@ func TestWaitForBuild(t *testing.T) { for _, testCase := range testCases { t.Run(testCase.name, func(t *testing.T) { - actual := waitForBuild(context.TODO(), testCase.buildClient, "some-ns", "some-build", 90*time.Millisecond, func(build *buildapi.Build) time.Duration { - return 3 * time.Second - }) + actual := waitForBuild(context.TODO(), testCase.buildClient, "some-ns", "some-build") if diff := cmp.Diff(testCase.expected, actual, testhelper.EquateErrorMessage); diff != "" { t.Errorf("%s: mismatch (-expected +actual), diff: %s", testCase.name, diff) }