Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 66 additions & 68 deletions pkg/steps/source.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/errors"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/wait"
prowv1 "k8s.io/test-infra/prow/apis/prowjobs/v1"
Expand Down Expand Up @@ -361,6 +360,39 @@ func buildInputsFromStep(inputs map[string]api.ImageBuildInputs) []buildapi.Imag
return refs
}

func handleFailedBuild(ctx context.Context, client BuildClient, ns, name string, err error) error {
b := &buildapi.Build{}
if err := client.Get(ctx, ctrlruntimeclient.ObjectKey{Namespace: ns, Name: name}, b); err != nil {
return fmt.Errorf("could not get build %s: %w", name, err)
}

if !isBuildPhaseTerminated(b.Status.Phase) {
logrus.Debugf("Build %q (created at %v) still in phase %q", name, b.CreationTimestamp, b.Status.Phase)
return err
}

if !(isInfraReason(b.Status.Reason) || hintsAtInfraReason(b.Status.LogSnippet)) {
logrus.Debugf("Build %q (created at %v) classified as legitimate failure, will not be retried", name, b.CreationTimestamp)
return err
}

logrus.Infof("Build %s previously failed from an infrastructure error (%s), retrying...", name, b.Status.Reason)
zero := int64(0)
foreground := metav1.DeletePropagationForeground
opts := metav1.DeleteOptions{
GracePeriodSeconds: &zero,
Preconditions: &metav1.Preconditions{UID: &b.UID},
PropagationPolicy: &foreground,
}
if err := client.Delete(ctx, b, &ctrlruntimeclient.DeleteOptions{Raw: &opts}); err != nil && !kerrors.IsNotFound(err) && !kerrors.IsConflict(err) {
return fmt.Errorf("could not delete build %s: %w", name, err)
}
if err := waitForBuildDeletion(ctx, client, ns, name); err != nil {
return fmt.Errorf("could not wait for build %s to be deleted: %w", name, err)
}
return nil
}

func isBuildPhaseTerminated(phase buildapi.BuildPhase) bool {
switch phase {
case buildapi.BuildPhaseNew,
Expand Down Expand Up @@ -417,59 +449,34 @@ func constructMultiArchBuilds(build buildapi.Build, nodeArchitectures []string)
return ret
}

func handleBuild(ctx context.Context, buildClient BuildClient, build buildapi.Build) error {
var buildErrs []error
attempts := 5
if boErr := wait.ExponentialBackoff(wait.Backoff{Duration: time.Minute, Factor: 1.5, Steps: attempts}, func() (bool, error) {
var buildAttempt buildapi.Build
build.DeepCopyInto(&buildAttempt)
if err := buildClient.Create(ctx, &buildAttempt); err != nil && !kerrors.IsAlreadyExists(err) {
return false, fmt.Errorf("could not create build %s: %w", buildAttempt.Name, err)
}

buildErr := waitForBuildOrTimeout(ctx, buildClient, buildAttempt.Namespace, buildAttempt.Name)
if buildErr == nil {
if err := gatherSuccessfulBuildLog(buildClient, buildAttempt.Namespace, buildAttempt.Name); err != nil {
// log error but do not fail successful build
logrus.WithError(err).Warnf("Failed gathering successful build %s logs into artifacts.", buildAttempt.Name)
}
return true, nil
}
buildErrs = append(buildErrs, buildErr)

b := &buildapi.Build{}
if err := buildClient.Get(ctx, ctrlruntimeclient.ObjectKey{Namespace: buildAttempt.Namespace, Name: buildAttempt.Name}, b); err != nil {
return false, fmt.Errorf("could not get build %s: %w", buildAttempt.Name, err)
}

if !isBuildPhaseTerminated(b.Status.Phase) {
return false, buildErr
}

if !(isInfraReason(b.Status.Reason) || hintsAtInfraReason(b.Status.LogSnippet)) {
return false, buildErr
}

logrus.Infof("Build %s previously failed from an infrastructure error (%s), retrying...", b.Name, b.Status.Reason)
zero := int64(0)
foreground := metav1.DeletePropagationForeground
opts := metav1.DeleteOptions{
GracePeriodSeconds: &zero,
Preconditions: &metav1.Preconditions{UID: &b.UID},
PropagationPolicy: &foreground,
func handleBuild(ctx context.Context, client BuildClient, build buildapi.Build) error {
const attempts = 5
ns, name := build.Namespace, build.Name
var errs []error
if err := wait.ExponentialBackoff(wait.Backoff{Duration: time.Minute, Factor: 1.5, Steps: attempts}, func() (bool, error) {
var attempt buildapi.Build
build.DeepCopyInto(&attempt)
if err := client.Create(ctx, &attempt); err == nil {
logrus.Infof("Created build %q", name)
} else if kerrors.IsAlreadyExists(err) {
logrus.Infof("Found existing build %q", name)
} else {
return false, fmt.Errorf("could not create build %s: %w", name, err)
}
if err := buildClient.Delete(ctx, &buildAttempt, &ctrlruntimeclient.DeleteOptions{Raw: &opts}); err != nil && !kerrors.IsNotFound(err) && !kerrors.IsConflict(err) {
return false, fmt.Errorf("could not delete build %s: %w", buildAttempt.Name, err)
if err := waitForBuildOrTimeout(ctx, client, ns, name); err != nil {
errs = append(errs, err)
return false, handleFailedBuild(ctx, client, ns, name, err)
}
if err := waitForBuildDeletion(ctx, buildClient, buildAttempt.Namespace, buildAttempt.Name); err != nil {
return false, fmt.Errorf("could not wait for build %s to be deleted: %w", buildAttempt.Name, err)
if err := gatherSuccessfulBuildLog(client, ns, name); err != nil {
// log error but do not fail successful build
logrus.WithError(err).Warnf("Failed gathering successful build %s logs into artifacts.", name)
}
return false, nil
}); boErr != nil {
if boErr == wait.ErrWaitTimeout {
return fmt.Errorf("build not successful after %d attempts: %w", attempts, errors.NewAggregate(buildErrs))
return true, nil
}); err != nil {
if err == wait.ErrWaitTimeout {
return fmt.Errorf("build not successful after %d attempts: %w", attempts, utilerrors.NewAggregate(errs))
}
return boErr
return err
}
return nil
}
Expand Down Expand Up @@ -499,7 +506,7 @@ func waitForBuildDeletion(ctx context.Context, client ctrlruntimeclient.Client,

func isInfraReason(reason buildapi.StatusReason) bool {
infraReasons := []buildapi.StatusReason{
buildapi.StatusReason("BuildPodEvicted"), // vendoring to get this is so hard
buildapi.StatusReasonBuildPodEvicted,
buildapi.StatusReasonBuildPodDeleted,
buildapi.StatusReasonBuildPodExists,
buildapi.StatusReasonCannotCreateBuildPod,
Expand Down Expand Up @@ -533,19 +540,10 @@ func hintsAtInfraReason(logSnippet string) bool {
}

func waitForBuildOrTimeout(ctx context.Context, buildClient BuildClient, namespace, name string) error {
return waitForBuild(ctx, buildClient, namespace, name, 0, buildDuration)
return waitForBuild(ctx, buildClient, namespace, name)
}

func waitForBuild(ctx context.Context, buildClient BuildClient, namespace, name string, timeout time.Duration, buildDurationFunc func(*buildapi.Build) time.Duration) error {
isOK := func(b *buildapi.Build) bool {
return b.Status.Phase == buildapi.BuildPhaseComplete
}
isFailed := func(b *buildapi.Build) bool {
return b.Status.Phase == buildapi.BuildPhaseFailed ||
b.Status.Phase == buildapi.BuildPhaseCancelled ||
b.Status.Phase == buildapi.BuildPhaseError
}

func waitForBuild(ctx context.Context, buildClient BuildClient, namespace, name string) error {
logrus.WithFields(logrus.Fields{
"namespace": namespace,
"name": name,
Expand All @@ -554,22 +552,22 @@ func waitForBuild(ctx context.Context, buildClient BuildClient, namespace, name
evaluatorFunc := func(obj runtime.Object) (bool, error) {
switch build := obj.(type) {
case *buildapi.Build:
if isOK(build) {
logrus.Infof("Build %s succeeded after %s", build.Name, buildDurationFunc(build).Truncate(time.Second))
switch build.Status.Phase {
case buildapi.BuildPhaseComplete:
logrus.Infof("Build %s succeeded after %s", build.Name, buildDuration(build).Truncate(time.Second))
return true, nil
}
if isFailed(build) {
case buildapi.BuildPhaseFailed, buildapi.BuildPhaseCancelled, buildapi.BuildPhaseError:
logrus.Infof("Build %s failed, printing logs:", build.Name)
printBuildLogs(buildClient, build.Namespace, build.Name)
return true, util.AppendLogToError(fmt.Errorf("the build %s failed after %s with reason %s: %s", build.Name, buildDurationFunc(build).Truncate(time.Second), build.Status.Reason, build.Status.Message), build.Status.LogSnippet)
return true, util.AppendLogToError(fmt.Errorf("the build %s failed after %s with reason %s: %s", build.Name, buildDuration(build).Truncate(time.Second), build.Status.Reason, build.Status.Message), build.Status.LogSnippet)
}
default:
return false, fmt.Errorf("build/%v ns/%v got an event that did not contain a build: %v", name, namespace, obj)
}
return false, nil
}

return kubernetes.WaitForConditionOnObject(ctx, buildClient, ctrlruntimeclient.ObjectKey{Namespace: namespace, Name: name}, &buildapi.BuildList{}, &buildapi.Build{}, evaluatorFunc, timeout)
return kubernetes.WaitForConditionOnObject(ctx, buildClient, ctrlruntimeclient.ObjectKey{Namespace: namespace, Name: name}, &buildapi.BuildList{}, &buildapi.Build{}, evaluatorFunc, 0)
}

func buildDuration(build *buildapi.Build) time.Duration {
Expand Down
25 changes: 12 additions & 13 deletions pkg/steps/source_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -386,16 +386,13 @@ func init() {
}

func TestWaitForBuild(t *testing.T) {
now := meta.Time{Time: time.Now()}
start, end := meta.Time{Time: now.Time.Add(-3 * time.Second)}, now
var testCases = []struct {
name string
buildClient BuildClient
expected error
}{
{
name: "timeout",
buildClient: NewBuildClient(loggingclient.New(fakectrlruntimeclient.NewClientBuilder().WithRuntimeObjects().Build()), nil, nil),
expected: fmt.Errorf("timed out waiting for the condition"),
},
{
name: "build succeeded",
buildClient: NewBuildClient(loggingclient.New(fakectrlruntimeclient.NewClientBuilder().WithRuntimeObjects(
Expand All @@ -405,7 +402,9 @@ func TestWaitForBuild(t *testing.T) {
Namespace: "some-ns",
},
Status: buildapi.BuildStatus{
Phase: buildapi.BuildPhaseComplete,
Phase: buildapi.BuildPhaseComplete,
StartTimestamp: &start,
CompletionTimestamp: &end,
},
}).Build()), nil, nil),
},
Expand All @@ -418,10 +417,12 @@ func TestWaitForBuild(t *testing.T) {
Namespace: "some-ns",
},
Status: buildapi.BuildStatus{
Phase: buildapi.BuildPhaseCancelled,
Reason: "reason",
Message: "msg",
LogSnippet: "snippet",
Phase: buildapi.BuildPhaseCancelled,
Reason: "reason",
Message: "msg",
LogSnippet: "snippet",
StartTimestamp: &start,
CompletionTimestamp: &end,
},
}).Build()), "abc\n"), // the line break is for gotestsum https://github.com/gotestyourself/gotestsum/issues/141#issuecomment-1209146526
expected: fmt.Errorf("%s\n\n%s", "the build some-build failed after 3s with reason reason: msg", "snippet"),
Expand All @@ -430,9 +431,7 @@ func TestWaitForBuild(t *testing.T) {

for _, testCase := range testCases {
t.Run(testCase.name, func(t *testing.T) {
actual := waitForBuild(context.TODO(), testCase.buildClient, "some-ns", "some-build", 90*time.Millisecond, func(build *buildapi.Build) time.Duration {
return 3 * time.Second
})
actual := waitForBuild(context.TODO(), testCase.buildClient, "some-ns", "some-build")
if diff := cmp.Diff(testCase.expected, actual, testhelper.EquateErrorMessage); diff != "" {
t.Errorf("%s: mismatch (-expected +actual), diff: %s", testCase.name, diff)
}
Expand Down