From c33c93860044e889f00089ea965b25a03b7e39b2 Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Wed, 31 Jan 2024 17:32:46 +0100 Subject: [PATCH 1/2] SyncWorkerStatus: Add `Failures` field with all failures Previously, the errors that happened during reconciliation were flattened by `summarizeTaskGraphErrors` method call in `consistentReporter.Errors()` before they were sent to status reconciliation method `syncStatus`, which propagates this flattened error to the ClusterVersion status field. Rename the `Failure` field to `FailureSummary`, it still caries the flattened error. Add a new `Failures` field to carry the full slice of the original errors. The error processing is overall a bit weird because the errors get flattened first at the resource reconciling loop side, and then the status reporting loop side uses the flattened error to somehow reconstruct what actually happened, but cleaning up that code would be a much larger effort, so this commit simply makes original errors to be passed to status reporting loop. It is not easy to do all error processing at a single side; it would make sense to do that on status reporting level, but the initial flattening uses some task graph / resource reconciliation data that are not available on the status reporting side. --- pkg/cvo/cvo.go | 2 +- pkg/cvo/cvo_scenarios_test.go | 47 +++++++++++++++++++++++++++++------ pkg/cvo/cvo_test.go | 24 +++++++++--------- pkg/cvo/status.go | 12 ++++----- pkg/cvo/status_test.go | 6 ++--- pkg/cvo/sync_worker.go | 44 +++++++++++--------------------- pkg/cvo/sync_worker_test.go | 24 +++++++++--------- 7 files changed, 87 insertions(+), 72 deletions(-) diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index ee8d49c82a..0f7ddfc8e1 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -681,7 +681,7 @@ func (optr *Operator) sync(ctx context.Context, key string) error { // handle the case of a misconfigured CVO by doing nothing if len(desired.Image) == 0 { return optr.syncStatus(ctx, original, config, &SyncWorkerStatus{ - Failure: &payload.UpdateError{ + FailureSummary: &payload.UpdateError{ Reason: "NoDesiredImage", Message: "No configured operator version, unable to update cluster", }, diff --git a/pkg/cvo/cvo_scenarios_test.go b/pkg/cvo/cvo_scenarios_test.go index 82c1cdd5a7..7733e4e10f 100644 --- a/pkg/cvo/cvo_scenarios_test.go +++ b/pkg/cvo/cvo_scenarios_test.go @@ -11,6 +11,8 @@ import ( "time" "github.com/davecgh/go-spew/spew" + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" "github.com/google/uuid" "k8s.io/apimachinery/pkg/api/errors" @@ -3473,7 +3475,7 @@ func TestCVO_ErrorDuringReconcile(t *testing.T) { Done: 2, Total: 3, VersionHash: "DL-FFQ2Uem8=", Architecture: architecture, - Failure: &payload.UpdateError{ + FailureSummary: &payload.UpdateError{ Nested: fmt.Errorf("unable to proceed"), Reason: "UpdatePayloadFailed", Message: "Could not update test \"file-yml\" (3 of 3)", @@ -3644,7 +3646,7 @@ func TestCVO_ParallelError(t *testing.T) { // // verify we observe the remaining changes in the first sync for status := range worker.StatusCh() { - if status.Failure == nil { + if status.FailureSummary == nil { if status.Done == 0 || (status.Done == 1 && status.Total == 3) { if !reflect.DeepEqual(status, SyncWorkerStatus{ Initial: true, @@ -3673,7 +3675,7 @@ func TestCVO_ParallelError(t *testing.T) { } continue } - err := status.Failure + err := status.FailureSummary uErr, ok := err.(*payload.UpdateError) if !ok || uErr.Reason != "MultipleErrors" || uErr.Message != "Multiple errors are preventing progress:\n* Failed to reconcile 10-a-file resource\n* Failed to reconcile 20-b-file resource" { t.Fatalf("unexpected error: %v", err) @@ -3681,9 +3683,34 @@ func TestCVO_ParallelError(t *testing.T) { if status.LastProgress.IsZero() { t.Fatalf("unexpected last progress: %v", status.LastProgress) } - if !reflect.DeepEqual(status, SyncWorkerStatus{ - Initial: true, - Failure: err, + if diff := cmp.Diff(status, SyncWorkerStatus{ + Initial: true, + FailureSummary: err, + Failures: []error{ + &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Message: "Failed to reconcile 10-a-file resource", + Task: &payload.Task{ + Index: 1, + Total: 3, + Manifest: &manifest.Manifest{ + OriginalFilename: "0000_10_a_file.yaml", + GVK: schema.GroupVersionKind{Version: "v1", Kind: "Test"}, + }, + }, + }, + &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Message: "Failed to reconcile 20-b-file resource", + Task: &payload.Task{ + Index: 3, + Total: 3, + Manifest: &manifest.Manifest{ + OriginalFilename: "0000_20_b_file.yaml", + GVK: schema.GroupVersionKind{Version: "v1", Kind: "Test"}, + }, + }, + }}, Done: 1, Total: 3, VersionHash: "Gyh2W6qcDO4=", @@ -3703,8 +3730,8 @@ func TestCVO_ParallelError(t *testing.T) { LastTransitionTime: status.loadPayloadStatus.LastTransitionTime, Update: configv1.Update{Version: "1.0.0-abc", Image: "image/image:1"}, }, - }) { - t.Fatalf("unexpected final: %v", status) + }, cmp.AllowUnexported(SyncWorkerStatus{}), cmpopts.IgnoreFields(manifest.Manifest{}, "id", "Raw", "Obj")); diff != "" { + t.Fatalf("finals differs from expected: %s", diff) } break } @@ -3964,6 +3991,10 @@ func verifyAllStatusOptionalDone(t *testing.T, stepName string, ignoreDone bool, t.Fatalf("%s: channel closed after reading only %d items", testName, i) } + if expect.FailureSummary != nil { + expect.Failures = []error{expect.FailureSummary} + } + if nextTime := actual.LastProgress; !nextTime.Equal(lastTime) { actual.LastProgress = time.Unix(count, 0) count++ diff --git a/pkg/cvo/cvo_test.go b/pkg/cvo/cvo_test.go index 73c2275bb6..4125254679 100644 --- a/pkg/cvo/cvo_test.go +++ b/pkg/cvo/cvo_test.go @@ -194,7 +194,7 @@ func TestOperator_sync(t *testing.T) { syncStatus: &SyncWorkerStatus{ Reconciling: false, Actual: configv1.Release{Version: "0.0.1-abc", Image: "image/image:v4.0.1"}, - Failure: &payload.UpdateError{ + FailureSummary: &payload.UpdateError{ Reason: "UpdatePayloadIntegrity", Message: "unable to apply object", }, @@ -275,7 +275,7 @@ func TestOperator_sync(t *testing.T) { Returns: &SyncWorkerStatus{ Reconciling: true, Actual: configv1.Release{Version: "0.0.1-abc", Image: "image/image:v4.0.1"}, - Failure: &payload.UpdateError{ + FailureSummary: &payload.UpdateError{ Reason: "UpdatePayloadIntegrity", Message: "unable to apply object", }, @@ -352,7 +352,7 @@ func TestOperator_sync(t *testing.T) { Reconciling: true, Completed: 2, Actual: configv1.Release{Version: "0.0.1-abc", Image: "image/image:v4.0.1"}, - Failure: &payload.UpdateError{ + FailureSummary: &payload.UpdateError{ Reason: "UpdatePayloadIntegrity", Message: "unable to apply object", }, @@ -426,9 +426,9 @@ func TestOperator_sync(t *testing.T) { name: "default", configSync: &fakeSyncRecorder{ Returns: &SyncWorkerStatus{ - Actual: configv1.Release{Version: "0.0.1-abc", Image: "image/image:v4.0.1"}, - Failure: fmt.Errorf("injected error"), - VersionHash: "foo", + Actual: configv1.Release{Version: "0.0.1-abc", Image: "image/image:v4.0.1"}, + FailureSummary: fmt.Errorf("injected error"), + VersionHash: "foo", }, }, client: fake.NewSimpleClientset( @@ -491,8 +491,8 @@ func TestOperator_sync(t *testing.T) { { name: "invalid image reports image error", syncStatus: &SyncWorkerStatus{ - Failure: os.ErrNotExist, - Actual: configv1.Release{Image: "image/image:v4.0.1", Version: "4.0.1"}, + FailureSummary: os.ErrNotExist, + Actual: configv1.Release{Image: "image/image:v4.0.1", Version: "4.0.1"}, }, optr: &Operator{ release: configv1.Release{ @@ -546,10 +546,10 @@ func TestOperator_sync(t *testing.T) { { name: "invalid image while progressing preserves progressing order and partial history", syncStatus: &SyncWorkerStatus{ - Done: 600, - Total: 1000, - Failure: os.ErrNotExist, - Actual: configv1.Release{Image: "image/image:v4.0.1", Version: "4.0.1"}, + Done: 600, + Total: 1000, + FailureSummary: os.ErrNotExist, + Actual: configv1.Release{Image: "image/image:v4.0.1", Version: "4.0.1"}, }, optr: &Operator{ release: configv1.Release{ diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index 36ec476c22..92ad84af73 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -179,7 +179,7 @@ const ImplicitlyEnabledCapabilities configv1.ClusterStatusConditionType = "Impli func (optr *Operator) syncStatus(ctx context.Context, original, config *configv1.ClusterVersion, status *SyncWorkerStatus, validationErrs field.ErrorList) error { // Be more verbose when we are syncing something interesting verbosityLevel := klog.Level(4) - if len(validationErrs) != 0 || status.Failure != nil { + if len(validationErrs) != 0 || status.FailureSummary != nil { verbosityLevel = klog.Level(2) } klog.V(verbosityLevel).Infof("Synchronizing status errs=%#v status=%#v", validationErrs, status) @@ -295,7 +295,7 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status progressReason, progressMessage, skipFailure := convertErrorToProgressing(cvStatus.History, now.Time, status) - if err := status.Failure; err != nil && !skipFailure { + if err := status.FailureSummary; err != nil && !skipFailure { var reason string msg := progressMessage if uErr, ok := err.(*payload.UpdateError); ok { @@ -389,10 +389,10 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status Reason: noReconciliationIssuesReason, Message: noReconciliationIssuesMessage, } - if status.Failure != nil { + if status.FailureSummary != nil { riCondition.Status = configv1.ConditionTrue riCondition.Reason = reconciliationIssuesFoundReason - riCondition.Message = fmt.Sprintf("%s: %s", reconciliationIssuesFoundMessage, status.Failure.Error()) + riCondition.Message = fmt.Sprintf("%s: %s", reconciliationIssuesFoundMessage, status.FailureSummary.Error()) } resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, riCondition) } else if oldRiCondition != nil { @@ -476,10 +476,10 @@ func setDesiredReleaseAcceptedCondition(cvStatus *configv1.ClusterVersionStatus, // failing. An error may indicate the update is failing or that if the error continues for a defined interval the // update is failing. func convertErrorToProgressing(history []configv1.UpdateHistory, now time.Time, status *SyncWorkerStatus) (reason string, message string, ok bool) { - if len(history) == 0 || status.Failure == nil || status.Reconciling { + if len(history) == 0 || status.FailureSummary == nil || status.Reconciling { return "", "", false } - uErr, ok := status.Failure.(*payload.UpdateError) + uErr, ok := status.FailureSummary.(*payload.UpdateError) if !ok { return "", "", false } diff --git a/pkg/cvo/status_test.go b/pkg/cvo/status_test.go index c98d8985db..073a86bb7d 100644 --- a/pkg/cvo/status_test.go +++ b/pkg/cvo/status_test.go @@ -270,7 +270,7 @@ func TestUpdateClusterVersionStatus_UnknownVersionAndReconciliationIssues(t *tes if tc.oldCondition != nil { cvStatus.Conditions = append(cvStatus.Conditions, *tc.oldCondition) } - updateClusterVersionStatus(&cvStatus, &SyncWorkerStatus{Failure: tc.failure}, release, getAvailableUpdates, gates, noErrors) + updateClusterVersionStatus(&cvStatus, &SyncWorkerStatus{FailureSummary: tc.failure}, release, getAvailableUpdates, gates, noErrors) condition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, reconciliationIssuesConditionType) if diff := cmp.Diff(tc.expectedRiCondition, condition, ignoreLastTransitionTime); diff != "" { t.Errorf("unexpected condition\n:%s", diff) @@ -306,7 +306,7 @@ func TestUpdateClusterVersionStatus_ReconciliationIssues(t *testing.T) { { name: "ReconciliationIssues present and unhappy when gate is enabled and failures happened", syncWorkerStatus: SyncWorkerStatus{ - Failure: fmt.Errorf("Something happened"), + FailureSummary: fmt.Errorf("Something happened"), }, enabled: true, expectedCondition: &configv1.ClusterOperatorStatusCondition{ @@ -319,7 +319,7 @@ func TestUpdateClusterVersionStatus_ReconciliationIssues(t *testing.T) { { name: "ReconciliationIssues not present when gate is enabled and failures happened", syncWorkerStatus: SyncWorkerStatus{ - Failure: fmt.Errorf("Something happened"), + FailureSummary: fmt.Errorf("Something happened"), }, enabled: false, expectedCondition: nil, diff --git a/pkg/cvo/sync_worker.go b/pkg/cvo/sync_worker.go index cfaeb3a318..72f00f13fe 100644 --- a/pkg/cvo/sync_worker.go +++ b/pkg/cvo/sync_worker.go @@ -16,6 +16,7 @@ import ( corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/util/errors" utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" @@ -100,7 +101,8 @@ type CapabilityStatus struct { type SyncWorkerStatus struct { Generation int64 - Failure error + FailureSummary error + Failures []error Done int Total int @@ -692,7 +694,7 @@ func (w *statusWrapper) Report(status SyncWorkerStatus) { if p.Total > 0 { previousFractionComplete = float32(p.Done) / float32(p.Total) } - if p.Failure != nil && status.Failure == nil { + if p.FailureSummary != nil && status.FailureSummary == nil { if p.Actual.Image == status.Actual.Image { if fractionComplete < previousFractionComplete { klog.V(2).Infof("Dropping status report from earlier in sync loop") @@ -700,7 +702,7 @@ func (w *statusWrapper) Report(status SyncWorkerStatus) { } } } - if fractionComplete > previousFractionComplete || status.Completed > p.Completed || (status.Failure == nil && status.Actual.Image != p.Actual.Image) { + if fractionComplete > previousFractionComplete || status.Completed > p.Completed || (status.FailureSummary == nil && status.Actual.Image != p.Actual.Image) { status.LastProgress = time.Now() } if status.Generation == 0 { @@ -839,7 +841,7 @@ func (w *SyncWorker) updateApplyStatus(update SyncWorkerStatus) { func (w *SyncWorker) updateLoadStatus(update SyncWorkerStatus) { // do not overwrite these status values which are not managed by load - update.Failure = w.status.Failure + update.FailureSummary = w.status.FailureSummary update.Done = w.status.Done update.Total = w.status.Total update.Completed = w.status.Completed @@ -1088,7 +1090,8 @@ func (r *consistentReporter) Errors(errs []error) error { copied.Done = r.done copied.Total = r.total if err != nil { - copied.Failure = err + copied.FailureSummary = err + copied.Failures = errs } r.reporter.Report(copied) return err @@ -1248,25 +1251,6 @@ func condenseClusterOperators(errs []error) []error { return condensed } -// uniqueStrings returns an array with all sequential identical items removed. It modifies the contents -// of arr. Sort the input array before calling to remove all duplicates. -func uniqueStrings(arr []string) []string { - var last int - for i := 1; i < len(arr); i++ { - if arr[i] == arr[last] { - continue - } - last++ - if last != i { - arr[last] = arr[i] - } - } - if last < len(arr) { - last++ - } - return arr[:last] -} - // newMultipleError reports a generic set of errors that block progress. This method expects multiple // errors but handles singular and empty arrays gracefully. If all errors have the same message, the // first item is returned. @@ -1277,19 +1261,19 @@ func newMultipleError(errs []error) error { if len(errs) == 1 { return errs[0] } - messages := make([]string, 0, len(errs)) + messages := sets.New[string]() for _, err := range errs { - messages = append(messages, err.Error()) + messages.Insert(err.Error()) } - sort.Strings(messages) - messages = uniqueStrings(messages) + if len(messages) == 0 { return errs[0] } + return &payload.UpdateError{ Nested: apierrors.NewAggregate(errs), Reason: "MultipleErrors", - Message: fmt.Sprintf("Multiple errors are preventing progress:\n* %s", strings.Join(messages, "\n* ")), + Message: fmt.Sprintf("Multiple errors are preventing progress:\n* %s", strings.Join(sets.List(messages), "\n* ")), } } @@ -1307,7 +1291,7 @@ func runThrottledStatusNotifier(ctx context.Context, interval time.Duration, buc return case next := <-ch: // only throttle if we aren't on an edge - if next.Generation == last.Generation && next.Actual.Image == last.Actual.Image && next.Reconciling == last.Reconciling && (next.Failure != nil) == (last.Failure != nil) { + if next.Generation == last.Generation && next.Actual.Image == last.Actual.Image && next.Reconciling == last.Reconciling && (next.FailureSummary != nil) == (last.FailureSummary != nil) { if err := throttle.Wait(ctx); err != nil && err != context.Canceled && err != context.DeadlineExceeded { utilruntime.HandleError(fmt.Errorf("unable to throttle status notification: %v", err)) } diff --git a/pkg/cvo/sync_worker_test.go b/pkg/cvo/sync_worker_test.go index 3da2ac7af7..7353563329 100644 --- a/pkg/cvo/sync_worker_test.go +++ b/pkg/cvo/sync_worker_test.go @@ -30,34 +30,34 @@ func Test_statusWrapper_ReportProgress(t *testing.T) { }{ { name: "skip updates that clear an error and are at an earlier fraction", - previous: SyncWorkerStatus{Failure: fmt.Errorf("a"), Actual: configv1.Release{Image: "testing"}, Done: 10, Total: 100}, + previous: SyncWorkerStatus{FailureSummary: fmt.Errorf("a"), Actual: configv1.Release{Image: "testing"}, Done: 10, Total: 100}, next: SyncWorkerStatus{Actual: configv1.Release{Image: "testing"}}, want: false, }, { - previous: SyncWorkerStatus{Failure: fmt.Errorf("a"), Actual: configv1.Release{Image: "testing"}, Done: 10, Total: 100}, + previous: SyncWorkerStatus{FailureSummary: fmt.Errorf("a"), Actual: configv1.Release{Image: "testing"}, Done: 10, Total: 100}, next: SyncWorkerStatus{Actual: configv1.Release{Image: "testing2"}}, want: true, wantProgress: true, }, { - previous: SyncWorkerStatus{Failure: fmt.Errorf("a"), Actual: configv1.Release{Image: "testing"}}, + previous: SyncWorkerStatus{FailureSummary: fmt.Errorf("a"), Actual: configv1.Release{Image: "testing"}}, next: SyncWorkerStatus{Actual: configv1.Release{Image: "testing"}}, want: true, }, { - previous: SyncWorkerStatus{Failure: fmt.Errorf("a"), Actual: configv1.Release{Image: "testing"}, Done: 10, Total: 100}, - next: SyncWorkerStatus{Failure: fmt.Errorf("a"), Actual: configv1.Release{Image: "testing"}}, + previous: SyncWorkerStatus{FailureSummary: fmt.Errorf("a"), Actual: configv1.Release{Image: "testing"}, Done: 10, Total: 100}, + next: SyncWorkerStatus{FailureSummary: fmt.Errorf("a"), Actual: configv1.Release{Image: "testing"}}, want: true, }, { - previous: SyncWorkerStatus{Failure: fmt.Errorf("a"), Actual: configv1.Release{Image: "testing"}, Done: 10, Total: 100}, - next: SyncWorkerStatus{Failure: fmt.Errorf("b"), Actual: configv1.Release{Image: "testing"}, Done: 10, Total: 100}, + previous: SyncWorkerStatus{FailureSummary: fmt.Errorf("a"), Actual: configv1.Release{Image: "testing"}, Done: 10, Total: 100}, + next: SyncWorkerStatus{FailureSummary: fmt.Errorf("b"), Actual: configv1.Release{Image: "testing"}, Done: 10, Total: 100}, want: true, }, { - previous: SyncWorkerStatus{Failure: fmt.Errorf("a"), Actual: configv1.Release{Image: "testing"}, Done: 10, Total: 100}, - next: SyncWorkerStatus{Failure: fmt.Errorf("b"), Actual: configv1.Release{Image: "testing"}, Done: 20, Total: 100}, + previous: SyncWorkerStatus{FailureSummary: fmt.Errorf("a"), Actual: configv1.Release{Image: "testing"}, Done: 10, Total: 100}, + next: SyncWorkerStatus{FailureSummary: fmt.Errorf("b"), Actual: configv1.Release{Image: "testing"}, Done: 20, Total: 100}, want: true, wantProgress: true, }, @@ -169,21 +169,21 @@ func Test_runThrottledStatusNotifier(t *testing.T) { t.Fatalf("should have not throttled") } - in <- SyncWorkerStatus{Failure: fmt.Errorf("a"), Reconciling: true, Actual: configv1.Release{Image: "test"}} + in <- SyncWorkerStatus{FailureSummary: fmt.Errorf("a"), Reconciling: true, Actual: configv1.Release{Image: "test"}} select { case <-out: case <-time.After(100 * time.Millisecond): t.Fatalf("should have not throttled") } - in <- SyncWorkerStatus{Failure: fmt.Errorf("a"), Reconciling: true, Actual: configv1.Release{Image: "test"}} + in <- SyncWorkerStatus{FailureSummary: fmt.Errorf("a"), Reconciling: true, Actual: configv1.Release{Image: "test"}} select { case <-out: case <-time.After(100 * time.Millisecond): t.Fatalf("should have not throttled") } - in <- SyncWorkerStatus{Failure: fmt.Errorf("a"), Reconciling: true, Actual: configv1.Release{Image: "test"}} + in <- SyncWorkerStatus{FailureSummary: fmt.Errorf("a"), Reconciling: true, Actual: configv1.Release{Image: "test"}} select { case <-out: t.Fatalf("should have throttled") From 7e000f0e16026c82cba95941dfd873b5a3869b15 Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Wed, 31 Jan 2024 18:26:45 +0100 Subject: [PATCH 2/2] status sync: propagate reconciliation errors via RRI condition Fulfilling OTA-1159, this change adds a JSON-serialized form of all reconciliation errors encountered during a single reconciliation loop run to the `Message` field of the `ReconciliationIssues`-type condition, present only when `ReconciliationIssuesCondition` feature gate is enabled (=in tech preview). --- pkg/cvo/reconciliation_issues.go | 71 ++++++++++++++++++++-- pkg/cvo/status.go | 6 +- pkg/cvo/status_test.go | 100 +++++++++++++++++++++++++++++-- 3 files changed, 167 insertions(+), 10 deletions(-) diff --git a/pkg/cvo/reconciliation_issues.go b/pkg/cvo/reconciliation_issues.go index 144cc619ef..dd07326693 100644 --- a/pkg/cvo/reconciliation_issues.go +++ b/pkg/cvo/reconciliation_issues.go @@ -1,13 +1,76 @@ package cvo -import v1 "github.com/openshift/api/config/v1" +import ( + "encoding/json" + "strings" + + configv1 "github.com/openshift/api/config/v1" + "github.com/openshift/cluster-version-operator/pkg/payload" +) const ( - reconciliationIssuesConditionType v1.ClusterStatusConditionType = "ReconciliationIssues" + reconciliationIssuesConditionType configv1.ClusterStatusConditionType = "ReconciliationIssues" noReconciliationIssuesReason string = "NoIssues" noReconciliationIssuesMessage string = "No issues found during reconciliation" - reconciliationIssuesFoundReason string = "IssuesFound" - reconciliationIssuesFoundMessage string = "Issues found during reconciliation" + reconciliationIssuesFoundReason string = "IssuesFound" ) + +type ReconciliationIssueUpdateError struct { + NestedMessage string `json:"nestedMessage,omitempty"` + Effect string `json:"effect,omitempty"` + Reason string `json:"reason,omitempty"` + PluralReason string `json:"pluralReason,omitempty"` + Message string `json:"message,omitempty"` + Name string `json:"name,omitempty"` + Names string `json:"names,omitempty"` + + ManifestFilename string `json:"manifestFilename,omitempty"` + ResourceGroup string `json:"manifestGroup,omitempty"` + ResourceVersion string `json:"resourceVersion,omitempty"` + ResourceKind string `json:"resourceKind,omitempty"` + ResourceSummary string `json:"resourceSummary,omitempty"` +} + +type ReconciliationIssueString struct { + Message string `json:"message"` +} + +type ReconciliationIssue struct { + UpdateError *ReconciliationIssueUpdateError `json:"updateError,omitempty"` + SimpleError *ReconciliationIssueString `json:"simpleError,omitempty"` +} + +func reconciliationIssuesFromErrors(errs []error) string { + var reconciliationIssues []ReconciliationIssue + for _, err := range errs { + if updateError, ok := err.(*payload.UpdateError); ok { + reconciliationIssues = append(reconciliationIssues, ReconciliationIssue{ + UpdateError: &ReconciliationIssueUpdateError{ + NestedMessage: updateError.Nested.Error(), + Effect: string(updateError.UpdateEffect), + Reason: updateError.Reason, + PluralReason: updateError.PluralReason, + Message: updateError.Message, + Name: updateError.Name, + Names: strings.Join(updateError.Names, ", "), + ManifestFilename: updateError.Task.Manifest.OriginalFilename, + ResourceGroup: updateError.Task.Manifest.GVK.Group, + ResourceVersion: updateError.Task.Manifest.GVK.Version, + ResourceKind: updateError.Task.Manifest.GVK.Kind, + }, + }) + } else { + reconciliationIssues = append(reconciliationIssues, ReconciliationIssue{ + SimpleError: &ReconciliationIssueString{ + Message: err.Error(), + }, + }) + } + } + if raw, err := json.Marshal(reconciliationIssues); err == nil { + return string(raw) + } + return "" +} diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index 92ad84af73..c9f0fe2a44 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -392,7 +392,11 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status if status.FailureSummary != nil { riCondition.Status = configv1.ConditionTrue riCondition.Reason = reconciliationIssuesFoundReason - riCondition.Message = fmt.Sprintf("%s: %s", reconciliationIssuesFoundMessage, status.FailureSummary.Error()) + if len(status.Failures) == 0 { // this should not happen with non-nil FailureSummary but lets be defensive + riCondition.Message = reconciliationIssuesFromErrors([]error{status.FailureSummary}) + } else { + riCondition.Message = reconciliationIssuesFromErrors(status.Failures) + } } resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, riCondition) } else if oldRiCondition != nil { diff --git a/pkg/cvo/status_test.go b/pkg/cvo/status_test.go index 073a86bb7d..ea066ac071 100644 --- a/pkg/cvo/status_test.go +++ b/pkg/cvo/status_test.go @@ -2,6 +2,7 @@ package cvo import ( "context" + "encoding/json" "fmt" "reflect" "testing" @@ -9,14 +10,17 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/util/diff" "k8s.io/apimachinery/pkg/util/validation/field" "k8s.io/client-go/tools/record" configv1 "github.com/openshift/api/config/v1" "github.com/openshift/client-go/config/clientset/versioned/fake" + "github.com/openshift/library-go/pkg/manifest" "github.com/openshift/cluster-version-operator/lib/resourcemerge" + "github.com/openshift/cluster-version-operator/pkg/payload" ) func Test_mergeEqualVersions(t *testing.T) { @@ -246,7 +250,7 @@ func TestUpdateClusterVersionStatus_UnknownVersionAndReconciliationIssues(t *tes Type: reconciliationIssuesConditionType, Status: configv1.ConditionTrue, Reason: reconciliationIssuesFoundReason, - Message: "Issues found during reconciliation: Something happened", + Message: `[{"simpleError":{"message":"Something happened"}}]`, }, }, { @@ -284,6 +288,46 @@ func TestUpdateClusterVersionStatus_UnknownVersionAndReconciliationIssues(t *tes func TestUpdateClusterVersionStatus_ReconciliationIssues(t *testing.T) { ignoreLastTransitionTime := cmpopts.IgnoreFields(configv1.ClusterOperatorStatusCondition{}, "LastTransitionTime") + resourceUpdateError := payload.UpdateError{ + Nested: fmt.Errorf("This is a nested error"), + UpdateEffect: payload.UpdateEffectReport, + Reason: "SomeReason", + PluralReason: "MultipleReasons", + Message: "Message about update error", + PluralMessageFormat: "We do not carry this to RRI", + Name: "No idea", + Names: []string{"one", "two"}, + Task: &payload.Task{ + Manifest: &manifest.Manifest{ + OriginalFilename: "0000_00_some-configmap.yaml", + GVK: schema.GroupVersionKind{Version: "v1", Kind: "ConfigMap"}, + }, + }, + } + + matchingReconciliationIssue := ReconciliationIssue{ + UpdateError: &ReconciliationIssueUpdateError{ + NestedMessage: "This is a nested error", + Effect: "Report", + Reason: "SomeReason", + PluralReason: "MultipleReasons", + Message: "Message about update error", + Name: "No idea", + Names: "one, two", + ManifestFilename: "0000_00_some-configmap.yaml", + ResourceGroup: "", + ResourceVersion: "v1", + ResourceKind: "ConfigMap", + ResourceSummary: "", + }, + } + + riAsJsonRaw, err := json.Marshal(matchingReconciliationIssue) + if err != nil { + t.Fatalf("Failed to marshal reconciliation issue: %v", err) + } + riAsJson := string(riAsJsonRaw) + testCases := []struct { name string syncWorkerStatus SyncWorkerStatus @@ -293,7 +337,7 @@ func TestUpdateClusterVersionStatus_ReconciliationIssues(t *testing.T) { expectedCondition *configv1.ClusterOperatorStatusCondition }{ { - name: "ReconciliationIssues present and happy when gate is enabled and no failures happened", + name: "gate enabled, no failures => condition present and happy", syncWorkerStatus: SyncWorkerStatus{}, enabled: true, expectedCondition: &configv1.ClusterOperatorStatusCondition{ @@ -304,7 +348,7 @@ func TestUpdateClusterVersionStatus_ReconciliationIssues(t *testing.T) { }, }, { - name: "ReconciliationIssues present and unhappy when gate is enabled and failures happened", + name: "gate enabled, failure summary present but failures empty => condition present and unhappy, backfilled from failure summary", syncWorkerStatus: SyncWorkerStatus{ FailureSummary: fmt.Errorf("Something happened"), }, @@ -313,13 +357,59 @@ func TestUpdateClusterVersionStatus_ReconciliationIssues(t *testing.T) { Type: reconciliationIssuesConditionType, Status: configv1.ConditionTrue, Reason: reconciliationIssuesFoundReason, - Message: "Issues found during reconciliation: Something happened", + Message: `[{"simpleError":{"message":"Something happened"}}]`, + }, + }, + { + name: "gate enabled, update failure => condition present and unhappy", + syncWorkerStatus: SyncWorkerStatus{ + FailureSummary: &resourceUpdateError, + Failures: []error{&resourceUpdateError}, + }, + enabled: true, + expectedCondition: &configv1.ClusterOperatorStatusCondition{ + Type: reconciliationIssuesConditionType, + Status: configv1.ConditionTrue, + Reason: reconciliationIssuesFoundReason, + Message: fmt.Sprintf("[%s]", string(riAsJson)), + }, + }, + { + name: "gate enabled, non-update error => condition present and unhappy", + syncWorkerStatus: SyncWorkerStatus{ + FailureSummary: fmt.Errorf("Something happened"), + Failures: []error{fmt.Errorf("Something happened")}, + }, + enabled: true, + expectedCondition: &configv1.ClusterOperatorStatusCondition{ + Type: reconciliationIssuesConditionType, + Status: configv1.ConditionTrue, + Reason: reconciliationIssuesFoundReason, + Message: `[{"simpleError":{"message":"Something happened"}}]`, + }, + }, + { + name: "gate enabled, update and non-update error => condition present and unhappy, contains both errors", + syncWorkerStatus: SyncWorkerStatus{ + FailureSummary: fmt.Errorf("Some kind of summary of both errors"), + Failures: []error{ + fmt.Errorf("Something happened"), + &resourceUpdateError, + }, + }, + enabled: true, + expectedCondition: &configv1.ClusterOperatorStatusCondition{ + Type: reconciliationIssuesConditionType, + Status: configv1.ConditionTrue, + Reason: reconciliationIssuesFoundReason, + Message: fmt.Sprintf(`[{"simpleError":{"message":"Something happened"}},%s]`, string(riAsJson)), }, }, { - name: "ReconciliationIssues not present when gate is enabled and failures happened", + name: "gate disabled, failure summary and failures present => condition not present", syncWorkerStatus: SyncWorkerStatus{ FailureSummary: fmt.Errorf("Something happened"), + Failures: []error{fmt.Errorf("Something happened")}, }, enabled: false, expectedCondition: nil,