From e9a4fa499542be8eb4d95238294af0c6a2ecf271 Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Thu, 26 Mar 2026 16:37:02 -0400 Subject: [PATCH 1/8] Add dd annotations and experiment phase as predicates --- .../controller/datadogagent_controller.go | 66 ++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/internal/controller/datadogagent_controller.go b/internal/controller/datadogagent_controller.go index be385e1a9e..8a10f64efb 100644 --- a/internal/controller/datadogagent_controller.go +++ b/internal/controller/datadogagent_controller.go @@ -8,6 +8,7 @@ package controller import ( "context" "reflect" + "strings" edsdatadoghqv1alpha1 "github.com/DataDog/extendeddaemonset/api/v1alpha1" "github.com/go-logr/logr" @@ -308,7 +309,7 @@ func (r *DatadogAgentReconciler) SetupWithManager(mgr ctrl.Manager, metricForwar } or := reconcile.AsReconciler[*v2alpha1.DatadogAgent](r.Client, r) - if err := builder.For(&v2alpha1.DatadogAgent{}, builderOptions...).WithEventFilter(predicate.Or(predicate.GenerationChangedPredicate{}, predicate.AnnotationChangedPredicate{})).Complete(or); err != nil { + if err := builder.For(&v2alpha1.DatadogAgent{}, builderOptions...).WithEventFilter(predicate.Or(predicate.GenerationChangedPredicate{}, datadogAnnotationChangedPredicate(), experimentPhaseChangedPredicate())).Complete(or); err != nil { return err } @@ -321,6 +322,69 @@ func (r *DatadogAgentReconciler) SetupWithManager(mgr ctrl.Manager, metricForwar return nil } +// datadogAnnotationChangedPredicate returns a predicate that triggers reconciliation +// only when annotations containing ".datadoghq.com/" change, ignoring unrelated annotation updates. +func datadogAnnotationChangedPredicate() predicate.Predicate { + hasDDAnnotation := func(annotations map[string]string) bool { + for k := range annotations { + if strings.Contains(k, ".datadoghq.com/") { + return true + } + } + return false + } + return predicate.Funcs{ + UpdateFunc: func(e event.UpdateEvent) bool { + oldAnnotations := e.ObjectOld.GetAnnotations() + newAnnotations := e.ObjectNew.GetAnnotations() + for k, v := range newAnnotations { + if strings.Contains(k, ".datadoghq.com/") && oldAnnotations[k] != v { + return true + } + } + for k := range oldAnnotations { + if strings.Contains(k, ".datadoghq.com/") { + if _, exists := newAnnotations[k]; !exists { + return true + } + } + } + return false + }, + CreateFunc: func(e event.CreateEvent) bool { return hasDDAnnotation(e.Object.GetAnnotations()) }, + DeleteFunc: func(e event.DeleteEvent) bool { return true }, + GenericFunc: func(e event.GenericEvent) bool { return hasDDAnnotation(e.Object.GetAnnotations()) }, + } +} + +// experimentPhaseChangedPredicate returns a predicate that triggers reconciliation +// when the experiment phase in the DDA status changes. This allows the operator to +// react to RC signals (e.g. phase=stopped) which are status-only changes that do not +// bump metadata.generation. +func experimentPhaseChangedPredicate() predicate.Predicate { + return predicate.Funcs{ + UpdateFunc: func(e event.UpdateEvent) bool { + oldDDA, ok1 := e.ObjectOld.(*v2alpha1.DatadogAgent) + newDDA, ok2 := e.ObjectNew.(*v2alpha1.DatadogAgent) + if !ok1 || !ok2 { + return false + } + oldPhase := "" + if oldDDA.Status.Experiment != nil { + oldPhase = string(oldDDA.Status.Experiment.Phase) + } + newPhase := "" + if newDDA.Status.Experiment != nil { + newPhase = string(newDDA.Status.Experiment.Phase) + } + return oldPhase != newPhase + }, + CreateFunc: func(e event.CreateEvent) bool { return false }, + DeleteFunc: func(e event.DeleteEvent) bool { return false }, + GenericFunc: func(e event.GenericEvent) bool { return false }, + } +} + func enqueueIfOwnedByDatadogAgent(ctx context.Context, obj client.Object) []reconcile.Request { labels := obj.GetLabels() From 5c6dedace5ccabede21ec2730bb576767704c122 Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Thu, 26 Mar 2026 16:37:38 -0400 Subject: [PATCH 2/8] Add experiment status --- api/datadoghq/v2alpha1/datadogagent_types.go | 41 +++++++++++++++++++ .../v2alpha1/zz_generated.deepcopy.go | 20 +++++++++ .../bases/v1/datadoghq.com_datadogagents.yaml | 30 ++++++++++++++ .../datadoghq.com_datadogagents_v2alpha1.json | 28 +++++++++++++ 4 files changed, 119 insertions(+) diff --git a/api/datadoghq/v2alpha1/datadogagent_types.go b/api/datadoghq/v2alpha1/datadogagent_types.go index caa0f99a9a..781b0ede2a 100644 --- a/api/datadoghq/v2alpha1/datadogagent_types.go +++ b/api/datadoghq/v2alpha1/datadogagent_types.go @@ -2349,6 +2349,43 @@ type RemoteConfigConfiguration struct { Features *DatadogFeatures `json:"features,omitempty"` } +// ExperimentPhase is the lifecycle phase of a Fleet Automation experiment. +// +kubebuilder:validation:Enum=running;stopped;rollback;timeout;promoted;aborted +type ExperimentPhase string + +const ( + // ExperimentPhaseRunning is set by RC when an experiment starts (startExperiment). + ExperimentPhaseRunning ExperimentPhase = "running" + // ExperimentPhaseStopped is set by RC to request a rollback (stopExperiment). + ExperimentPhaseStopped ExperimentPhase = "stopped" + // ExperimentPhaseRollback is set by the operator after processing a stopped signal and restoring the previous spec. + ExperimentPhaseRollback ExperimentPhase = "rollback" + // ExperimentPhaseTimeout is set by the operator when the experiment exceeds the timeout and is auto-rolled back. + ExperimentPhaseTimeout ExperimentPhase = "timeout" + // ExperimentPhasePromoted is set by RC when an experiment succeeds (promoteExperiment). + ExperimentPhasePromoted ExperimentPhase = "promoted" + // ExperimentPhaseAborted is set by the operator when a manual spec change is detected during a running experiment. + ExperimentPhaseAborted ExperimentPhase = "aborted" +) + +// ExperimentStatus defines the state of a Fleet Automation experiment. +// +k8s:openapi-gen=true +type ExperimentStatus struct { + // Phase is the current state of the experiment. + // +optional + Phase ExperimentPhase `json:"phase,omitempty"` + // ID is the unique experiment ID sent by Fleet Automation. + // +optional + ID string `json:"id,omitempty"` + // Generation is the DDA metadata.generation recorded when the experiment started. + // Used to detect manual spec changes while the experiment is running: if the + // current DDA generation differs from this value, the operator aborts the experiment. + // + // This value must be recorded after the DDA is patched for a startExperiment signal. + // +optional + Generation int64 `json:"generation,omitempty"` +} + // DatadogAgentStatus defines the observed state of DatadogAgent. // +k8s:openapi-gen=true type DatadogAgentStatus struct { @@ -2376,6 +2413,9 @@ type DatadogAgentStatus struct { // RemoteConfigConfiguration stores the configuration received from RemoteConfig. // +optional RemoteConfigConfiguration *RemoteConfigConfiguration `json:"remoteConfigConfiguration,omitempty"` + // Experiment tracks the state of an active or recent Fleet Automation experiment. + // +optional + Experiment *ExperimentStatus `json:"experiment,omitempty"` } // DatadogAgent defines Agent configuration, see reference https://github.com/DataDog/datadog-operator/blob/main/docs/configuration.v2alpha1.md @@ -2387,6 +2427,7 @@ type DatadogAgentStatus struct { // +kubebuilder:printcolumn:name="cluster-agent",type="string",JSONPath=".status.clusterAgent.status" // +kubebuilder:printcolumn:name="cluster-checks-runner",type="string",JSONPath=".status.clusterChecksRunner.status" // +kubebuilder:printcolumn:name="age",type="date",JSONPath=".metadata.creationTimestamp" +// +kubebuilder:printcolumn:name="experiment-phase",type="string",JSONPath=".status.experiment.phase",priority=1 // +k8s:openapi-gen=true // +genclient type DatadogAgent struct { diff --git a/api/datadoghq/v2alpha1/zz_generated.deepcopy.go b/api/datadoghq/v2alpha1/zz_generated.deepcopy.go index 4733c0d92d..3417c2bfe1 100644 --- a/api/datadoghq/v2alpha1/zz_generated.deepcopy.go +++ b/api/datadoghq/v2alpha1/zz_generated.deepcopy.go @@ -1361,6 +1361,26 @@ func (in *DatadogAgentStatus) DeepCopyInto(out *DatadogAgentStatus) { *out = new(RemoteConfigConfiguration) (*in).DeepCopyInto(*out) } + if in.Experiment != nil { + in, out := &in.Experiment, &out.Experiment + *out = new(ExperimentStatus) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ExperimentStatus) DeepCopyInto(out *ExperimentStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExperimentStatus. +func (in *ExperimentStatus) DeepCopy() *ExperimentStatus { + if in == nil { + return nil + } + out := new(ExperimentStatus) + in.DeepCopyInto(out) + return out } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DatadogAgentStatus. diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml index 0325733812..eb531bc093 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml +++ b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml @@ -29,6 +29,10 @@ spec: - jsonPath: .metadata.creationTimestamp name: age type: date + - jsonPath: .status.experiment.phase + name: experiment-phase + priority: 1 + type: string name: v2alpha1 schema: openAPIV3Schema: @@ -8474,6 +8478,32 @@ spec: x-kubernetes-list-map-keys: - type x-kubernetes-list-type: map + experiment: + description: Experiment tracks the state of an active or recent Fleet Automation experiment. + properties: + generation: + description: |- + Generation is the DDA metadata.generation recorded when the experiment started. + Used to detect manual spec changes while the experiment is running: if the + current DDA generation differs from this value, the operator aborts the experiment. + + This value must be recorded after the DDA is patched for a startExperiment signal. + format: int64 + type: integer + id: + description: ID is the unique experiment ID sent by Fleet Automation. + type: string + phase: + description: Phase is the current state of the experiment. + enum: + - running + - stopped + - rollback + - timeout + - promoted + - aborted + type: string + type: object otelAgentGateway: description: The actual state of the OTel Agent Gateway as a deployment. properties: diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json index a1e1934161..a64a9d5835 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json +++ b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json @@ -8175,6 +8175,34 @@ ], "x-kubernetes-list-type": "map" }, + "experiment": { + "additionalProperties": false, + "description": "Experiment tracks the state of an active or recent Fleet Automation experiment.", + "properties": { + "generation": { + "description": "Generation is the DDA metadata.generation recorded when the experiment started.\nUsed to detect manual spec changes while the experiment is running: if the\ncurrent DDA generation differs from this value, the operator aborts the experiment.\n\nThis value must be recorded after the DDA is patched for a startExperiment signal.", + "format": "int64", + "type": "integer" + }, + "id": { + "description": "ID is the unique experiment ID sent by Fleet Automation.", + "type": "string" + }, + "phase": { + "description": "Phase is the current state of the experiment.", + "enum": [ + "running", + "stopped", + "rollback", + "timeout", + "promoted", + "aborted" + ], + "type": "string" + } + }, + "type": "object" + }, "otelAgentGateway": { "additionalProperties": false, "description": "The actual state of the OTel Agent Gateway as a deployment.", From 7474895da36c5ed5159a259d2ada1ea33b904423 Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Thu, 26 Mar 2026 16:39:00 -0400 Subject: [PATCH 3/8] Pass revision list instead of entire object --- .../datadogagent/controller_reconcile_v2.go | 10 ++- internal/controller/datadogagent/revision.go | 38 ++++----- .../controller/datadogagent/revision_test.go | 85 +++++++++++-------- 3 files changed, 74 insertions(+), 59 deletions(-) diff --git a/internal/controller/datadogagent/controller_reconcile_v2.go b/internal/controller/datadogagent/controller_reconcile_v2.go index 1e59d6fe7d..a35e27afb6 100644 --- a/internal/controller/datadogagent/controller_reconcile_v2.go +++ b/internal/controller/datadogagent/controller_reconcile_v2.go @@ -71,11 +71,17 @@ func (r *Reconciler) reconcileInstanceV3(ctx context.Context, logger logr.Logger ddaStatusCopy := instance.Status.DeepCopy() newDDAStatus := generateNewStatusFromDDA(ddaStatusCopy) - // Manage ControllerRevision snapshots if r.options.CreateControllerRevisions { - if err := r.manageRevision(ctx, instance); err != nil { + revList, err := r.listRevisions(ctx, instance) + if err != nil { return r.updateStatusIfNeededV2(logger, instance, ddaStatusCopy, result, err, now) } + if err := r.manageExperiment(ctx, instance, newDDAStatus, now, revList); err != nil { + return r.updateStatusIfNeededV2(logger, instance, newDDAStatus, result, err, now) + } + if err := r.manageRevision(ctx, instance, revList); err != nil { + return r.updateStatusIfNeededV2(logger, instance, newDDAStatus, result, err, now) + } } // Manage dependencies diff --git a/internal/controller/datadogagent/revision.go b/internal/controller/datadogagent/revision.go index 436235b2af..6c245bc1eb 100644 --- a/internal/controller/datadogagent/revision.go +++ b/internal/controller/datadogagent/revision.go @@ -31,13 +31,9 @@ type revisionSnapshot struct { Annotations map[string]string `json:"annotations,omitempty"` } -// manageRevision creates or ensures a ControllerRevision snapshot of the current -// DDA spec and GCs old revisions beyond the current and previous. -func (r *Reconciler) manageRevision(ctx context.Context, instance *v2alpha1.DatadogAgent) error { - revList, err := r.listRevisions(ctx, instance) - if err != nil { - return err - } +// manageRevision creates a ControllerRevision snapshot of the current spec and +// garbage collects old revisions. Must be called after manageExperiment. +func (r *Reconciler) manageRevision(ctx context.Context, instance *v2alpha1.DatadogAgent, revList []appsv1.ControllerRevision) error { revName, err := r.ensureRevision(ctx, instance, revList) if err != nil { return err @@ -48,7 +44,7 @@ func (r *Reconciler) manageRevision(ctx context.Context, instance *v2alpha1.Data return nil } -func (r *Reconciler) listRevisions(ctx context.Context, instance *v2alpha1.DatadogAgent) (*appsv1.ControllerRevisionList, error) { +func (r *Reconciler) listRevisions(ctx context.Context, instance *v2alpha1.DatadogAgent) ([]appsv1.ControllerRevision, error) { revList := &appsv1.ControllerRevisionList{} if err := r.client.List(ctx, revList, client.InNamespace(instance.GetNamespace()), @@ -71,17 +67,17 @@ func (r *Reconciler) listRevisions(ctx context.Context, instance *v2alpha1.Datad } } revList.Items = owned - return revList, nil + return revList.Items, nil } // ensureRevision creates a ControllerRevision snapshot of the instance spec and -// annotations if it does not already exist. +// annotations if it does not already exist, and returns the revision name. // // The Revision field is a monotonic creation counter. func (r *Reconciler) ensureRevision( ctx context.Context, instance *v2alpha1.DatadogAgent, - revList *appsv1.ControllerRevisionList, + revList []appsv1.ControllerRevision, ) (string, error) { logger := ctrl.LoggerFrom(ctx) @@ -99,8 +95,8 @@ func (r *Reconciler) ensureRevision( // Find any existing revision with identical data, and track the max Revision. var matchingRev *appsv1.ControllerRevision maxRevision := int64(0) - for i := range revList.Items { - existing := &revList.Items[i] + for i := range revList { + existing := &revList[i] if bytes.Equal(existing.Data.Raw, specBytes) { matchingRev = existing } @@ -135,9 +131,9 @@ func (r *Reconciler) ensureRevision( rev := controllerrevisions.NewControllerRevision(instance, gvks[0], labels, data, nextRevision, nil) // Check for a name conflict before creating. - existingByName := make(map[string][]byte, len(revList.Items)) - for i := range revList.Items { - existingByName[revList.Items[i].Name] = revList.Items[i].Data.Raw + existingByName := make(map[string][]byte, len(revList)) + for i := range revList { + existingByName[revList[i].Name] = revList[i].Data.Raw } if existingData, nameUsed := existingByName[rev.Name]; nameUsed { if bytes.Equal(existingData, specBytes) { @@ -185,15 +181,15 @@ func (r *Reconciler) gcOldRevisions( ctx context.Context, instance *v2alpha1.DatadogAgent, current string, - revList *appsv1.ControllerRevisionList, + revList []appsv1.ControllerRevision, ) error { logger := ctrl.LoggerFrom(ctx) // Identify the most recent non-current revision to keep as previous. previous := "" previousRevision := int64(-1) - for i := range revList.Items { - rev := &revList.Items[i] + for i := range revList { + rev := &revList[i] if rev.Name == current { continue } @@ -203,8 +199,8 @@ func (r *Reconciler) gcOldRevisions( } } - for i := range revList.Items { - rev := &revList.Items[i] + for i := range revList { + rev := &revList[i] if rev.Name == current || rev.Name == previous { continue } diff --git a/internal/controller/datadogagent/revision_test.go b/internal/controller/datadogagent/revision_test.go index ae495cb4c3..7f95d7f8a9 100644 --- a/internal/controller/datadogagent/revision_test.go +++ b/internal/controller/datadogagent/revision_test.go @@ -52,23 +52,29 @@ func newRevisionTestReconciler(t *testing.T) (*Reconciler, client.Client) { return &Reconciler{client: c, scheme: scheme}, c } -func mustListRevisions(t *testing.T, r *Reconciler, instance *v2alpha1.DatadogAgent) *appsv1.ControllerRevisionList { +func mustListRevisions(t *testing.T, r *Reconciler, instance *v2alpha1.DatadogAgent) []appsv1.ControllerRevision { t.Helper() - revList, err := r.listRevisions(context.Background(), instance) + revisions, err := r.listRevisions(context.Background(), instance) require.NoError(t, err) - return revList + return revisions +} + +func fetchRevisionByName(t *testing.T, c client.Client, namespace, name string) *appsv1.ControllerRevision { + t.Helper() + rev := &appsv1.ControllerRevision{} + require.NoError(t, c.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: name}, rev)) + return rev } func TestEnsureRevision_CreatesOnFirstCall(t *testing.T) { r, c := newRevisionTestReconciler(t) instance := newRevisionTestOwner("test-dda", "default") - revName, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + name, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance)) require.NoError(t, err) - assert.NotEmpty(t, revName) + assert.NotEmpty(t, name) - rev := &appsv1.ControllerRevision{} - require.NoError(t, c.Get(context.Background(), types.NamespacedName{Namespace: "default", Name: revName}, rev)) + rev := fetchRevisionByName(t, c, "default", name) assert.NotEmpty(t, rev.Data.Raw) assert.Equal(t, int64(1), rev.Revision) assert.Len(t, rev.OwnerReferences, 1) @@ -200,8 +206,7 @@ func TestEnsureRevision_RevertBumpsRevision(t *testing.T) { require.NoError(t, err) assert.Equal(t, name1, name3, "revert should reuse same CR name") - rev := &appsv1.ControllerRevision{} - require.NoError(t, c.Get(context.Background(), types.NamespacedName{Namespace: "default", Name: name1}, rev)) + rev := fetchRevisionByName(t, c, "default", name1) assert.Equal(t, int64(3), rev.Revision, "revision should be bumped to max+1") } @@ -250,8 +255,7 @@ func TestEnsureRevision_RevisionNumbersMonotonic(t *testing.T) { } for i, name := range names { - rev := &appsv1.ControllerRevision{} - require.NoError(t, c.Get(context.Background(), types.NamespacedName{Namespace: "default", Name: name}, rev)) + rev := fetchRevisionByName(t, c, "default", name) assert.Equal(t, int64(i+1), rev.Revision, "revision %d should be %d", i, i+1) } } @@ -260,10 +264,10 @@ func TestGCOldRevisions_NoPreviousWhenOnlyCurrent(t *testing.T) { r, c := newRevisionTestReconciler(t) instance := newRevisionTestOwner("test-dda", "default") - name, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + revName, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance)) require.NoError(t, err) - err = r.gcOldRevisions(context.Background(), instance, name, mustListRevisions(t, r, instance)) + err = r.gcOldRevisions(context.Background(), instance, revName, mustListRevisions(t, r, instance)) require.NoError(t, err) revList := &appsv1.ControllerRevisionList{} @@ -308,30 +312,34 @@ func TestManageRevision_CreatesRevision(t *testing.T) { r, _ := newRevisionTestReconciler(t) instanceA := newRevisionTestOwner("test-dda", "default") - require.NoError(t, r.manageRevision(context.Background(), instanceA)) + err := r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + require.NoError(t, err) revList := mustListRevisions(t, r, instanceA) - assert.Len(t, revList.Items, 1, "one revision after first call") + assert.Len(t, revList, 1, "one revision after first call") // Change spec and call again — should now have current and previous. instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - require.NoError(t, r.manageRevision(context.Background(), instanceB)) + err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + require.NoError(t, err) revList = mustListRevisions(t, r, instanceB) - assert.Len(t, revList.Items, 2, "current and previous kept after spec change") + assert.Len(t, revList, 2, "current and previous kept after spec change") } func TestManageRevision_Idempotent(t *testing.T) { r, _ := newRevisionTestReconciler(t) instance := newRevisionTestOwner("test-dda", "default") - require.NoError(t, r.manageRevision(context.Background(), instance)) - require.NoError(t, r.manageRevision(context.Background(), instance)) + err := r.manageRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + require.NoError(t, err) + err = r.manageRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + require.NoError(t, err) revList := mustListRevisions(t, r, instance) - assert.Len(t, revList.Items, 1, "idempotent calls should not create duplicate revisions") + assert.Len(t, revList, 1, "idempotent calls should not create duplicate revisions") } // TestManageRevision_CurrentDeletedIsRecreated verifies that if the current @@ -341,24 +349,26 @@ func TestManageRevision_CurrentDeletedIsRecreated(t *testing.T) { r, c := newRevisionTestReconciler(t) instance := newRevisionTestOwner("test-dda", "default") - require.NoError(t, r.manageRevision(context.Background(), instance)) + err := r.manageRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + require.NoError(t, err) revList := mustListRevisions(t, r, instance) - require.Len(t, revList.Items, 1) - deletedName := revList.Items[0].Name + require.Len(t, revList, 1) + deletedName := revList[0].Name // Simulate manual deletion. - require.NoError(t, c.Delete(context.Background(), &revList.Items[0])) + require.NoError(t, c.Delete(context.Background(), &revList[0])) // Next reconcile should re-create the revision. - require.NoError(t, r.manageRevision(context.Background(), instance)) + err = r.manageRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + require.NoError(t, err) revList = mustListRevisions(t, r, instance) - require.Len(t, revList.Items, 1) + require.Len(t, revList, 1) // Content is identical so the name (hash-based) should be the same. - assert.Equal(t, deletedName, revList.Items[0].Name, "re-created revision should have the same name") - assert.Equal(t, int64(1), revList.Items[0].Revision) + assert.Equal(t, deletedName, revList[0].Name, "re-created revision should have the same name") + assert.Equal(t, int64(1), revList[0].Revision) } // TestManageRevision_PreviousDeletedContinuesNormally verifies that if the @@ -371,15 +381,17 @@ func TestManageRevision_PreviousDeletedContinuesNormally(t *testing.T) { instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - require.NoError(t, r.manageRevision(context.Background(), instanceA)) - require.NoError(t, r.manageRevision(context.Background(), instanceB)) + err := r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + require.NoError(t, err) + err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + require.NoError(t, err) revList := mustListRevisions(t, r, instanceB) - require.Len(t, revList.Items, 2) + require.Len(t, revList, 2) // Identify and delete the previous (lower Revision number). var prev appsv1.ControllerRevision - for _, rev := range revList.Items { + for _, rev := range revList { if prev.Name == "" || rev.Revision < prev.Revision { prev = rev } @@ -387,11 +399,12 @@ func TestManageRevision_PreviousDeletedContinuesNormally(t *testing.T) { require.NoError(t, c.Delete(context.Background(), &prev)) // Next reconcile on the same spec should succeed and keep only the current. - require.NoError(t, r.manageRevision(context.Background(), instanceB)) + err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + require.NoError(t, err) revList = mustListRevisions(t, r, instanceB) - assert.Len(t, revList.Items, 1, "only current should remain after previous was deleted") - assert.Equal(t, int64(2), revList.Items[0].Revision) + assert.Len(t, revList, 1, "only current should remain after previous was deleted") + assert.Equal(t, int64(2), revList[0].Revision) } // TestListRevisions_ExcludesForeignOwner verifies that a ControllerRevision @@ -427,5 +440,5 @@ func TestListRevisions_ExcludesForeignOwner(t *testing.T) { revList, err := r.listRevisions(context.Background(), current) require.NoError(t, err) - assert.Empty(t, revList.Items, "foreign revision should be excluded by UID filter") + assert.Empty(t, revList, "foreign revision should be excluded by UID filter") } From b6980bf253e59cd13b1c66e442488bd5f9e44b68 Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Thu, 26 Mar 2026 16:39:45 -0400 Subject: [PATCH 4/8] Add rollback functionality --- .../controller/datadogagent/controller.go | 2 + .../controller_experiment_integration_test.go | 251 ++++++++++++++++++ .../controller_reconcile_v2_common.go | 4 + .../controller_reconcile_v2_helpers.go | 1 + .../controller/datadogagent/experiment.go | 192 ++++++++++++++ .../datadogagent/experiment_test.go | 133 ++++++++++ 6 files changed, 583 insertions(+) create mode 100644 internal/controller/datadogagent/controller_experiment_integration_test.go create mode 100644 internal/controller/datadogagent/experiment.go create mode 100644 internal/controller/datadogagent/experiment_test.go diff --git a/internal/controller/datadogagent/controller.go b/internal/controller/datadogagent/controller.go index f3913ef127..65cd8b9287 100644 --- a/internal/controller/datadogagent/controller.go +++ b/internal/controller/datadogagent/controller.go @@ -75,6 +75,8 @@ type ReconcilerOptions struct { DatadogAgentProfileEnabled bool DatadogAgentInternalEnabled bool CreateControllerRevisions bool + // ExperimentTimeout overrides ExperimentDefaultTimeout. Zero means use the default. + ExperimentTimeout time.Duration } // Reconciler is the internal reconciler for Datadog Agent diff --git a/internal/controller/datadogagent/controller_experiment_integration_test.go b/internal/controller/datadogagent/controller_experiment_integration_test.go new file mode 100644 index 0000000000..2f5957a69b --- /dev/null +++ b/internal/controller/datadogagent/controller_experiment_integration_test.go @@ -0,0 +1,251 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package datadogagent + +// Integration tests for the experiment rollback flow wired through the full +// DDA reconcile path. These complement the unit tests in experiment_test.go. +// +// Coverage goals: +// - Stopped rollback: RC writing phase=stopped causes the operator to restore +// the previous spec and set phase=rollback. +// - Timeout rollback: an experiment running past ExperimentTimeout causes the +// operator to restore the previous spec and set phase=timeout. +// +// NOTE: rollback is idempotent — if the spec is already at the rollback target +// the Update is skipped. This means the status update in the same reconcile +// succeeds without a ResourceVersion conflict. In the first rollback reconcile +// the spec update bumps ResourceVersion and the status update conflicts; the +// second reconcile (fresh fetch) finds the spec already correct, skips the +// Update, and the status update succeeds. Tests therefore run two reconciles +// after triggering rollback. + +import ( + "context" + "testing" + "time" + + assert "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + + v2alpha1 "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" + apiutils "github.com/DataDog/datadog-operator/api/utils" +) + +// newExperimentIntegrationReconciler builds a revision reconciler with an +// overridden ExperimentTimeout for testing. +func newExperimentIntegrationReconciler(t *testing.T, timeout time.Duration) *Reconciler { + t.Helper() + r, _ := newRevisionIntegrationReconciler(t) + r.options.ExperimentTimeout = timeout + return r +} + +// reconcileN re-fetches the DDA and calls Reconcile n times in sequence. +func reconcileN(t *testing.T, r *Reconciler, ns, name string, n int) { + t.Helper() + nsName := types.NamespacedName{Namespace: ns, Name: name} + for i := 0; i < n; i++ { + var dda v2alpha1.DatadogAgent + assert.NoError(t, r.client.Get(context.TODO(), nsName, &dda)) + _, err := r.Reconcile(context.TODO(), &dda) + assert.NoError(t, err) + } +} + +// Test_Experiment_StoppedRollback verifies that when RC writes phase=stopped, +// the operator restores the previous spec and sets phase=rollback. +func Test_Experiment_StoppedRollback(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + // Rev1: initial spec. + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Rev2: RC applies experiment spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + assert.Len(t, listOwnedRevisions(t, r.client, ns, uid), 2) + + // RC writes phase=stopped. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseStopped, + Generation: dda.Generation, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // First reconcile: rollback triggered (spec restored, status update may conflict). + // Second reconcile: spec already correct, status update succeeds. + reconcileN(t, r, ns, name, 2) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + // The snapshot is taken from the defaulted spec (defaults.DefaultDatadogAgentSpec applies + // site="datadoghq.com" before snapshotting), so rollback restores that value. + assert.NotNil(t, dda.Spec.Global.Site, "spec should be restored to pre-experiment state") + assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site, "spec should be restored to pre-experiment state") + assert.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, dda.Status.Experiment.Phase) +} + +// Test_Experiment_TimeoutRollback verifies that an experiment running past +// ExperimentTimeout causes the operator to restore the previous spec and set +// phase=timeout. +func Test_Experiment_TimeoutRollback(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + const timeout = 50 * time.Millisecond + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, timeout) + + // Rev1: initial spec. + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Rev2: RC applies experiment spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + assert.Len(t, listOwnedRevisions(t, r.client, ns, uid), 2) + + // RC writes phase=running. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + Generation: dda.Generation, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // Wait for the timeout to elapse. + time.Sleep(2 * timeout) + + // Reconcile 1: timeout detected → spec restored; status write may conflict. + // Reconcile 2: idempotent rollback → status write succeeds. + reconcileN(t, r, ns, name, 2) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + // The snapshot is taken from the defaulted spec, so rollback restores site="datadoghq.com". + assert.NotNil(t, dda.Spec.Global.Site, "spec should be restored after timeout") + assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site, "spec should be restored after timeout") + assert.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, dda.Status.Experiment.Phase) +} + +// Test_Experiment_AbortOnManualChange verifies that a spec change while an +// experiment is running sets phase=aborted and does not trigger rollback. +func Test_Experiment_AbortOnManualChange(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + // Rev1: initial spec. + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Rev2: RC applies experiment spec; RC signals running. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + // The fake client does not auto-increment Generation on Update, so revisions + // end up with a zero CreationTimestamp (fake-client limitation). Patch them to + // a recent time so the timeout path in handleRollback is not accidentally + // triggered before the abort check runs. + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + rev.CreationTimestamp = metav1.Now() + assert.NoError(t, r.client.Update(context.TODO(), &rev)) + } + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + // The fake client never increments Generation, so dda.Generation is always 0. + // Set experiment.Generation to a non-zero sentinel so the abortExperiment + // generation-mismatch check (instance.Generation != experiment.Generation) fires. + const experimentGen = int64(2) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + Generation: experimentGen, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // User manually changes the spec — in a real cluster this bumps Generation past + // experimentGen. In the fake client Generation stays at 0, which differs from + // the experimentGen sentinel above, so the mismatch is already in place. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.com") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + + reconcileN(t, r, ns, name, 1) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + // Spec should be the user's manual change, not rolled back. + assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site) + assert.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseAborted, dda.Status.Experiment.Phase) +} + +// mustGetExperimentPhase fetches the DDA and returns the experiment phase, or +// empty string if no experiment is set. Helper for readability in assertions. +func mustGetExperimentPhase(t *testing.T, r *Reconciler, ns, name string) v2alpha1.ExperimentPhase { + t.Helper() + var dda v2alpha1.DatadogAgent + assert.NoError(t, r.client.Get(context.TODO(), types.NamespacedName{Namespace: ns, Name: name}, &dda)) + if dda.Status.Experiment == nil { + return "" + } + return dda.Status.Experiment.Phase +} + +// Test_Experiment_AbortDoesNotRollback verifies that phase=aborted is a +// terminal state and does not trigger a spec restore on subsequent reconciles. +func Test_Experiment_AbortDoesNotRollback(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Apply experiment spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + // Manually force phase=aborted (as if abort already happened). + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseAborted, + Generation: dda.Generation, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + reconcileN(t, r, ns, name, 1) + + // Spec should be unchanged (experiment spec), phase still aborted. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + assert.Equal(t, "datadoghq.eu", *dda.Spec.Global.Site, "aborted experiment must not trigger rollback") + assert.Equal(t, v2alpha1.ExperimentPhaseAborted, mustGetExperimentPhase(t, r, ns, name)) + + // Also verify the revision timestamp is not used as a proxy for time.Now() comparison. + _ = metav1.Now() +} diff --git a/internal/controller/datadogagent/controller_reconcile_v2_common.go b/internal/controller/datadogagent/controller_reconcile_v2_common.go index ef70322ea0..89deeee536 100644 --- a/internal/controller/datadogagent/controller_reconcile_v2_common.go +++ b/internal/controller/datadogagent/controller_reconcile_v2_common.go @@ -826,5 +826,9 @@ func IsEqualStatus(current *v2alpha1.DatadogAgentStatus, newStatus *v2alpha1.Dat return false } + if !apiequality.Semantic.DeepEqual(current.Experiment, newStatus.Experiment) { + return false + } + return condition.IsEqualConditions(current.Conditions, newStatus.Conditions) } diff --git a/internal/controller/datadogagent/controller_reconcile_v2_helpers.go b/internal/controller/datadogagent/controller_reconcile_v2_helpers.go index cdbf084c33..ef0a2b209c 100644 --- a/internal/controller/datadogagent/controller_reconcile_v2_helpers.go +++ b/internal/controller/datadogagent/controller_reconcile_v2_helpers.go @@ -259,6 +259,7 @@ func generateNewStatusFromDDA(ddaStatus *datadoghqv2alpha1.DatadogAgentStatus) * if ddaStatus.RemoteConfigConfiguration != nil { status.RemoteConfigConfiguration = ddaStatus.RemoteConfigConfiguration } + status.Experiment = ddaStatus.Experiment.DeepCopy() } return status } diff --git a/internal/controller/datadogagent/experiment.go b/internal/controller/datadogagent/experiment.go new file mode 100644 index 0000000000..e541333168 --- /dev/null +++ b/internal/controller/datadogagent/experiment.go @@ -0,0 +1,192 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package datadogagent + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "time" + + appsv1 "k8s.io/api/apps/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + + v2alpha1 "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" +) + +// ExperimentDefaultTimeout is the duration after which a running experiment is automatically rolled back. +const ExperimentDefaultTimeout = 15 * time.Minute + +// manageExperiment handles all experiment state transitions for a reconcile cycle. +// Must be called before manageRevision. +func (r *Reconciler) manageExperiment( + ctx context.Context, + instance *v2alpha1.DatadogAgent, + newStatus *v2alpha1.DatadogAgentStatus, + now metav1.Time, + revList []appsv1.ControllerRevision, +) error { + experiment := instance.Status.Experiment + if experiment == nil { + return nil + } + + ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("experimentID", experiment.ID)) + + if err := r.handleRollback(ctx, instance, newStatus, now, revList); err != nil { + return err + } + abortExperiment(ctx, instance.Generation, experiment, newStatus) + return nil +} + +// abortExperiment marks the experiment as aborted in newStatus if a manual spec +// change is detected (DDA generation differs from the recorded experiment generation). +func abortExperiment( + ctx context.Context, + instanceGeneration int64, + experiment *v2alpha1.ExperimentStatus, + newStatus *v2alpha1.DatadogAgentStatus, +) { + if experiment.Phase != v2alpha1.ExperimentPhaseRunning { + return + } + if experiment.Generation == 0 || instanceGeneration == experiment.Generation { + return + } + ctrl.LoggerFrom(ctx).Info("Aborting experiment due to manual spec change") + newStatus.Experiment.Phase = v2alpha1.ExperimentPhaseAborted +} + +// handleRollback checks if the experiment needs rollback (explicit stop or timeout). +func (r *Reconciler) handleRollback( + ctx context.Context, + instance *v2alpha1.DatadogAgent, + newStatus *v2alpha1.DatadogAgentStatus, + now metav1.Time, + revisions []appsv1.ControllerRevision, +) error { + logger := ctrl.LoggerFrom(ctx) + if instance.Status.Experiment == nil { + return nil + } + + phase := instance.Status.Experiment.Phase + + switch { + // stopped signal from RC: restore DDA spec and ack by setting phase to rollback. + case phase == v2alpha1.ExperimentPhaseStopped: + logger.Info("Experiment stopped, rolling back") + return r.restorePreviousSpec(ctx, instance.ObjectMeta, newStatus, revisions, v2alpha1.ExperimentPhaseRollback) + case phase == v2alpha1.ExperimentPhaseRunning: + rev := findMostRecentRevision(revisions) + if rev != nil { + elapsed := now.Sub(rev.CreationTimestamp.Time) + if elapsed >= getExperimentTimeout(r.options.ExperimentTimeout) { + logger.Info("Experiment timed out, rolling back", "elapsed", elapsed.String()) + return r.restorePreviousSpec(ctx, instance.ObjectMeta, newStatus, revisions, v2alpha1.ExperimentPhaseTimeout) + } + } + } + + return nil +} + +// restorePreviousSpec sets the terminal experiment phase and restores the DDA spec +// from the previous ControllerRevision. +func (r *Reconciler) restorePreviousSpec( + ctx context.Context, + instanceMeta metav1.ObjectMeta, + newStatus *v2alpha1.DatadogAgentStatus, + revisions []appsv1.ControllerRevision, + terminalPhase v2alpha1.ExperimentPhase, +) error { + newStatus.Experiment.Phase = terminalPhase + return r.rollback(ctx, instanceMeta, findRollbackTarget(revisions)) +} + +// rollback restores the DDA spec from the named ControllerRevision. +func (r *Reconciler) rollback( + ctx context.Context, + instanceMeta metav1.ObjectMeta, + rollbackTarget string, +) error { + if rollbackTarget == "" { + return fmt.Errorf("no previous revision to roll back to") + } + + cr := &appsv1.ControllerRevision{} + if err := r.client.Get(ctx, types.NamespacedName{Namespace: instanceMeta.Namespace, Name: rollbackTarget}, cr); err != nil { + return fmt.Errorf("failed to get previous ControllerRevision %s: %w", rollbackTarget, err) + } + + var snapshot revisionSnapshot + if err := json.Unmarshal(cr.Data.Raw, &snapshot); err != nil { + return fmt.Errorf("failed to decode ControllerRevision data: %w", err) + } + + // Re-fetch for the latest ResourceVersion and to check whether the spec is + // rolled back already. If it is, skip the update. + current := &v2alpha1.DatadogAgent{} + if err := r.client.Get(ctx, types.NamespacedName{Namespace: instanceMeta.Namespace, Name: instanceMeta.Name}, current); err != nil { + return fmt.Errorf("failed to get current DDA for rollback: %w", err) + } + currentSnap, err := json.Marshal(revisionSnapshot{Spec: current.Spec, Annotations: datadogAnnotations(current.GetAnnotations())}) + if err != nil { + return fmt.Errorf("failed to marshal current snapshot for comparison: %w", err) + } + if bytes.Equal(currentSnap, cr.Data.Raw) { + ctrl.LoggerFrom(ctx).Info("Rollback spec already matches target, skipping update", "rollbackTarget", rollbackTarget) + return nil + } + + toUpdate := &v2alpha1.DatadogAgent{ + ObjectMeta: current.ObjectMeta, + Spec: snapshot.Spec, + } + toUpdate.Annotations = snapshot.Annotations + return r.client.Update(ctx, toUpdate) +} + +// findRollbackTarget returns the name of the previous ControllerRevision to restore. +// GC keeps at most two revisions (current and previous), so this returns whichever +// revision has the lower revision number. +func findRollbackTarget(revisions []appsv1.ControllerRevision) string { + var curRev, prevRev int64 = -1, -1 + var curName, prevName string + for i := range revisions { + rev := &revisions[i] + if rev.Revision > curRev { + prevRev, prevName = curRev, curName + curRev, curName = rev.Revision, rev.Name + } else if rev.Revision > prevRev { + prevRev, prevName = rev.Revision, rev.Name + } + } + return prevName +} + +// findMostRecentRevision returns the ControllerRevision with the highest Revision number, +// or nil if the list is empty. Used to determine the experiment start time for timeout checks. +func findMostRecentRevision(revisions []appsv1.ControllerRevision) *appsv1.ControllerRevision { + var result *appsv1.ControllerRevision + for i := range revisions { + if result == nil || revisions[i].Revision > result.Revision { + result = &revisions[i] + } + } + return result +} + +func getExperimentTimeout(timeout time.Duration) time.Duration { + if timeout == 0 { + return ExperimentDefaultTimeout + } + return timeout +} diff --git a/internal/controller/datadogagent/experiment_test.go b/internal/controller/datadogagent/experiment_test.go new file mode 100644 index 0000000000..eccd0b458e --- /dev/null +++ b/internal/controller/datadogagent/experiment_test.go @@ -0,0 +1,133 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package datadogagent + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + v2alpha1 "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" +) + +func TestManageExperiment_AbortsOnManualChange(t *testing.T) { + r, _ := newRevisionTestReconciler(t) + + instance := newRevisionTestOwner("test-dda", "default") + instance.Generation = 3 + instance.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + instance.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + Generation: 2, + } + + status := &v2alpha1.DatadogAgentStatus{ + Experiment: instance.Status.Experiment.DeepCopy(), + } + + err := r.manageExperiment(context.Background(), instance, status, metav1.Now(), mustListRevisions(t, r, instance)) + require.NoError(t, err) + require.NotNil(t, status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseAborted, status.Experiment.Phase) +} + +func TestRollback_RestoresSpec(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Create a revision for specA. + instanceA := newRevisionTestOwner("test-dda", "default") + err := r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + require.NoError(t, err) + + revListA := mustListRevisions(t, r, instanceA) + require.Len(t, revListA, 1) + prevRevision := revListA[0].Name + + // Create a second revision for specB. + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + require.NoError(t, err) + + // rollback fetches the current DDA to compare specs; it must exist in the fake client. + require.NoError(t, c.Create(context.Background(), instanceB)) + + // Rollback from instanceB to prevRevision (specA). + require.NoError(t, r.rollback(context.Background(), instanceB.ObjectMeta, prevRevision)) +} + +func TestRollback_NoPreviousRevision(t *testing.T) { + r, _ := newRevisionTestReconciler(t) + instance := newRevisionTestOwner("test-dda", "default") + + err := r.rollback(context.Background(), instance.ObjectMeta, "") + require.Error(t, err) + assert.Contains(t, err.Error(), "no previous revision") +} + +func TestHandleRollback_StoppedPhase(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Create two revisions so we have a previous to roll back to. + instanceA := newRevisionTestOwner("test-dda", "default") + err := r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + require.NoError(t, err) + + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + require.NoError(t, err) + + instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseStopped, + } + + // rollback fetches the current DDA to compare specs; it must exist in the fake client. + require.NoError(t, c.Create(context.Background(), instanceB)) + + revList := mustListRevisions(t, r, instanceB) + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceB.Status.Experiment.DeepCopy()} + require.NoError(t, r.handleRollback(context.Background(), instanceB, newStatus, metav1.Now(), revList)) + require.NotNil(t, newStatus.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, newStatus.Experiment.Phase, "should ack stopped by setting phase to rollback") +} + +func TestHandleRollback_Timeout(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Create two revisions so rollback has a target. + instanceA := newRevisionTestOwner("test-dda", "default") + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA))) + + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB))) + + // rollback fetches the current DDA to compare specs; it must exist in the fake client. + require.NoError(t, c.Create(context.Background(), instanceB)) + + instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + } + + // Simulate the most recent revision having a creation timestamp past the timeout threshold + // by modifying the in-memory revList before passing it to handleRollback. + revList := mustListRevisions(t, r, instanceB) + for i := range revList { + if revList[i].Revision == 2 { + revList[i].CreationTimestamp = metav1.NewTime(time.Now().Add(-ExperimentDefaultTimeout - time.Minute)) + } + } + + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceB.Status.Experiment.DeepCopy()} + require.NoError(t, r.handleRollback(context.Background(), instanceB, newStatus, metav1.Now(), revList)) + require.NotNil(t, newStatus.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, newStatus.Experiment.Phase) +} From d358c9c2bd15c0f77d9f3ae4b5c4e67d25d34514 Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Thu, 26 Mar 2026 17:04:46 -0400 Subject: [PATCH 5/8] make generate manifests --- .../v2alpha1/zz_generated.deepcopy.go | 32 +++++++------- .../v2alpha1/zz_generated.openapi.go | 43 ++++++++++++++++++- 2 files changed, 58 insertions(+), 17 deletions(-) diff --git a/api/datadoghq/v2alpha1/zz_generated.deepcopy.go b/api/datadoghq/v2alpha1/zz_generated.deepcopy.go index 3417c2bfe1..fee99ea239 100644 --- a/api/datadoghq/v2alpha1/zz_generated.deepcopy.go +++ b/api/datadoghq/v2alpha1/zz_generated.deepcopy.go @@ -1364,23 +1364,8 @@ func (in *DatadogAgentStatus) DeepCopyInto(out *DatadogAgentStatus) { if in.Experiment != nil { in, out := &in.Experiment, &out.Experiment *out = new(ExperimentStatus) - (*in).DeepCopyInto(*out) - } -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ExperimentStatus) DeepCopyInto(out *ExperimentStatus) { - *out = *in -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExperimentStatus. -func (in *ExperimentStatus) DeepCopy() *ExperimentStatus { - if in == nil { - return nil + **out = **in } - out := new(ExperimentStatus) - in.DeepCopyInto(out) - return out } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DatadogAgentStatus. @@ -1784,6 +1769,21 @@ func (in *EventTypes) DeepCopy() *EventTypes { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ExperimentStatus) DeepCopyInto(out *ExperimentStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExperimentStatus. +func (in *ExperimentStatus) DeepCopy() *ExperimentStatus { + if in == nil { + return nil + } + out := new(ExperimentStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ExternalMetricsServerFeatureConfig) DeepCopyInto(out *ExternalMetricsServerFeatureConfig) { *out = *in diff --git a/api/datadoghq/v2alpha1/zz_generated.openapi.go b/api/datadoghq/v2alpha1/zz_generated.openapi.go index d1672fbc04..e35c857471 100644 --- a/api/datadoghq/v2alpha1/zz_generated.openapi.go +++ b/api/datadoghq/v2alpha1/zz_generated.openapi.go @@ -34,6 +34,7 @@ func GetOpenAPIDefinitions(ref common.ReferenceCallback) map[string]common.OpenA "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DogstatsdFeatureConfig": schema_datadog_operator_api_datadoghq_v2alpha1_DogstatsdFeatureConfig(ref), "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ErrorTrackingStandalone": schema_datadog_operator_api_datadoghq_v2alpha1_ErrorTrackingStandalone(ref), "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EventCollectionFeatureConfig": schema_datadog_operator_api_datadoghq_v2alpha1_EventCollectionFeatureConfig(ref), + "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ExperimentStatus": schema_datadog_operator_api_datadoghq_v2alpha1_ExperimentStatus(ref), "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.FIPSConfig": schema_datadog_operator_api_datadoghq_v2alpha1_FIPSConfig(ref), "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.HelmCheckFeatureConfig": schema_datadog_operator_api_datadoghq_v2alpha1_HelmCheckFeatureConfig(ref), "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.KubeStateMetricsCoreFeatureConfig": schema_datadog_operator_api_datadoghq_v2alpha1_KubeStateMetricsCoreFeatureConfig(ref), @@ -645,11 +646,17 @@ func schema_datadog_operator_api_datadoghq_v2alpha1_DatadogAgentStatus(ref commo Ref: ref("github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.RemoteConfigConfiguration"), }, }, + "experiment": { + SchemaProps: spec.SchemaProps{ + Description: "Experiment tracks the state of an active or recent Fleet Automation experiment.", + Ref: ref("github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ExperimentStatus"), + }, + }, }, }, }, Dependencies: []string{ - "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DaemonSetStatus", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DeploymentStatus", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.RemoteConfigConfiguration", "k8s.io/apimachinery/pkg/apis/meta/v1.Condition"}, + "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DaemonSetStatus", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DeploymentStatus", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ExperimentStatus", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.RemoteConfigConfiguration", "k8s.io/apimachinery/pkg/apis/meta/v1.Condition"}, } } @@ -1114,6 +1121,40 @@ func schema_datadog_operator_api_datadoghq_v2alpha1_EventCollectionFeatureConfig } } +func schema_datadog_operator_api_datadoghq_v2alpha1_ExperimentStatus(ref common.ReferenceCallback) common.OpenAPIDefinition { + return common.OpenAPIDefinition{ + Schema: spec.Schema{ + SchemaProps: spec.SchemaProps{ + Description: "ExperimentStatus defines the state of a Fleet Automation experiment.", + Type: []string{"object"}, + Properties: map[string]spec.Schema{ + "phase": { + SchemaProps: spec.SchemaProps{ + Description: "Phase is the current state of the experiment.", + Type: []string{"string"}, + Format: "", + }, + }, + "id": { + SchemaProps: spec.SchemaProps{ + Description: "ID is the unique experiment ID sent by Fleet Automation.", + Type: []string{"string"}, + Format: "", + }, + }, + "generation": { + SchemaProps: spec.SchemaProps{ + Description: "Generation is the DDA metadata.generation recorded when the experiment started. Used to detect manual spec changes while the experiment is running: if the current DDA generation differs from this value, the operator aborts the experiment.\n\nThis value must be recorded after the DDA is patched for a startExperiment signal.", + Type: []string{"integer"}, + Format: "int64", + }, + }, + }, + }, + }, + } +} + func schema_datadog_operator_api_datadoghq_v2alpha1_FIPSConfig(ref common.ReferenceCallback) common.OpenAPIDefinition { return common.OpenAPIDefinition{ Schema: spec.Schema{ From 7814314275cb7ad6c971e8358f6aeadd06cb6674 Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Tue, 31 Mar 2026 17:37:59 -0400 Subject: [PATCH 6/8] Address review suggestions --- .../controller_experiment_integration_test.go | 164 +++++++++++++++++ .../datadogagent/controller_reconcile_v2.go | 2 +- .../controller/datadogagent/experiment.go | 53 ++++-- .../datadogagent/experiment_test.go | 170 +++++++++++++++++- internal/controller/datadogagent/revision.go | 27 ++- .../controller/datadogagent/revision_test.go | 54 +++--- 6 files changed, 422 insertions(+), 48 deletions(-) diff --git a/internal/controller/datadogagent/controller_experiment_integration_test.go b/internal/controller/datadogagent/controller_experiment_integration_test.go index 2f5957a69b..0679a4fd64 100644 --- a/internal/controller/datadogagent/controller_experiment_integration_test.go +++ b/internal/controller/datadogagent/controller_experiment_integration_test.go @@ -212,6 +212,170 @@ func mustGetExperimentPhase(t *testing.T, r *Reconciler, ns, name string) v2alph return dda.Status.Experiment.Phase } +// Test_Experiment_TimeoutPhase_IsStable verifies that once phase=timeout is +// persisted, further reconciles do not change the spec or phase. +func Test_Experiment_TimeoutPhase_IsStable(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + const timeout = 50 * time.Millisecond + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, timeout) + + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + Generation: dda.Generation, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + time.Sleep(2 * timeout) + reconcileN(t, r, ns, name, 2) + + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, mustGetExperimentPhase(t, r, ns, name)) + + // Extra reconciles must not change phase or spec. + reconcileN(t, r, ns, name, 3) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site) + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, mustGetExperimentPhase(t, r, ns, name)) +} + +// Test_Experiment_RollbackPhase_IsStable verifies that once phase=rollback is +// persisted, further reconciles do not change the spec or phase. +func Test_Experiment_RollbackPhase_IsStable(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseStopped, + Generation: dda.Generation, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 2) + + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, mustGetExperimentPhase(t, r, ns, name)) + + // Extra reconciles must not change phase or spec. + reconcileN(t, r, ns, name, 3) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, mustGetExperimentPhase(t, r, ns, name)) +} + +// Test_Experiment_RunningAfterTimeout_StaleGeneration verifies that if RC +// writes phase=running after a timeout rollback has completed, the operator +// fires timeout again idempotently: the pre-experiment revision is old enough +// to exceed the timeout threshold, rollback is a no-op (spec already correct), +// and phase=timeout is written again. +func Test_Experiment_RunningAfterTimeout_StaleGeneration(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + const timeout = 50 * time.Millisecond + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, timeout) + + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + Generation: dda.Generation, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + time.Sleep(2 * timeout) + reconcileN(t, r, ns, name, 2) + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, mustGetExperimentPhase(t, r, ns, name)) + + // RC writes phase=running again with the old (pre-rollback) generation — stale. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + const staleGen = int64(99) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + Generation: staleGen, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + reconcileN(t, r, ns, name, 1) + + // The pre-experiment revision is old enough that timeout fires idempotently — + // rollback is a no-op (spec already correct) and phase=timeout is written again. + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, mustGetExperimentPhase(t, r, ns, name)) +} + +// Test_Experiment_StoppedAfterRollback verifies that if RC writes phase=stopped +// after a rollback has already completed, the idempotent rollback path handles +// it cleanly (spec unchanged, phase set to rollback again). +func Test_Experiment_StoppedAfterRollback(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseStopped, + Generation: dda.Generation, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 2) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, mustGetExperimentPhase(t, r, ns, name)) + + // RC writes phase=stopped again. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment.Phase = v2alpha1.ExperimentPhaseStopped + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + reconcileN(t, r, ns, name, 2) + + // Spec should still be the rolled-back spec; phase=rollback again. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, mustGetExperimentPhase(t, r, ns, name)) +} + // Test_Experiment_AbortDoesNotRollback verifies that phase=aborted is a // terminal state and does not trigger a spec restore on subsequent reconciles. func Test_Experiment_AbortDoesNotRollback(t *testing.T) { diff --git a/internal/controller/datadogagent/controller_reconcile_v2.go b/internal/controller/datadogagent/controller_reconcile_v2.go index a35e27afb6..b97a45a720 100644 --- a/internal/controller/datadogagent/controller_reconcile_v2.go +++ b/internal/controller/datadogagent/controller_reconcile_v2.go @@ -79,7 +79,7 @@ func (r *Reconciler) reconcileInstanceV3(ctx context.Context, logger logr.Logger if err := r.manageExperiment(ctx, instance, newDDAStatus, now, revList); err != nil { return r.updateStatusIfNeededV2(logger, instance, newDDAStatus, result, err, now) } - if err := r.manageRevision(ctx, instance, revList); err != nil { + if err := r.manageRevision(ctx, instance, revList, newDDAStatus); err != nil { return r.updateStatusIfNeededV2(logger, instance, newDDAStatus, result, err, now) } } diff --git a/internal/controller/datadogagent/experiment.go b/internal/controller/datadogagent/experiment.go index e541333168..aa4a59c46c 100644 --- a/internal/controller/datadogagent/experiment.go +++ b/internal/controller/datadogagent/experiment.go @@ -10,6 +10,7 @@ import ( "context" "encoding/json" "fmt" + "maps" "time" appsv1 "k8s.io/api/apps/v1" @@ -48,6 +49,8 @@ func (r *Reconciler) manageExperiment( // abortExperiment marks the experiment as aborted in newStatus if a manual spec // change is detected (DDA generation differs from the recorded experiment generation). +// It is a no-op if handleRollback has already set a terminal phase (e.g. timeout), +// preventing spurious abort logs and phase overwrites. func abortExperiment( ctx context.Context, instanceGeneration int64, @@ -57,6 +60,10 @@ func abortExperiment( if experiment.Phase != v2alpha1.ExperimentPhaseRunning { return } + if newStatus.Experiment.Phase != v2alpha1.ExperimentPhaseRunning { + // handleRollback already determined a terminal phase (e.g. timeout); don't overwrite or log. + return + } if experiment.Generation == 0 || instanceGeneration == experiment.Generation { return } @@ -85,7 +92,7 @@ func (r *Reconciler) handleRollback( logger.Info("Experiment stopped, rolling back") return r.restorePreviousSpec(ctx, instance.ObjectMeta, newStatus, revisions, v2alpha1.ExperimentPhaseRollback) case phase == v2alpha1.ExperimentPhaseRunning: - rev := findMostRecentRevision(revisions) + rev := findMostRecentMatchingRevision(revisions, instance) if rev != nil { elapsed := now.Sub(rev.CreationTimestamp.Time) if elapsed >= getExperimentTimeout(r.options.ExperimentTimeout) { @@ -98,8 +105,8 @@ func (r *Reconciler) handleRollback( return nil } -// restorePreviousSpec sets the terminal experiment phase and restores the DDA spec -// from the previous ControllerRevision. +// restorePreviousSpec restores the DDA spec from the previous ControllerRevision +// and, on success, sets the terminal experiment phase. func (r *Reconciler) restorePreviousSpec( ctx context.Context, instanceMeta metav1.ObjectMeta, @@ -107,8 +114,11 @@ func (r *Reconciler) restorePreviousSpec( revisions []appsv1.ControllerRevision, terminalPhase v2alpha1.ExperimentPhase, ) error { + if err := r.rollback(ctx, instanceMeta, findRollbackTarget(revisions)); err != nil { + return err + } newStatus.Experiment.Phase = terminalPhase - return r.rollback(ctx, instanceMeta, findRollbackTarget(revisions)) + return nil } // rollback restores the DDA spec from the named ControllerRevision. @@ -146,11 +156,16 @@ func (r *Reconciler) rollback( return nil } + // Merge snapshot annotations (Datadog-only keys) on top of current annotations + // so that non-Datadog annotations (user metadata, tooling labels, etc.) are preserved. + merged := maps.Clone(current.Annotations) + maps.Copy(merged, snapshot.Annotations) + toUpdate := &v2alpha1.DatadogAgent{ ObjectMeta: current.ObjectMeta, Spec: snapshot.Spec, } - toUpdate.Annotations = snapshot.Annotations + toUpdate.Annotations = merged return r.client.Update(ctx, toUpdate) } @@ -172,13 +187,31 @@ func findRollbackTarget(revisions []appsv1.ControllerRevision) string { return prevName } -// findMostRecentRevision returns the ControllerRevision with the highest Revision number, -// or nil if the list is empty. Used to determine the experiment start time for timeout checks. -func findMostRecentRevision(revisions []appsv1.ControllerRevision) *appsv1.ControllerRevision { +// findMostRecentMatchingRevision returns the revision with the highest Revision number +// whose snapshot content matches the current instance spec and annotations, or nil if +// none match. This serves two purposes: +// +// - First reconcile after experiment start: the revision for the new spec has not been +// created yet, so no revision matches → nil → timeout check is skipped, preventing a +// spurious immediate timeout from an old pre-experiment revision's timestamp. +// +// - Post-rollback reconcile: the spec has been restored to the pre-experiment value. +// The matching revision is the pre-experiment one (old timestamp), so elapsed is +// large, timeout fires, and the idempotent rollback path sets phase=timeout cleanly +// without a spec-update conflict (ResourceVersion unchanged → status write succeeds). +func findMostRecentMatchingRevision(revisions []appsv1.ControllerRevision, instance *v2alpha1.DatadogAgent) *appsv1.ControllerRevision { + snap := revisionSnapshot{Spec: instance.Spec, Annotations: datadogAnnotations(instance.GetAnnotations())} + snapBytes, err := json.Marshal(snap) + if err != nil { + return nil + } var result *appsv1.ControllerRevision for i := range revisions { - if result == nil || revisions[i].Revision > result.Revision { - result = &revisions[i] + rev := &revisions[i] + if bytes.Equal(rev.Data.Raw, snapBytes) { + if result == nil || rev.Revision > result.Revision { + result = rev + } } } return result diff --git a/internal/controller/datadogagent/experiment_test.go b/internal/controller/datadogagent/experiment_test.go index eccd0b458e..d14418a314 100644 --- a/internal/controller/datadogagent/experiment_test.go +++ b/internal/controller/datadogagent/experiment_test.go @@ -13,6 +13,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" v2alpha1 "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" ) @@ -43,7 +44,7 @@ func TestRollback_RestoresSpec(t *testing.T) { // Create a revision for specA. instanceA := newRevisionTestOwner("test-dda", "default") - err := r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + err := r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil) require.NoError(t, err) revListA := mustListRevisions(t, r, instanceA) @@ -53,7 +54,7 @@ func TestRollback_RestoresSpec(t *testing.T) { // Create a second revision for specB. instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil) require.NoError(t, err) // rollback fetches the current DDA to compare specs; it must exist in the fake client. @@ -77,12 +78,12 @@ func TestHandleRollback_StoppedPhase(t *testing.T) { // Create two revisions so we have a previous to roll back to. instanceA := newRevisionTestOwner("test-dda", "default") - err := r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + err := r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil) require.NoError(t, err) instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil) require.NoError(t, err) instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{ @@ -99,16 +100,173 @@ func TestHandleRollback_StoppedPhase(t *testing.T) { assert.Equal(t, v2alpha1.ExperimentPhaseRollback, newStatus.Experiment.Phase, "should ack stopped by setting phase to rollback") } +func TestRollback_PreservesNonDatadogAnnotations(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Create revision for specA with a Datadog annotation. + instanceA := newRevisionTestOwner("test-dda", "default") + instanceA.Annotations = map[string]string{ + "some.datadoghq.com/key": "old-value", + } + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) + revListA := mustListRevisions(t, r, instanceA) + require.Len(t, revListA, 1) + prevRevision := revListA[0].Name + + // instanceB is the "current" DDA: has a different Datadog annotation value, + // plus a non-Datadog annotation that should survive rollback. + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + instanceB.Annotations = map[string]string{ + "some.datadoghq.com/key": "experiment-value", + "user-tooling/key": "keep-me", + } + require.NoError(t, c.Create(context.Background(), instanceB)) + + require.NoError(t, r.rollback(context.Background(), instanceB.ObjectMeta, prevRevision)) + + updated := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), types.NamespacedName{Namespace: "default", Name: "test-dda"}, updated)) + + // Datadog annotation should be restored to the snapshot value. + assert.Equal(t, "old-value", updated.Annotations["some.datadoghq.com/key"]) + // Non-Datadog annotation must be preserved. + assert.Equal(t, "keep-me", updated.Annotations["user-tooling/key"]) +} + +func TestRestorePreviousSpec_PhaseSetOnlyOnSuccess(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Create two revisions so rollback has a target. + instanceA := newRevisionTestOwner("test-dda", "default") + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) + + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil)) + + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseStopped}} + + // rollback requires the DDA to exist in the fake client; don't create it so it errors. + err := r.restorePreviousSpec(context.Background(), instanceB.ObjectMeta, newStatus, mustListRevisions(t, r, instanceB), v2alpha1.ExperimentPhaseRollback) + require.Error(t, err) + // Phase must NOT have been set since rollback failed. + assert.Equal(t, v2alpha1.ExperimentPhaseStopped, newStatus.Experiment.Phase) + + // Now create the DDA so rollback can succeed. + require.NoError(t, c.Create(context.Background(), instanceB)) + err = r.restorePreviousSpec(context.Background(), instanceB.ObjectMeta, newStatus, mustListRevisions(t, r, instanceB), v2alpha1.ExperimentPhaseRollback) + require.NoError(t, err) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, newStatus.Experiment.Phase) +} + +func TestManageExperiment_TimeoutWinsOverSpuriousAbort(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Create two revisions so rollback has a target. + instanceA := newRevisionTestOwner("test-dda", "default") + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) + + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil)) + require.NoError(t, c.Create(context.Background(), instanceB)) + + // Simulate a post-409 reconcile: phase=running, but generation was bumped by the + // rollback spec update (instanceB.Generation != experiment.Generation), AND timeout + // has elapsed. abortExperiment would fire for the generation mismatch, but + // handleRollback must win and persist phase=timeout. + instanceB.Generation = 99 + instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + Generation: 1, + } + + revList := mustListRevisions(t, r, instanceB) + for i := range revList { + if revList[i].Revision == 2 { + revList[i].CreationTimestamp = metav1.NewTime(time.Now().Add(-ExperimentDefaultTimeout - time.Minute)) + } + } + + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceB.Status.Experiment.DeepCopy()} + require.NoError(t, r.manageExperiment(context.Background(), instanceB, newStatus, metav1.Now(), revList)) + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, newStatus.Experiment.Phase) +} + +func TestHandleRollback_NoTimeoutOnFirstReconcile(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Only one revision exists — for the pre-experiment spec — with an old timestamp. + instanceA := newRevisionTestOwner("test-dda", "default") + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) + revList := mustListRevisions(t, r, instanceA) + require.Len(t, revList, 1) + revList[0].CreationTimestamp = metav1.NewTime(time.Now().Add(-ExperimentDefaultTimeout - time.Hour)) + + // instanceB has a different spec (the experiment spec); its revision hasn't been created yet. + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRunning} + require.NoError(t, c.Create(context.Background(), instanceB)) + + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceB.Status.Experiment.DeepCopy()} + // Pass the stale revList (pre-experiment revision only) — timeout must NOT fire. + require.NoError(t, r.handleRollback(context.Background(), instanceB, newStatus, metav1.Now(), revList)) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, newStatus.Experiment.Phase) +} + +// TestHandleRollback_PostRollbackSetsTimeout verifies the reconcile-2 scenario: +// the spec has already been restored to the pre-experiment value (e.g. by a +// previous reconcile whose status write 409'd), so phase is still running and +// the generation is mismatched. findMostRecentMatchingRevision finds the +// pre-experiment revision (spec matches), elapsed is large, idempotent rollback +// fires, and phase=timeout is set without a spec-update conflict. +func TestHandleRollback_PostRollbackSetsTimeout(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // rev1: pre-experiment spec (instanceA). + instanceA := newRevisionTestOwner("test-dda", "default") + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) + + // rev2: experiment spec (instanceB). + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil)) + + // The DDA in the cluster already has the rolled-back spec (instanceA's spec), + // as if reconcile-1 restored it but its status write 409'd. + // Generation is set to a realistic non-zero value: RC would have recorded + // instanceA.Generation when the experiment started. + instanceA.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + Generation: instanceA.Generation, + } + require.NoError(t, c.Create(context.Background(), instanceA)) + + // rev1 has an old timestamp — it was created well before the experiment started. + revList := mustListRevisions(t, r, instanceA) + for i := range revList { + if revList[i].Revision == 1 { + revList[i].CreationTimestamp = metav1.NewTime(time.Now().Add(-ExperimentDefaultTimeout - time.Hour)) + } + } + + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceA.Status.Experiment.DeepCopy()} + require.NoError(t, r.handleRollback(context.Background(), instanceA, newStatus, metav1.Now(), revList)) + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, newStatus.Experiment.Phase) +} + func TestHandleRollback_Timeout(t *testing.T) { r, c := newRevisionTestReconciler(t) // Create two revisions so rollback has a target. instanceA := newRevisionTestOwner("test-dda", "default") - require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA))) + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB))) + require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil)) // rollback fetches the current DDA to compare specs; it must exist in the fake client. require.NoError(t, c.Create(context.Background(), instanceB)) diff --git a/internal/controller/datadogagent/revision.go b/internal/controller/datadogagent/revision.go index 6c245bc1eb..35560c6aad 100644 --- a/internal/controller/datadogagent/revision.go +++ b/internal/controller/datadogagent/revision.go @@ -31,10 +31,23 @@ type revisionSnapshot struct { Annotations map[string]string `json:"annotations,omitempty"` } +// skipRevisionBump returns true when the revision bump should be suppressed. +// During experiment rollback the spec is restored to an older revision; bumping +// its revision number to "latest" would make it appear newer than the experiment +// revision, causing findRollbackTarget to return the experiment revision on the +// next rollback attempt instead of the pre-experiment revision. +func skipRevisionBump(newStatus *v2alpha1.DatadogAgentStatus) bool { + if newStatus == nil || newStatus.Experiment == nil { + return false + } + phase := newStatus.Experiment.Phase + return phase == v2alpha1.ExperimentPhaseRollback || phase == v2alpha1.ExperimentPhaseTimeout +} + // manageRevision creates a ControllerRevision snapshot of the current spec and // garbage collects old revisions. Must be called after manageExperiment. -func (r *Reconciler) manageRevision(ctx context.Context, instance *v2alpha1.DatadogAgent, revList []appsv1.ControllerRevision) error { - revName, err := r.ensureRevision(ctx, instance, revList) +func (r *Reconciler) manageRevision(ctx context.Context, instance *v2alpha1.DatadogAgent, revList []appsv1.ControllerRevision, newStatus *v2alpha1.DatadogAgentStatus) error { + revName, err := r.ensureRevision(ctx, instance, revList, skipRevisionBump(newStatus)) if err != nil { return err } @@ -73,11 +86,13 @@ func (r *Reconciler) listRevisions(ctx context.Context, instance *v2alpha1.Datad // ensureRevision creates a ControllerRevision snapshot of the instance spec and // annotations if it does not already exist, and returns the revision name. // -// The Revision field is a monotonic creation counter. +// The Revision field is a monotonic creation counter. If skipBump is true the +// existing revision is returned as-is without bumping its Revision number. func (r *Reconciler) ensureRevision( ctx context.Context, instance *v2alpha1.DatadogAgent, revList []appsv1.ControllerRevision, + skipBump bool, ) (string, error) { logger := ctrl.LoggerFrom(ctx) @@ -108,7 +123,11 @@ func (r *Reconciler) ensureRevision( if matchingRev != nil { // Identical content already snapshotted. Bump Revision to max+1 if it // has been superseded (e.g. after a revert) so ordering stays correct. - if matchingRev.Revision < maxRevision { + // Skip the bump during experiment rollback: bumping the pre-experiment + // revision above the experiment revision would cause findRollbackTarget + // to select the experiment revision as the rollback target on the next + // stopped signal, reversing the rollback. + if matchingRev.Revision < maxRevision && !skipBump { objLogger := logger.WithValues( "object.kind", "ControllerRevision", "object.namespace", matchingRev.Namespace, diff --git a/internal/controller/datadogagent/revision_test.go b/internal/controller/datadogagent/revision_test.go index 7f95d7f8a9..ce71466563 100644 --- a/internal/controller/datadogagent/revision_test.go +++ b/internal/controller/datadogagent/revision_test.go @@ -70,7 +70,7 @@ func TestEnsureRevision_CreatesOnFirstCall(t *testing.T) { r, c := newRevisionTestReconciler(t) instance := newRevisionTestOwner("test-dda", "default") - name, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + name, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance), false) require.NoError(t, err) assert.NotEmpty(t, name) @@ -86,9 +86,9 @@ func TestEnsureRevision_Idempotent(t *testing.T) { instance := newRevisionTestOwner("test-dda", "default") instance.Annotations = map[string]string{"foo": "bar"} - name1, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + name1, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance), false) require.NoError(t, err) - name2, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + name2, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance), false) require.NoError(t, err) assert.Equal(t, name1, name2) @@ -105,9 +105,9 @@ func TestEnsureRevision_DifferentSpecsDifferentNames(t *testing.T) { instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - name1, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + name1, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) require.NoError(t, err) - name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) require.NoError(t, err) assert.NotEqual(t, name1, name2) @@ -120,9 +120,9 @@ func TestEnsureRevision_DifferentAnnotationsDifferentNames(t *testing.T) { instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Annotations = map[string]string{"feature.datadoghq.com/beta": "true"} - name1, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + name1, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) require.NoError(t, err) - name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) require.NoError(t, err) assert.NotEqual(t, name1, name2) @@ -142,9 +142,9 @@ func TestEnsureRevision_NonDatadogAnnotationsIgnored(t *testing.T) { "some-other-tool/annotation": "value", } - name1, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + name1, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) require.NoError(t, err) - name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) require.NoError(t, err) assert.Equal(t, name1, name2, "non-datadoghq annotations should not affect the revision snapshot") @@ -168,7 +168,7 @@ func TestGCOldRevisions_KeepsCurrentAndPrevious(t *testing.T) { } names := make([]string, len(instances)) for i, inst := range instances { - name, err := r.ensureRevision(context.Background(), inst, mustListRevisions(t, r, inst)) + name, err := r.ensureRevision(context.Background(), inst, mustListRevisions(t, r, inst), false) require.NoError(t, err) names[i] = name } @@ -196,13 +196,13 @@ func TestEnsureRevision_RevertBumpsRevision(t *testing.T) { instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - name1, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + name1, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) require.NoError(t, err) - _, err = r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + _, err = r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) require.NoError(t, err) // Revert to spec A — should reuse name1 but bump its Revision. - name3, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + name3, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) require.NoError(t, err) assert.Equal(t, name1, name3, "revert should reuse same CR name") @@ -217,9 +217,9 @@ func TestGCOldRevisions_KeepsTwoRevisions(t *testing.T) { instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - _, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + _, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) require.NoError(t, err) - name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) require.NoError(t, err) err = r.gcOldRevisions(context.Background(), instanceB, name2, mustListRevisions(t, r, instanceB)) @@ -249,7 +249,7 @@ func TestEnsureRevision_RevisionNumbersMonotonic(t *testing.T) { names := make([]string, len(instances)) for i, inst := range instances { - name, err := r.ensureRevision(context.Background(), inst, mustListRevisions(t, r, inst)) + name, err := r.ensureRevision(context.Background(), inst, mustListRevisions(t, r, inst), false) require.NoError(t, err) names[i] = name } @@ -264,7 +264,7 @@ func TestGCOldRevisions_NoPreviousWhenOnlyCurrent(t *testing.T) { r, c := newRevisionTestReconciler(t) instance := newRevisionTestOwner("test-dda", "default") - revName, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + revName, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance), false) require.NoError(t, err) err = r.gcOldRevisions(context.Background(), instance, revName, mustListRevisions(t, r, instance)) @@ -284,7 +284,7 @@ func TestGCOldRevisions_DeletesMultipleOld(t *testing.T) { for i, site := range sites { inst := newRevisionTestOwner("test-dda", "default") inst.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{Site: apiutils.NewStringPointer(site)}} - name, err := r.ensureRevision(context.Background(), inst, mustListRevisions(t, r, inst)) + name, err := r.ensureRevision(context.Background(), inst, mustListRevisions(t, r, inst), false) require.NoError(t, err) names[i] = name } @@ -312,7 +312,7 @@ func TestManageRevision_CreatesRevision(t *testing.T) { r, _ := newRevisionTestReconciler(t) instanceA := newRevisionTestOwner("test-dda", "default") - err := r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + err := r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil) require.NoError(t, err) revList := mustListRevisions(t, r, instanceA) @@ -322,7 +322,7 @@ func TestManageRevision_CreatesRevision(t *testing.T) { instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil) require.NoError(t, err) revList = mustListRevisions(t, r, instanceB) @@ -333,9 +333,9 @@ func TestManageRevision_Idempotent(t *testing.T) { r, _ := newRevisionTestReconciler(t) instance := newRevisionTestOwner("test-dda", "default") - err := r.manageRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + err := r.manageRevision(context.Background(), instance, mustListRevisions(t, r, instance), nil) require.NoError(t, err) - err = r.manageRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + err = r.manageRevision(context.Background(), instance, mustListRevisions(t, r, instance), nil) require.NoError(t, err) revList := mustListRevisions(t, r, instance) @@ -349,7 +349,7 @@ func TestManageRevision_CurrentDeletedIsRecreated(t *testing.T) { r, c := newRevisionTestReconciler(t) instance := newRevisionTestOwner("test-dda", "default") - err := r.manageRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + err := r.manageRevision(context.Background(), instance, mustListRevisions(t, r, instance), nil) require.NoError(t, err) revList := mustListRevisions(t, r, instance) @@ -360,7 +360,7 @@ func TestManageRevision_CurrentDeletedIsRecreated(t *testing.T) { require.NoError(t, c.Delete(context.Background(), &revList[0])) // Next reconcile should re-create the revision. - err = r.manageRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + err = r.manageRevision(context.Background(), instance, mustListRevisions(t, r, instance), nil) require.NoError(t, err) revList = mustListRevisions(t, r, instance) @@ -381,9 +381,9 @@ func TestManageRevision_PreviousDeletedContinuesNormally(t *testing.T) { instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - err := r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + err := r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil) require.NoError(t, err) - err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil) require.NoError(t, err) revList := mustListRevisions(t, r, instanceB) @@ -399,7 +399,7 @@ func TestManageRevision_PreviousDeletedContinuesNormally(t *testing.T) { require.NoError(t, c.Delete(context.Background(), &prev)) // Next reconcile on the same spec should succeed and keep only the current. - err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil) require.NoError(t, err) revList = mustListRevisions(t, r, instanceB) From 6106687cdffb9b8d28551eab4ea921639201bc29 Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Wed, 1 Apr 2026 13:13:32 -0400 Subject: [PATCH 7/8] Allow applying same change after rollback --- .../datadogagent/experiment_test.go | 66 ++++++++++++++++ internal/controller/datadogagent/revision.go | 48 +++++++++--- .../controller/datadogagent/revision_test.go | 77 +++++++++++++++++++ 3 files changed, 182 insertions(+), 9 deletions(-) diff --git a/internal/controller/datadogagent/experiment_test.go b/internal/controller/datadogagent/experiment_test.go index d14418a314..be255f20b8 100644 --- a/internal/controller/datadogagent/experiment_test.go +++ b/internal/controller/datadogagent/experiment_test.go @@ -257,6 +257,72 @@ func TestHandleRollback_PostRollbackSetsTimeout(t *testing.T) { assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, newStatus.Experiment.Phase) } +// TestReapplySameSpecAfterRollback_NoImmediateTimeout is the end-to-end +// regression test for the stale-revision bug. +// +// Without the fix: gcOldRevisions kept the experiment revision (rev2/B) after +// rollback. When RC later re-applied spec B as a new experiment and set +// phase=Running, findMostRecentMatchingRevision found the stale rev2 — its +// CreationTimestamp predated the current experiment — so elapsed >= timeout +// fired immediately and the operator rolled back again, making the re-apply +// appear to have no effect. +// +// With the fix: gcOldRevisions deletes rev2 once phase=Rollback is persisted. +// Re-applying spec B creates a fresh revision with a current timestamp, and +// the timeout clock starts correctly. +func TestReapplySameSpecAfterRollback_NoImmediateTimeout(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Setup: create revisions for spec A (pre-experiment) and spec B (experiment). + instanceA := newRevisionTestOwner("test-dda", "default") + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) + + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil)) + require.NoError(t, c.Create(context.Background(), instanceB)) + + // Backdate rev2 (B) to simulate a long-running experiment whose revision + // timestamp is well past the timeout threshold. + revList := mustListRevisions(t, r, instanceB) + for i := range revList { + if revList[i].Revision == 2 { + revList[i].CreationTimestamp = metav1.NewTime(time.Now().Add(-ExperimentDefaultTimeout - time.Hour)) + } + } + + // Rollback: RC sets phase=Stopped; operator restores spec A. + instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseStopped} + rollbackStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceB.Status.Experiment.DeepCopy()} + require.NoError(t, r.handleRollback(context.Background(), instanceB, rollbackStatus, metav1.Now(), revList)) + require.Equal(t, v2alpha1.ExperimentPhaseRollback, rollbackStatus.Experiment.Phase) + + // Next reconcile: Rollback phase is now persisted in instance.Status. + // gcOldRevisions must delete the stale rev2 (B). + instanceA.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRollback} + rollbackNewStatus := &v2alpha1.DatadogAgentStatus{Experiment: &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRollback}} + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), rollbackNewStatus)) + + remaining := mustListRevisions(t, r, instanceA) + require.Len(t, remaining, 1, "stale experiment revision (spec B) must be deleted once Rollback phase is persisted") + + // RC re-applies spec B as a new experiment (sets spec=B, phase=Running). + // In the real reconcile loop manageExperiment (handleRollback) runs before + // manageRevision, so no revision for spec B exists at the time of the check. + // findMostRecentMatchingRevision returns nil → timeout check is skipped. + instanceB2 := newRevisionTestOwner("test-dda", "default") + instanceB2.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + instanceB2.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRunning} + + revListBeforeNewRevision := mustListRevisions(t, r, instanceB2) + require.Len(t, revListBeforeNewRevision, 1, "only rev1 (A) should exist before the new experiment's revision is created") + + newStatus2 := &v2alpha1.DatadogAgentStatus{Experiment: instanceB2.Status.Experiment.DeepCopy()} + require.NoError(t, r.handleRollback(context.Background(), instanceB2, newStatus2, metav1.Now(), revListBeforeNewRevision)) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, newStatus2.Experiment.Phase, + "re-applying the same spec after rollback must not immediately time out") +} + func TestHandleRollback_Timeout(t *testing.T) { r, c := newRevisionTestReconciler(t) diff --git a/internal/controller/datadogagent/revision.go b/internal/controller/datadogagent/revision.go index 35560c6aad..1ec5abe75c 100644 --- a/internal/controller/datadogagent/revision.go +++ b/internal/controller/datadogagent/revision.go @@ -196,6 +196,17 @@ func datadogAnnotations(all map[string]string) map[string]string { // gcOldRevisions deletes all but the two most recent ControllerRevisions: // the current and previous. +// +// Exception: after a rejected experiment (rollback, timeout, or abort) the +// experiment revision carries a stale CreationTimestamp. If that revision were +// kept and the same spec were re-applied as a new experiment, +// findMostRecentMatchingRevision would find it, compute an elapsed time based +// on the old timestamp, and immediately time out. To prevent this, all +// non-current revisions are deleted once the rejected phase is persisted in +// instance.Status. The check uses the persisted status (not the in-flight +// newStatus) so that the revision survives any status-write conflicts during +// the rollback itself — the deletion is intentionally deferred to the next +// reconcile cycle after the terminal phase is confirmed. func (r *Reconciler) gcOldRevisions( ctx context.Context, instance *v2alpha1.DatadogAgent, @@ -205,16 +216,20 @@ func (r *Reconciler) gcOldRevisions( logger := ctrl.LoggerFrom(ctx) // Identify the most recent non-current revision to keep as previous. + // Skip this when the persisted experiment phase is a rejected terminal + // phase — in that case we want to delete the stale experiment revision. previous := "" - previousRevision := int64(-1) - for i := range revList { - rev := &revList[i] - if rev.Name == current { - continue - } - if rev.Revision > previousRevision { - previousRevision = rev.Revision - previous = rev.Name + if !isRejectedExperimentPhase(instance) { + previousRevision := int64(-1) + for i := range revList { + rev := &revList[i] + if rev.Name == current { + continue + } + if rev.Revision > previousRevision { + previousRevision = rev.Revision + previous = rev.Name + } } } @@ -236,3 +251,18 @@ func (r *Reconciler) gcOldRevisions( return nil } + +// isRejectedExperimentPhase returns true when the persisted experiment phase +// indicates the experiment spec was rejected — rolled back, timed out, or +// aborted. These are the cases where keeping the experiment revision would +// cause an immediate timeout if the same spec were re-applied. +func isRejectedExperimentPhase(instance *v2alpha1.DatadogAgent) bool { + if instance.Status.Experiment == nil { + return false + } + switch instance.Status.Experiment.Phase { + case v2alpha1.ExperimentPhaseRollback, v2alpha1.ExperimentPhaseTimeout, v2alpha1.ExperimentPhaseAborted: + return true + } + return false +} diff --git a/internal/controller/datadogagent/revision_test.go b/internal/controller/datadogagent/revision_test.go index ce71466563..8f199169cc 100644 --- a/internal/controller/datadogagent/revision_test.go +++ b/internal/controller/datadogagent/revision_test.go @@ -407,6 +407,83 @@ func TestManageRevision_PreviousDeletedContinuesNormally(t *testing.T) { assert.Equal(t, int64(2), revList[0].Revision) } +// TestGCOldRevisions_DeletesPreviousAfterRejectedExperiment verifies that when +// the persisted experiment phase is a rejected terminal phase (Rollback, Timeout, +// or Aborted), gcOldRevisions deletes the stale experiment revision instead of +// keeping it as "previous". This prevents an immediate timeout if the same spec +// is re-applied as a new experiment. +func TestGCOldRevisions_DeletesPreviousAfterRejectedExperiment(t *testing.T) { + rejectedPhases := []v2alpha1.ExperimentPhase{ + v2alpha1.ExperimentPhaseRollback, + v2alpha1.ExperimentPhaseTimeout, + v2alpha1.ExperimentPhaseAborted, + } + + for _, phase := range rejectedPhases { + t.Run(string(phase), func(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + instanceA := newRevisionTestOwner("test-dda", "default") + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + + nameA, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) + require.NoError(t, err) + _, err = r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) + require.NoError(t, err) + + // Simulate the persisted status having a rejected terminal phase. + // current is instanceA (spec restored after rollback). + instanceA.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: phase} + + err = r.gcOldRevisions(context.Background(), instanceA, nameA, mustListRevisions(t, r, instanceA)) + require.NoError(t, err) + + revList := &appsv1.ControllerRevisionList{} + require.NoError(t, c.List(context.Background(), revList)) + assert.Len(t, revList.Items, 1, "experiment revision should be deleted after rejected phase") + assert.Equal(t, nameA, revList.Items[0].Name, "only the current (pre-experiment) revision should remain") + }) + } +} + +// TestGCOldRevisions_KeepsPreviousForNonRejectedPhases verifies that for +// non-rejected experiment phases (Running, Promoted, or nil), the normal +// "keep current + previous" behavior is preserved. +func TestGCOldRevisions_KeepsPreviousForNonRejectedPhases(t *testing.T) { + nonRejectedPhases := []v2alpha1.ExperimentPhase{ + v2alpha1.ExperimentPhaseRunning, + v2alpha1.ExperimentPhasePromoted, + "", // nil experiment + } + + for _, phase := range nonRejectedPhases { + t.Run(string(phase)+"_or_nil", func(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + instanceA := newRevisionTestOwner("test-dda", "default") + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + + _, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) + require.NoError(t, err) + nameB, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) + require.NoError(t, err) + + if phase != "" { + instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: phase} + } + + err = r.gcOldRevisions(context.Background(), instanceB, nameB, mustListRevisions(t, r, instanceB)) + require.NoError(t, err) + + revList := &appsv1.ControllerRevisionList{} + require.NoError(t, c.List(context.Background(), revList)) + assert.Len(t, revList.Items, 2, "current and previous should both be kept for non-rejected phase") + }) + } +} + // TestListRevisions_ExcludesForeignOwner verifies that a ControllerRevision // sharing the same name label but owned by a different DDA UID (e.g. a // deleted-and-recreated DDA) is not returned by listRevisions. From d4b5a046bfd38a12a3b7c2f5bda08b878a27d0e1 Mon Sep 17 00:00:00 2001 From: khewonc <39867936+khewonc@users.noreply.github.com> Date: Mon, 6 Apr 2026 17:45:27 -0400 Subject: [PATCH 8/8] Fix build --- .../controller_experiment_integration_test.go | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/internal/controller/datadogagent/controller_experiment_integration_test.go b/internal/controller/datadogagent/controller_experiment_integration_test.go index 0679a4fd64..399685ee25 100644 --- a/internal/controller/datadogagent/controller_experiment_integration_test.go +++ b/internal/controller/datadogagent/controller_experiment_integration_test.go @@ -30,9 +30,9 @@ import ( assert "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" v2alpha1 "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" - apiutils "github.com/DataDog/datadog-operator/api/utils" ) // newExperimentIntegrationReconciler builds a revision reconciler with an @@ -71,7 +71,7 @@ func Test_Experiment_StoppedRollback(t *testing.T) { // Rev2: RC applies experiment spec. assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) - dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.eu") + dda.Spec.Global.Site = ptr.To("datadoghq.eu") assert.NoError(t, r.client.Update(context.TODO(), dda)) reconcileN(t, r, ns, name, 1) assert.Len(t, listOwnedRevisions(t, r.client, ns, uid), 2) @@ -115,7 +115,7 @@ func Test_Experiment_TimeoutRollback(t *testing.T) { // Rev2: RC applies experiment spec. assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) - dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.eu") + dda.Spec.Global.Site = ptr.To("datadoghq.eu") assert.NoError(t, r.client.Update(context.TODO(), dda)) reconcileN(t, r, ns, name, 1) assert.Len(t, listOwnedRevisions(t, r.client, ns, uid), 2) @@ -159,7 +159,7 @@ func Test_Experiment_AbortOnManualChange(t *testing.T) { // Rev2: RC applies experiment spec; RC signals running. assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) - dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.eu") + dda.Spec.Global.Site = ptr.To("datadoghq.eu") assert.NoError(t, r.client.Update(context.TODO(), dda)) reconcileN(t, r, ns, name, 1) @@ -188,7 +188,7 @@ func Test_Experiment_AbortOnManualChange(t *testing.T) { // experimentGen. In the fake client Generation stays at 0, which differs from // the experimentGen sentinel above, so the mismatch is already in place. assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) - dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.com") + dda.Spec.Global.Site = ptr.To("datadoghq.com") assert.NoError(t, r.client.Update(context.TODO(), dda)) reconcileN(t, r, ns, name, 1) @@ -226,7 +226,7 @@ func Test_Experiment_TimeoutPhase_IsStable(t *testing.T) { createAndReconcile(t, r, dda) assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) - dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.eu") + dda.Spec.Global.Site = ptr.To("datadoghq.eu") assert.NoError(t, r.client.Update(context.TODO(), dda)) reconcileN(t, r, ns, name, 1) @@ -263,7 +263,7 @@ func Test_Experiment_RollbackPhase_IsStable(t *testing.T) { createAndReconcile(t, r, dda) assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) - dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.eu") + dda.Spec.Global.Site = ptr.To("datadoghq.eu") assert.NoError(t, r.client.Update(context.TODO(), dda)) reconcileN(t, r, ns, name, 1) @@ -303,7 +303,7 @@ func Test_Experiment_RunningAfterTimeout_StaleGeneration(t *testing.T) { createAndReconcile(t, r, dda) assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) - dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.eu") + dda.Spec.Global.Site = ptr.To("datadoghq.eu") assert.NoError(t, r.client.Update(context.TODO(), dda)) reconcileN(t, r, ns, name, 1) @@ -349,7 +349,7 @@ func Test_Experiment_StoppedAfterRollback(t *testing.T) { createAndReconcile(t, r, dda) assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) - dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.eu") + dda.Spec.Global.Site = ptr.To("datadoghq.eu") assert.NoError(t, r.client.Update(context.TODO(), dda)) reconcileN(t, r, ns, name, 1) @@ -390,7 +390,7 @@ func Test_Experiment_AbortDoesNotRollback(t *testing.T) { // Apply experiment spec. assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) - dda.Spec.Global.Site = apiutils.NewStringPointer("datadoghq.eu") + dda.Spec.Global.Site = ptr.To("datadoghq.eu") assert.NoError(t, r.client.Update(context.TODO(), dda)) reconcileN(t, r, ns, name, 1)