diff --git a/api/datadoghq/v2alpha1/datadogagent_types.go b/api/datadoghq/v2alpha1/datadogagent_types.go index 5ad50df0da..72174b785e 100644 --- a/api/datadoghq/v2alpha1/datadogagent_types.go +++ b/api/datadoghq/v2alpha1/datadogagent_types.go @@ -2349,6 +2349,43 @@ type RemoteConfigConfiguration struct { Features *DatadogFeatures `json:"features,omitempty"` } +// ExperimentPhase is the lifecycle phase of a Fleet Automation experiment. +// +kubebuilder:validation:Enum=running;stopped;rollback;timeout;promoted;aborted +type ExperimentPhase string + +const ( + // ExperimentPhaseRunning is set by RC when an experiment starts (startExperiment). + ExperimentPhaseRunning ExperimentPhase = "running" + // ExperimentPhaseStopped is set by RC to request a rollback (stopExperiment). + ExperimentPhaseStopped ExperimentPhase = "stopped" + // ExperimentPhaseRollback is set by the operator after processing a stopped signal and restoring the previous spec. + ExperimentPhaseRollback ExperimentPhase = "rollback" + // ExperimentPhaseTimeout is set by the operator when the experiment exceeds the timeout and is auto-rolled back. + ExperimentPhaseTimeout ExperimentPhase = "timeout" + // ExperimentPhasePromoted is set by RC when an experiment succeeds (promoteExperiment). + ExperimentPhasePromoted ExperimentPhase = "promoted" + // ExperimentPhaseAborted is set by the operator when a manual spec change is detected during a running experiment. + ExperimentPhaseAborted ExperimentPhase = "aborted" +) + +// ExperimentStatus defines the state of a Fleet Automation experiment. +// +k8s:openapi-gen=true +type ExperimentStatus struct { + // Phase is the current state of the experiment. + // +optional + Phase ExperimentPhase `json:"phase,omitempty"` + // ID is the unique experiment ID sent by Fleet Automation. + // +optional + ID string `json:"id,omitempty"` + // Generation is the DDA metadata.generation recorded when the experiment started. + // Used to detect manual spec changes while the experiment is running: if the + // current DDA generation differs from this value, the operator aborts the experiment. + // + // This value must be recorded after the DDA is patched for a startExperiment signal. + // +optional + Generation int64 `json:"generation,omitempty"` +} + // DatadogAgentStatus defines the observed state of DatadogAgent. // +k8s:openapi-gen=true type DatadogAgentStatus struct { @@ -2376,6 +2413,9 @@ type DatadogAgentStatus struct { // RemoteConfigConfiguration stores the configuration received from RemoteConfig. // +optional RemoteConfigConfiguration *RemoteConfigConfiguration `json:"remoteConfigConfiguration,omitempty"` + // Experiment tracks the state of an active or recent Fleet Automation experiment. + // +optional + Experiment *ExperimentStatus `json:"experiment,omitempty"` } // DatadogAgent defines Agent configuration, see reference https://github.com/DataDog/datadog-operator/blob/main/docs/configuration.v2alpha1.md @@ -2387,6 +2427,7 @@ type DatadogAgentStatus struct { // +kubebuilder:printcolumn:name="cluster-agent",type="string",JSONPath=".status.clusterAgent.status" // +kubebuilder:printcolumn:name="cluster-checks-runner",type="string",JSONPath=".status.clusterChecksRunner.status" // +kubebuilder:printcolumn:name="age",type="date",JSONPath=".metadata.creationTimestamp" +// +kubebuilder:printcolumn:name="experiment-phase",type="string",JSONPath=".status.experiment.phase",priority=1 // +k8s:openapi-gen=true // +genclient type DatadogAgent struct { diff --git a/api/datadoghq/v2alpha1/zz_generated.deepcopy.go b/api/datadoghq/v2alpha1/zz_generated.deepcopy.go index 4733c0d92d..fee99ea239 100644 --- a/api/datadoghq/v2alpha1/zz_generated.deepcopy.go +++ b/api/datadoghq/v2alpha1/zz_generated.deepcopy.go @@ -1361,6 +1361,11 @@ func (in *DatadogAgentStatus) DeepCopyInto(out *DatadogAgentStatus) { *out = new(RemoteConfigConfiguration) (*in).DeepCopyInto(*out) } + if in.Experiment != nil { + in, out := &in.Experiment, &out.Experiment + *out = new(ExperimentStatus) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DatadogAgentStatus. @@ -1764,6 +1769,21 @@ func (in *EventTypes) DeepCopy() *EventTypes { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ExperimentStatus) DeepCopyInto(out *ExperimentStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExperimentStatus. +func (in *ExperimentStatus) DeepCopy() *ExperimentStatus { + if in == nil { + return nil + } + out := new(ExperimentStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ExternalMetricsServerFeatureConfig) DeepCopyInto(out *ExternalMetricsServerFeatureConfig) { *out = *in diff --git a/api/datadoghq/v2alpha1/zz_generated.openapi.go b/api/datadoghq/v2alpha1/zz_generated.openapi.go index d1672fbc04..e35c857471 100644 --- a/api/datadoghq/v2alpha1/zz_generated.openapi.go +++ b/api/datadoghq/v2alpha1/zz_generated.openapi.go @@ -34,6 +34,7 @@ func GetOpenAPIDefinitions(ref common.ReferenceCallback) map[string]common.OpenA "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DogstatsdFeatureConfig": schema_datadog_operator_api_datadoghq_v2alpha1_DogstatsdFeatureConfig(ref), "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ErrorTrackingStandalone": schema_datadog_operator_api_datadoghq_v2alpha1_ErrorTrackingStandalone(ref), "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EventCollectionFeatureConfig": schema_datadog_operator_api_datadoghq_v2alpha1_EventCollectionFeatureConfig(ref), + "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ExperimentStatus": schema_datadog_operator_api_datadoghq_v2alpha1_ExperimentStatus(ref), "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.FIPSConfig": schema_datadog_operator_api_datadoghq_v2alpha1_FIPSConfig(ref), "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.HelmCheckFeatureConfig": schema_datadog_operator_api_datadoghq_v2alpha1_HelmCheckFeatureConfig(ref), "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.KubeStateMetricsCoreFeatureConfig": schema_datadog_operator_api_datadoghq_v2alpha1_KubeStateMetricsCoreFeatureConfig(ref), @@ -645,11 +646,17 @@ func schema_datadog_operator_api_datadoghq_v2alpha1_DatadogAgentStatus(ref commo Ref: ref("github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.RemoteConfigConfiguration"), }, }, + "experiment": { + SchemaProps: spec.SchemaProps{ + Description: "Experiment tracks the state of an active or recent Fleet Automation experiment.", + Ref: ref("github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ExperimentStatus"), + }, + }, }, }, }, Dependencies: []string{ - "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DaemonSetStatus", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DeploymentStatus", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.RemoteConfigConfiguration", "k8s.io/apimachinery/pkg/apis/meta/v1.Condition"}, + "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DaemonSetStatus", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DeploymentStatus", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ExperimentStatus", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.RemoteConfigConfiguration", "k8s.io/apimachinery/pkg/apis/meta/v1.Condition"}, } } @@ -1114,6 +1121,40 @@ func schema_datadog_operator_api_datadoghq_v2alpha1_EventCollectionFeatureConfig } } +func schema_datadog_operator_api_datadoghq_v2alpha1_ExperimentStatus(ref common.ReferenceCallback) common.OpenAPIDefinition { + return common.OpenAPIDefinition{ + Schema: spec.Schema{ + SchemaProps: spec.SchemaProps{ + Description: "ExperimentStatus defines the state of a Fleet Automation experiment.", + Type: []string{"object"}, + Properties: map[string]spec.Schema{ + "phase": { + SchemaProps: spec.SchemaProps{ + Description: "Phase is the current state of the experiment.", + Type: []string{"string"}, + Format: "", + }, + }, + "id": { + SchemaProps: spec.SchemaProps{ + Description: "ID is the unique experiment ID sent by Fleet Automation.", + Type: []string{"string"}, + Format: "", + }, + }, + "generation": { + SchemaProps: spec.SchemaProps{ + Description: "Generation is the DDA metadata.generation recorded when the experiment started. Used to detect manual spec changes while the experiment is running: if the current DDA generation differs from this value, the operator aborts the experiment.\n\nThis value must be recorded after the DDA is patched for a startExperiment signal.", + Type: []string{"integer"}, + Format: "int64", + }, + }, + }, + }, + }, + } +} + func schema_datadog_operator_api_datadoghq_v2alpha1_FIPSConfig(ref common.ReferenceCallback) common.OpenAPIDefinition { return common.OpenAPIDefinition{ Schema: spec.Schema{ diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml index 4dde7e8f70..f9a7b43dc8 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml +++ b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml @@ -29,6 +29,10 @@ spec: - jsonPath: .metadata.creationTimestamp name: age type: date + - jsonPath: .status.experiment.phase + name: experiment-phase + priority: 1 + type: string name: v2alpha1 schema: openAPIV3Schema: @@ -8470,6 +8474,32 @@ spec: x-kubernetes-list-map-keys: - type x-kubernetes-list-type: map + experiment: + description: Experiment tracks the state of an active or recent Fleet Automation experiment. + properties: + generation: + description: |- + Generation is the DDA metadata.generation recorded when the experiment started. + Used to detect manual spec changes while the experiment is running: if the + current DDA generation differs from this value, the operator aborts the experiment. + + This value must be recorded after the DDA is patched for a startExperiment signal. + format: int64 + type: integer + id: + description: ID is the unique experiment ID sent by Fleet Automation. + type: string + phase: + description: Phase is the current state of the experiment. + enum: + - running + - stopped + - rollback + - timeout + - promoted + - aborted + type: string + type: object otelAgentGateway: description: The actual state of the OTel Agent Gateway as a deployment. properties: diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json index e7b9123233..fc427592bc 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json +++ b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json @@ -8175,6 +8175,34 @@ ], "x-kubernetes-list-type": "map" }, + "experiment": { + "additionalProperties": false, + "description": "Experiment tracks the state of an active or recent Fleet Automation experiment.", + "properties": { + "generation": { + "description": "Generation is the DDA metadata.generation recorded when the experiment started.\nUsed to detect manual spec changes while the experiment is running: if the\ncurrent DDA generation differs from this value, the operator aborts the experiment.\n\nThis value must be recorded after the DDA is patched for a startExperiment signal.", + "format": "int64", + "type": "integer" + }, + "id": { + "description": "ID is the unique experiment ID sent by Fleet Automation.", + "type": "string" + }, + "phase": { + "description": "Phase is the current state of the experiment.", + "enum": [ + "running", + "stopped", + "rollback", + "timeout", + "promoted", + "aborted" + ], + "type": "string" + } + }, + "type": "object" + }, "otelAgentGateway": { "additionalProperties": false, "description": "The actual state of the OTel Agent Gateway as a deployment.", diff --git a/internal/controller/datadogagent/controller.go b/internal/controller/datadogagent/controller.go index 4e6c1907a3..57d728a7d8 100644 --- a/internal/controller/datadogagent/controller.go +++ b/internal/controller/datadogagent/controller.go @@ -76,6 +76,8 @@ type ReconcilerOptions struct { DatadogAgentProfileEnabled bool DatadogAgentInternalEnabled bool CreateControllerRevisions bool + // ExperimentTimeout overrides ExperimentDefaultTimeout. Zero means use the default. + ExperimentTimeout time.Duration } // Reconciler is the internal reconciler for Datadog Agent diff --git a/internal/controller/datadogagent/controller_experiment_integration_test.go b/internal/controller/datadogagent/controller_experiment_integration_test.go new file mode 100644 index 0000000000..399685ee25 --- /dev/null +++ b/internal/controller/datadogagent/controller_experiment_integration_test.go @@ -0,0 +1,415 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package datadogagent + +// Integration tests for the experiment rollback flow wired through the full +// DDA reconcile path. These complement the unit tests in experiment_test.go. +// +// Coverage goals: +// - Stopped rollback: RC writing phase=stopped causes the operator to restore +// the previous spec and set phase=rollback. +// - Timeout rollback: an experiment running past ExperimentTimeout causes the +// operator to restore the previous spec and set phase=timeout. +// +// NOTE: rollback is idempotent — if the spec is already at the rollback target +// the Update is skipped. This means the status update in the same reconcile +// succeeds without a ResourceVersion conflict. In the first rollback reconcile +// the spec update bumps ResourceVersion and the status update conflicts; the +// second reconcile (fresh fetch) finds the spec already correct, skips the +// Update, and the status update succeeds. Tests therefore run two reconciles +// after triggering rollback. + +import ( + "context" + "testing" + "time" + + assert "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" + + v2alpha1 "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" +) + +// newExperimentIntegrationReconciler builds a revision reconciler with an +// overridden ExperimentTimeout for testing. +func newExperimentIntegrationReconciler(t *testing.T, timeout time.Duration) *Reconciler { + t.Helper() + r, _ := newRevisionIntegrationReconciler(t) + r.options.ExperimentTimeout = timeout + return r +} + +// reconcileN re-fetches the DDA and calls Reconcile n times in sequence. +func reconcileN(t *testing.T, r *Reconciler, ns, name string, n int) { + t.Helper() + nsName := types.NamespacedName{Namespace: ns, Name: name} + for i := 0; i < n; i++ { + var dda v2alpha1.DatadogAgent + assert.NoError(t, r.client.Get(context.TODO(), nsName, &dda)) + _, err := r.Reconcile(context.TODO(), &dda) + assert.NoError(t, err) + } +} + +// Test_Experiment_StoppedRollback verifies that when RC writes phase=stopped, +// the operator restores the previous spec and sets phase=rollback. +func Test_Experiment_StoppedRollback(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + // Rev1: initial spec. + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Rev2: RC applies experiment spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + assert.Len(t, listOwnedRevisions(t, r.client, ns, uid), 2) + + // RC writes phase=stopped. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseStopped, + Generation: dda.Generation, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // First reconcile: rollback triggered (spec restored, status update may conflict). + // Second reconcile: spec already correct, status update succeeds. + reconcileN(t, r, ns, name, 2) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + // The snapshot is taken from the defaulted spec (defaults.DefaultDatadogAgentSpec applies + // site="datadoghq.com" before snapshotting), so rollback restores that value. + assert.NotNil(t, dda.Spec.Global.Site, "spec should be restored to pre-experiment state") + assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site, "spec should be restored to pre-experiment state") + assert.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, dda.Status.Experiment.Phase) +} + +// Test_Experiment_TimeoutRollback verifies that an experiment running past +// ExperimentTimeout causes the operator to restore the previous spec and set +// phase=timeout. +func Test_Experiment_TimeoutRollback(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + const timeout = 50 * time.Millisecond + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, timeout) + + // Rev1: initial spec. + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Rev2: RC applies experiment spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + assert.Len(t, listOwnedRevisions(t, r.client, ns, uid), 2) + + // RC writes phase=running. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + Generation: dda.Generation, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // Wait for the timeout to elapse. + time.Sleep(2 * timeout) + + // Reconcile 1: timeout detected → spec restored; status write may conflict. + // Reconcile 2: idempotent rollback → status write succeeds. + reconcileN(t, r, ns, name, 2) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + // The snapshot is taken from the defaulted spec, so rollback restores site="datadoghq.com". + assert.NotNil(t, dda.Spec.Global.Site, "spec should be restored after timeout") + assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site, "spec should be restored after timeout") + assert.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, dda.Status.Experiment.Phase) +} + +// Test_Experiment_AbortOnManualChange verifies that a spec change while an +// experiment is running sets phase=aborted and does not trigger rollback. +func Test_Experiment_AbortOnManualChange(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + // Rev1: initial spec. + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Rev2: RC applies experiment spec; RC signals running. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + // The fake client does not auto-increment Generation on Update, so revisions + // end up with a zero CreationTimestamp (fake-client limitation). Patch them to + // a recent time so the timeout path in handleRollback is not accidentally + // triggered before the abort check runs. + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + rev.CreationTimestamp = metav1.Now() + assert.NoError(t, r.client.Update(context.TODO(), &rev)) + } + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + // The fake client never increments Generation, so dda.Generation is always 0. + // Set experiment.Generation to a non-zero sentinel so the abortExperiment + // generation-mismatch check (instance.Generation != experiment.Generation) fires. + const experimentGen = int64(2) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + Generation: experimentGen, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // User manually changes the spec — in a real cluster this bumps Generation past + // experimentGen. In the fake client Generation stays at 0, which differs from + // the experimentGen sentinel above, so the mismatch is already in place. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.com") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + + reconcileN(t, r, ns, name, 1) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + // Spec should be the user's manual change, not rolled back. + assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site) + assert.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseAborted, dda.Status.Experiment.Phase) +} + +// mustGetExperimentPhase fetches the DDA and returns the experiment phase, or +// empty string if no experiment is set. Helper for readability in assertions. +func mustGetExperimentPhase(t *testing.T, r *Reconciler, ns, name string) v2alpha1.ExperimentPhase { + t.Helper() + var dda v2alpha1.DatadogAgent + assert.NoError(t, r.client.Get(context.TODO(), types.NamespacedName{Namespace: ns, Name: name}, &dda)) + if dda.Status.Experiment == nil { + return "" + } + return dda.Status.Experiment.Phase +} + +// Test_Experiment_TimeoutPhase_IsStable verifies that once phase=timeout is +// persisted, further reconciles do not change the spec or phase. +func Test_Experiment_TimeoutPhase_IsStable(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + const timeout = 50 * time.Millisecond + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, timeout) + + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + Generation: dda.Generation, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + time.Sleep(2 * timeout) + reconcileN(t, r, ns, name, 2) + + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, mustGetExperimentPhase(t, r, ns, name)) + + // Extra reconciles must not change phase or spec. + reconcileN(t, r, ns, name, 3) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site) + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, mustGetExperimentPhase(t, r, ns, name)) +} + +// Test_Experiment_RollbackPhase_IsStable verifies that once phase=rollback is +// persisted, further reconciles do not change the spec or phase. +func Test_Experiment_RollbackPhase_IsStable(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseStopped, + Generation: dda.Generation, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 2) + + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, mustGetExperimentPhase(t, r, ns, name)) + + // Extra reconciles must not change phase or spec. + reconcileN(t, r, ns, name, 3) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, mustGetExperimentPhase(t, r, ns, name)) +} + +// Test_Experiment_RunningAfterTimeout_StaleGeneration verifies that if RC +// writes phase=running after a timeout rollback has completed, the operator +// fires timeout again idempotently: the pre-experiment revision is old enough +// to exceed the timeout threshold, rollback is a no-op (spec already correct), +// and phase=timeout is written again. +func Test_Experiment_RunningAfterTimeout_StaleGeneration(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + const timeout = 50 * time.Millisecond + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, timeout) + + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + Generation: dda.Generation, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + time.Sleep(2 * timeout) + reconcileN(t, r, ns, name, 2) + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, mustGetExperimentPhase(t, r, ns, name)) + + // RC writes phase=running again with the old (pre-rollback) generation — stale. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + const staleGen = int64(99) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + Generation: staleGen, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + reconcileN(t, r, ns, name, 1) + + // The pre-experiment revision is old enough that timeout fires idempotently — + // rollback is a no-op (spec already correct) and phase=timeout is written again. + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, mustGetExperimentPhase(t, r, ns, name)) +} + +// Test_Experiment_StoppedAfterRollback verifies that if RC writes phase=stopped +// after a rollback has already completed, the idempotent rollback path handles +// it cleanly (spec unchanged, phase set to rollback again). +func Test_Experiment_StoppedAfterRollback(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseStopped, + Generation: dda.Generation, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 2) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, mustGetExperimentPhase(t, r, ns, name)) + + // RC writes phase=stopped again. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment.Phase = v2alpha1.ExperimentPhaseStopped + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + reconcileN(t, r, ns, name, 2) + + // Spec should still be the rolled-back spec; phase=rollback again. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, mustGetExperimentPhase(t, r, ns, name)) +} + +// Test_Experiment_AbortDoesNotRollback verifies that phase=aborted is a +// terminal state and does not trigger a spec restore on subsequent reconciles. +func Test_Experiment_AbortDoesNotRollback(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + // Apply experiment spec. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + // Manually force phase=aborted (as if abort already happened). + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseAborted, + Generation: dda.Generation, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + reconcileN(t, r, ns, name, 1) + + // Spec should be unchanged (experiment spec), phase still aborted. + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + assert.Equal(t, "datadoghq.eu", *dda.Spec.Global.Site, "aborted experiment must not trigger rollback") + assert.Equal(t, v2alpha1.ExperimentPhaseAborted, mustGetExperimentPhase(t, r, ns, name)) + + // Also verify the revision timestamp is not used as a proxy for time.Now() comparison. + _ = metav1.Now() +} diff --git a/internal/controller/datadogagent/controller_reconcile_v2.go b/internal/controller/datadogagent/controller_reconcile_v2.go index 1e59d6fe7d..b97a45a720 100644 --- a/internal/controller/datadogagent/controller_reconcile_v2.go +++ b/internal/controller/datadogagent/controller_reconcile_v2.go @@ -71,11 +71,17 @@ func (r *Reconciler) reconcileInstanceV3(ctx context.Context, logger logr.Logger ddaStatusCopy := instance.Status.DeepCopy() newDDAStatus := generateNewStatusFromDDA(ddaStatusCopy) - // Manage ControllerRevision snapshots if r.options.CreateControllerRevisions { - if err := r.manageRevision(ctx, instance); err != nil { + revList, err := r.listRevisions(ctx, instance) + if err != nil { return r.updateStatusIfNeededV2(logger, instance, ddaStatusCopy, result, err, now) } + if err := r.manageExperiment(ctx, instance, newDDAStatus, now, revList); err != nil { + return r.updateStatusIfNeededV2(logger, instance, newDDAStatus, result, err, now) + } + if err := r.manageRevision(ctx, instance, revList, newDDAStatus); err != nil { + return r.updateStatusIfNeededV2(logger, instance, newDDAStatus, result, err, now) + } } // Manage dependencies diff --git a/internal/controller/datadogagent/controller_reconcile_v2_common.go b/internal/controller/datadogagent/controller_reconcile_v2_common.go index ef70322ea0..89deeee536 100644 --- a/internal/controller/datadogagent/controller_reconcile_v2_common.go +++ b/internal/controller/datadogagent/controller_reconcile_v2_common.go @@ -826,5 +826,9 @@ func IsEqualStatus(current *v2alpha1.DatadogAgentStatus, newStatus *v2alpha1.Dat return false } + if !apiequality.Semantic.DeepEqual(current.Experiment, newStatus.Experiment) { + return false + } + return condition.IsEqualConditions(current.Conditions, newStatus.Conditions) } diff --git a/internal/controller/datadogagent/controller_reconcile_v2_helpers.go b/internal/controller/datadogagent/controller_reconcile_v2_helpers.go index cdbf084c33..ef0a2b209c 100644 --- a/internal/controller/datadogagent/controller_reconcile_v2_helpers.go +++ b/internal/controller/datadogagent/controller_reconcile_v2_helpers.go @@ -259,6 +259,7 @@ func generateNewStatusFromDDA(ddaStatus *datadoghqv2alpha1.DatadogAgentStatus) * if ddaStatus.RemoteConfigConfiguration != nil { status.RemoteConfigConfiguration = ddaStatus.RemoteConfigConfiguration } + status.Experiment = ddaStatus.Experiment.DeepCopy() } return status } diff --git a/internal/controller/datadogagent/experiment.go b/internal/controller/datadogagent/experiment.go new file mode 100644 index 0000000000..aa4a59c46c --- /dev/null +++ b/internal/controller/datadogagent/experiment.go @@ -0,0 +1,225 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package datadogagent + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "maps" + "time" + + appsv1 "k8s.io/api/apps/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + + v2alpha1 "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" +) + +// ExperimentDefaultTimeout is the duration after which a running experiment is automatically rolled back. +const ExperimentDefaultTimeout = 15 * time.Minute + +// manageExperiment handles all experiment state transitions for a reconcile cycle. +// Must be called before manageRevision. +func (r *Reconciler) manageExperiment( + ctx context.Context, + instance *v2alpha1.DatadogAgent, + newStatus *v2alpha1.DatadogAgentStatus, + now metav1.Time, + revList []appsv1.ControllerRevision, +) error { + experiment := instance.Status.Experiment + if experiment == nil { + return nil + } + + ctx = ctrl.LoggerInto(ctx, ctrl.LoggerFrom(ctx).WithValues("experimentID", experiment.ID)) + + if err := r.handleRollback(ctx, instance, newStatus, now, revList); err != nil { + return err + } + abortExperiment(ctx, instance.Generation, experiment, newStatus) + return nil +} + +// abortExperiment marks the experiment as aborted in newStatus if a manual spec +// change is detected (DDA generation differs from the recorded experiment generation). +// It is a no-op if handleRollback has already set a terminal phase (e.g. timeout), +// preventing spurious abort logs and phase overwrites. +func abortExperiment( + ctx context.Context, + instanceGeneration int64, + experiment *v2alpha1.ExperimentStatus, + newStatus *v2alpha1.DatadogAgentStatus, +) { + if experiment.Phase != v2alpha1.ExperimentPhaseRunning { + return + } + if newStatus.Experiment.Phase != v2alpha1.ExperimentPhaseRunning { + // handleRollback already determined a terminal phase (e.g. timeout); don't overwrite or log. + return + } + if experiment.Generation == 0 || instanceGeneration == experiment.Generation { + return + } + ctrl.LoggerFrom(ctx).Info("Aborting experiment due to manual spec change") + newStatus.Experiment.Phase = v2alpha1.ExperimentPhaseAborted +} + +// handleRollback checks if the experiment needs rollback (explicit stop or timeout). +func (r *Reconciler) handleRollback( + ctx context.Context, + instance *v2alpha1.DatadogAgent, + newStatus *v2alpha1.DatadogAgentStatus, + now metav1.Time, + revisions []appsv1.ControllerRevision, +) error { + logger := ctrl.LoggerFrom(ctx) + if instance.Status.Experiment == nil { + return nil + } + + phase := instance.Status.Experiment.Phase + + switch { + // stopped signal from RC: restore DDA spec and ack by setting phase to rollback. + case phase == v2alpha1.ExperimentPhaseStopped: + logger.Info("Experiment stopped, rolling back") + return r.restorePreviousSpec(ctx, instance.ObjectMeta, newStatus, revisions, v2alpha1.ExperimentPhaseRollback) + case phase == v2alpha1.ExperimentPhaseRunning: + rev := findMostRecentMatchingRevision(revisions, instance) + if rev != nil { + elapsed := now.Sub(rev.CreationTimestamp.Time) + if elapsed >= getExperimentTimeout(r.options.ExperimentTimeout) { + logger.Info("Experiment timed out, rolling back", "elapsed", elapsed.String()) + return r.restorePreviousSpec(ctx, instance.ObjectMeta, newStatus, revisions, v2alpha1.ExperimentPhaseTimeout) + } + } + } + + return nil +} + +// restorePreviousSpec restores the DDA spec from the previous ControllerRevision +// and, on success, sets the terminal experiment phase. +func (r *Reconciler) restorePreviousSpec( + ctx context.Context, + instanceMeta metav1.ObjectMeta, + newStatus *v2alpha1.DatadogAgentStatus, + revisions []appsv1.ControllerRevision, + terminalPhase v2alpha1.ExperimentPhase, +) error { + if err := r.rollback(ctx, instanceMeta, findRollbackTarget(revisions)); err != nil { + return err + } + newStatus.Experiment.Phase = terminalPhase + return nil +} + +// rollback restores the DDA spec from the named ControllerRevision. +func (r *Reconciler) rollback( + ctx context.Context, + instanceMeta metav1.ObjectMeta, + rollbackTarget string, +) error { + if rollbackTarget == "" { + return fmt.Errorf("no previous revision to roll back to") + } + + cr := &appsv1.ControllerRevision{} + if err := r.client.Get(ctx, types.NamespacedName{Namespace: instanceMeta.Namespace, Name: rollbackTarget}, cr); err != nil { + return fmt.Errorf("failed to get previous ControllerRevision %s: %w", rollbackTarget, err) + } + + var snapshot revisionSnapshot + if err := json.Unmarshal(cr.Data.Raw, &snapshot); err != nil { + return fmt.Errorf("failed to decode ControllerRevision data: %w", err) + } + + // Re-fetch for the latest ResourceVersion and to check whether the spec is + // rolled back already. If it is, skip the update. + current := &v2alpha1.DatadogAgent{} + if err := r.client.Get(ctx, types.NamespacedName{Namespace: instanceMeta.Namespace, Name: instanceMeta.Name}, current); err != nil { + return fmt.Errorf("failed to get current DDA for rollback: %w", err) + } + currentSnap, err := json.Marshal(revisionSnapshot{Spec: current.Spec, Annotations: datadogAnnotations(current.GetAnnotations())}) + if err != nil { + return fmt.Errorf("failed to marshal current snapshot for comparison: %w", err) + } + if bytes.Equal(currentSnap, cr.Data.Raw) { + ctrl.LoggerFrom(ctx).Info("Rollback spec already matches target, skipping update", "rollbackTarget", rollbackTarget) + return nil + } + + // Merge snapshot annotations (Datadog-only keys) on top of current annotations + // so that non-Datadog annotations (user metadata, tooling labels, etc.) are preserved. + merged := maps.Clone(current.Annotations) + maps.Copy(merged, snapshot.Annotations) + + toUpdate := &v2alpha1.DatadogAgent{ + ObjectMeta: current.ObjectMeta, + Spec: snapshot.Spec, + } + toUpdate.Annotations = merged + return r.client.Update(ctx, toUpdate) +} + +// findRollbackTarget returns the name of the previous ControllerRevision to restore. +// GC keeps at most two revisions (current and previous), so this returns whichever +// revision has the lower revision number. +func findRollbackTarget(revisions []appsv1.ControllerRevision) string { + var curRev, prevRev int64 = -1, -1 + var curName, prevName string + for i := range revisions { + rev := &revisions[i] + if rev.Revision > curRev { + prevRev, prevName = curRev, curName + curRev, curName = rev.Revision, rev.Name + } else if rev.Revision > prevRev { + prevRev, prevName = rev.Revision, rev.Name + } + } + return prevName +} + +// findMostRecentMatchingRevision returns the revision with the highest Revision number +// whose snapshot content matches the current instance spec and annotations, or nil if +// none match. This serves two purposes: +// +// - First reconcile after experiment start: the revision for the new spec has not been +// created yet, so no revision matches → nil → timeout check is skipped, preventing a +// spurious immediate timeout from an old pre-experiment revision's timestamp. +// +// - Post-rollback reconcile: the spec has been restored to the pre-experiment value. +// The matching revision is the pre-experiment one (old timestamp), so elapsed is +// large, timeout fires, and the idempotent rollback path sets phase=timeout cleanly +// without a spec-update conflict (ResourceVersion unchanged → status write succeeds). +func findMostRecentMatchingRevision(revisions []appsv1.ControllerRevision, instance *v2alpha1.DatadogAgent) *appsv1.ControllerRevision { + snap := revisionSnapshot{Spec: instance.Spec, Annotations: datadogAnnotations(instance.GetAnnotations())} + snapBytes, err := json.Marshal(snap) + if err != nil { + return nil + } + var result *appsv1.ControllerRevision + for i := range revisions { + rev := &revisions[i] + if bytes.Equal(rev.Data.Raw, snapBytes) { + if result == nil || rev.Revision > result.Revision { + result = rev + } + } + } + return result +} + +func getExperimentTimeout(timeout time.Duration) time.Duration { + if timeout == 0 { + return ExperimentDefaultTimeout + } + return timeout +} diff --git a/internal/controller/datadogagent/experiment_test.go b/internal/controller/datadogagent/experiment_test.go new file mode 100644 index 0000000000..be255f20b8 --- /dev/null +++ b/internal/controller/datadogagent/experiment_test.go @@ -0,0 +1,357 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package datadogagent + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + + v2alpha1 "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" +) + +func TestManageExperiment_AbortsOnManualChange(t *testing.T) { + r, _ := newRevisionTestReconciler(t) + + instance := newRevisionTestOwner("test-dda", "default") + instance.Generation = 3 + instance.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + instance.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + Generation: 2, + } + + status := &v2alpha1.DatadogAgentStatus{ + Experiment: instance.Status.Experiment.DeepCopy(), + } + + err := r.manageExperiment(context.Background(), instance, status, metav1.Now(), mustListRevisions(t, r, instance)) + require.NoError(t, err) + require.NotNil(t, status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseAborted, status.Experiment.Phase) +} + +func TestRollback_RestoresSpec(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Create a revision for specA. + instanceA := newRevisionTestOwner("test-dda", "default") + err := r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil) + require.NoError(t, err) + + revListA := mustListRevisions(t, r, instanceA) + require.Len(t, revListA, 1) + prevRevision := revListA[0].Name + + // Create a second revision for specB. + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil) + require.NoError(t, err) + + // rollback fetches the current DDA to compare specs; it must exist in the fake client. + require.NoError(t, c.Create(context.Background(), instanceB)) + + // Rollback from instanceB to prevRevision (specA). + require.NoError(t, r.rollback(context.Background(), instanceB.ObjectMeta, prevRevision)) +} + +func TestRollback_NoPreviousRevision(t *testing.T) { + r, _ := newRevisionTestReconciler(t) + instance := newRevisionTestOwner("test-dda", "default") + + err := r.rollback(context.Background(), instance.ObjectMeta, "") + require.Error(t, err) + assert.Contains(t, err.Error(), "no previous revision") +} + +func TestHandleRollback_StoppedPhase(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Create two revisions so we have a previous to roll back to. + instanceA := newRevisionTestOwner("test-dda", "default") + err := r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil) + require.NoError(t, err) + + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil) + require.NoError(t, err) + + instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseStopped, + } + + // rollback fetches the current DDA to compare specs; it must exist in the fake client. + require.NoError(t, c.Create(context.Background(), instanceB)) + + revList := mustListRevisions(t, r, instanceB) + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceB.Status.Experiment.DeepCopy()} + require.NoError(t, r.handleRollback(context.Background(), instanceB, newStatus, metav1.Now(), revList)) + require.NotNil(t, newStatus.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, newStatus.Experiment.Phase, "should ack stopped by setting phase to rollback") +} + +func TestRollback_PreservesNonDatadogAnnotations(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Create revision for specA with a Datadog annotation. + instanceA := newRevisionTestOwner("test-dda", "default") + instanceA.Annotations = map[string]string{ + "some.datadoghq.com/key": "old-value", + } + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) + revListA := mustListRevisions(t, r, instanceA) + require.Len(t, revListA, 1) + prevRevision := revListA[0].Name + + // instanceB is the "current" DDA: has a different Datadog annotation value, + // plus a non-Datadog annotation that should survive rollback. + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + instanceB.Annotations = map[string]string{ + "some.datadoghq.com/key": "experiment-value", + "user-tooling/key": "keep-me", + } + require.NoError(t, c.Create(context.Background(), instanceB)) + + require.NoError(t, r.rollback(context.Background(), instanceB.ObjectMeta, prevRevision)) + + updated := &v2alpha1.DatadogAgent{} + require.NoError(t, c.Get(context.Background(), types.NamespacedName{Namespace: "default", Name: "test-dda"}, updated)) + + // Datadog annotation should be restored to the snapshot value. + assert.Equal(t, "old-value", updated.Annotations["some.datadoghq.com/key"]) + // Non-Datadog annotation must be preserved. + assert.Equal(t, "keep-me", updated.Annotations["user-tooling/key"]) +} + +func TestRestorePreviousSpec_PhaseSetOnlyOnSuccess(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Create two revisions so rollback has a target. + instanceA := newRevisionTestOwner("test-dda", "default") + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) + + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil)) + + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseStopped}} + + // rollback requires the DDA to exist in the fake client; don't create it so it errors. + err := r.restorePreviousSpec(context.Background(), instanceB.ObjectMeta, newStatus, mustListRevisions(t, r, instanceB), v2alpha1.ExperimentPhaseRollback) + require.Error(t, err) + // Phase must NOT have been set since rollback failed. + assert.Equal(t, v2alpha1.ExperimentPhaseStopped, newStatus.Experiment.Phase) + + // Now create the DDA so rollback can succeed. + require.NoError(t, c.Create(context.Background(), instanceB)) + err = r.restorePreviousSpec(context.Background(), instanceB.ObjectMeta, newStatus, mustListRevisions(t, r, instanceB), v2alpha1.ExperimentPhaseRollback) + require.NoError(t, err) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, newStatus.Experiment.Phase) +} + +func TestManageExperiment_TimeoutWinsOverSpuriousAbort(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Create two revisions so rollback has a target. + instanceA := newRevisionTestOwner("test-dda", "default") + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) + + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil)) + require.NoError(t, c.Create(context.Background(), instanceB)) + + // Simulate a post-409 reconcile: phase=running, but generation was bumped by the + // rollback spec update (instanceB.Generation != experiment.Generation), AND timeout + // has elapsed. abortExperiment would fire for the generation mismatch, but + // handleRollback must win and persist phase=timeout. + instanceB.Generation = 99 + instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + Generation: 1, + } + + revList := mustListRevisions(t, r, instanceB) + for i := range revList { + if revList[i].Revision == 2 { + revList[i].CreationTimestamp = metav1.NewTime(time.Now().Add(-ExperimentDefaultTimeout - time.Minute)) + } + } + + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceB.Status.Experiment.DeepCopy()} + require.NoError(t, r.manageExperiment(context.Background(), instanceB, newStatus, metav1.Now(), revList)) + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, newStatus.Experiment.Phase) +} + +func TestHandleRollback_NoTimeoutOnFirstReconcile(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Only one revision exists — for the pre-experiment spec — with an old timestamp. + instanceA := newRevisionTestOwner("test-dda", "default") + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) + revList := mustListRevisions(t, r, instanceA) + require.Len(t, revList, 1) + revList[0].CreationTimestamp = metav1.NewTime(time.Now().Add(-ExperimentDefaultTimeout - time.Hour)) + + // instanceB has a different spec (the experiment spec); its revision hasn't been created yet. + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRunning} + require.NoError(t, c.Create(context.Background(), instanceB)) + + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceB.Status.Experiment.DeepCopy()} + // Pass the stale revList (pre-experiment revision only) — timeout must NOT fire. + require.NoError(t, r.handleRollback(context.Background(), instanceB, newStatus, metav1.Now(), revList)) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, newStatus.Experiment.Phase) +} + +// TestHandleRollback_PostRollbackSetsTimeout verifies the reconcile-2 scenario: +// the spec has already been restored to the pre-experiment value (e.g. by a +// previous reconcile whose status write 409'd), so phase is still running and +// the generation is mismatched. findMostRecentMatchingRevision finds the +// pre-experiment revision (spec matches), elapsed is large, idempotent rollback +// fires, and phase=timeout is set without a spec-update conflict. +func TestHandleRollback_PostRollbackSetsTimeout(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // rev1: pre-experiment spec (instanceA). + instanceA := newRevisionTestOwner("test-dda", "default") + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) + + // rev2: experiment spec (instanceB). + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil)) + + // The DDA in the cluster already has the rolled-back spec (instanceA's spec), + // as if reconcile-1 restored it but its status write 409'd. + // Generation is set to a realistic non-zero value: RC would have recorded + // instanceA.Generation when the experiment started. + instanceA.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + Generation: instanceA.Generation, + } + require.NoError(t, c.Create(context.Background(), instanceA)) + + // rev1 has an old timestamp — it was created well before the experiment started. + revList := mustListRevisions(t, r, instanceA) + for i := range revList { + if revList[i].Revision == 1 { + revList[i].CreationTimestamp = metav1.NewTime(time.Now().Add(-ExperimentDefaultTimeout - time.Hour)) + } + } + + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceA.Status.Experiment.DeepCopy()} + require.NoError(t, r.handleRollback(context.Background(), instanceA, newStatus, metav1.Now(), revList)) + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, newStatus.Experiment.Phase) +} + +// TestReapplySameSpecAfterRollback_NoImmediateTimeout is the end-to-end +// regression test for the stale-revision bug. +// +// Without the fix: gcOldRevisions kept the experiment revision (rev2/B) after +// rollback. When RC later re-applied spec B as a new experiment and set +// phase=Running, findMostRecentMatchingRevision found the stale rev2 — its +// CreationTimestamp predated the current experiment — so elapsed >= timeout +// fired immediately and the operator rolled back again, making the re-apply +// appear to have no effect. +// +// With the fix: gcOldRevisions deletes rev2 once phase=Rollback is persisted. +// Re-applying spec B creates a fresh revision with a current timestamp, and +// the timeout clock starts correctly. +func TestReapplySameSpecAfterRollback_NoImmediateTimeout(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Setup: create revisions for spec A (pre-experiment) and spec B (experiment). + instanceA := newRevisionTestOwner("test-dda", "default") + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) + + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil)) + require.NoError(t, c.Create(context.Background(), instanceB)) + + // Backdate rev2 (B) to simulate a long-running experiment whose revision + // timestamp is well past the timeout threshold. + revList := mustListRevisions(t, r, instanceB) + for i := range revList { + if revList[i].Revision == 2 { + revList[i].CreationTimestamp = metav1.NewTime(time.Now().Add(-ExperimentDefaultTimeout - time.Hour)) + } + } + + // Rollback: RC sets phase=Stopped; operator restores spec A. + instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseStopped} + rollbackStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceB.Status.Experiment.DeepCopy()} + require.NoError(t, r.handleRollback(context.Background(), instanceB, rollbackStatus, metav1.Now(), revList)) + require.Equal(t, v2alpha1.ExperimentPhaseRollback, rollbackStatus.Experiment.Phase) + + // Next reconcile: Rollback phase is now persisted in instance.Status. + // gcOldRevisions must delete the stale rev2 (B). + instanceA.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRollback} + rollbackNewStatus := &v2alpha1.DatadogAgentStatus{Experiment: &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRollback}} + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), rollbackNewStatus)) + + remaining := mustListRevisions(t, r, instanceA) + require.Len(t, remaining, 1, "stale experiment revision (spec B) must be deleted once Rollback phase is persisted") + + // RC re-applies spec B as a new experiment (sets spec=B, phase=Running). + // In the real reconcile loop manageExperiment (handleRollback) runs before + // manageRevision, so no revision for spec B exists at the time of the check. + // findMostRecentMatchingRevision returns nil → timeout check is skipped. + instanceB2 := newRevisionTestOwner("test-dda", "default") + instanceB2.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + instanceB2.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseRunning} + + revListBeforeNewRevision := mustListRevisions(t, r, instanceB2) + require.Len(t, revListBeforeNewRevision, 1, "only rev1 (A) should exist before the new experiment's revision is created") + + newStatus2 := &v2alpha1.DatadogAgentStatus{Experiment: instanceB2.Status.Experiment.DeepCopy()} + require.NoError(t, r.handleRollback(context.Background(), instanceB2, newStatus2, metav1.Now(), revListBeforeNewRevision)) + assert.Equal(t, v2alpha1.ExperimentPhaseRunning, newStatus2.Experiment.Phase, + "re-applying the same spec after rollback must not immediately time out") +} + +func TestHandleRollback_Timeout(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + // Create two revisions so rollback has a target. + instanceA := newRevisionTestOwner("test-dda", "default") + require.NoError(t, r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil)) + + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil)) + + // rollback fetches the current DDA to compare specs; it must exist in the fake client. + require.NoError(t, c.Create(context.Background(), instanceB)) + + instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + } + + // Simulate the most recent revision having a creation timestamp past the timeout threshold + // by modifying the in-memory revList before passing it to handleRollback. + revList := mustListRevisions(t, r, instanceB) + for i := range revList { + if revList[i].Revision == 2 { + revList[i].CreationTimestamp = metav1.NewTime(time.Now().Add(-ExperimentDefaultTimeout - time.Minute)) + } + } + + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceB.Status.Experiment.DeepCopy()} + require.NoError(t, r.handleRollback(context.Background(), instanceB, newStatus, metav1.Now(), revList)) + require.NotNil(t, newStatus.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, newStatus.Experiment.Phase) +} diff --git a/internal/controller/datadogagent/revision.go b/internal/controller/datadogagent/revision.go index 436235b2af..1ec5abe75c 100644 --- a/internal/controller/datadogagent/revision.go +++ b/internal/controller/datadogagent/revision.go @@ -31,14 +31,23 @@ type revisionSnapshot struct { Annotations map[string]string `json:"annotations,omitempty"` } -// manageRevision creates or ensures a ControllerRevision snapshot of the current -// DDA spec and GCs old revisions beyond the current and previous. -func (r *Reconciler) manageRevision(ctx context.Context, instance *v2alpha1.DatadogAgent) error { - revList, err := r.listRevisions(ctx, instance) - if err != nil { - return err +// skipRevisionBump returns true when the revision bump should be suppressed. +// During experiment rollback the spec is restored to an older revision; bumping +// its revision number to "latest" would make it appear newer than the experiment +// revision, causing findRollbackTarget to return the experiment revision on the +// next rollback attempt instead of the pre-experiment revision. +func skipRevisionBump(newStatus *v2alpha1.DatadogAgentStatus) bool { + if newStatus == nil || newStatus.Experiment == nil { + return false } - revName, err := r.ensureRevision(ctx, instance, revList) + phase := newStatus.Experiment.Phase + return phase == v2alpha1.ExperimentPhaseRollback || phase == v2alpha1.ExperimentPhaseTimeout +} + +// manageRevision creates a ControllerRevision snapshot of the current spec and +// garbage collects old revisions. Must be called after manageExperiment. +func (r *Reconciler) manageRevision(ctx context.Context, instance *v2alpha1.DatadogAgent, revList []appsv1.ControllerRevision, newStatus *v2alpha1.DatadogAgentStatus) error { + revName, err := r.ensureRevision(ctx, instance, revList, skipRevisionBump(newStatus)) if err != nil { return err } @@ -48,7 +57,7 @@ func (r *Reconciler) manageRevision(ctx context.Context, instance *v2alpha1.Data return nil } -func (r *Reconciler) listRevisions(ctx context.Context, instance *v2alpha1.DatadogAgent) (*appsv1.ControllerRevisionList, error) { +func (r *Reconciler) listRevisions(ctx context.Context, instance *v2alpha1.DatadogAgent) ([]appsv1.ControllerRevision, error) { revList := &appsv1.ControllerRevisionList{} if err := r.client.List(ctx, revList, client.InNamespace(instance.GetNamespace()), @@ -71,17 +80,19 @@ func (r *Reconciler) listRevisions(ctx context.Context, instance *v2alpha1.Datad } } revList.Items = owned - return revList, nil + return revList.Items, nil } // ensureRevision creates a ControllerRevision snapshot of the instance spec and -// annotations if it does not already exist. +// annotations if it does not already exist, and returns the revision name. // -// The Revision field is a monotonic creation counter. +// The Revision field is a monotonic creation counter. If skipBump is true the +// existing revision is returned as-is without bumping its Revision number. func (r *Reconciler) ensureRevision( ctx context.Context, instance *v2alpha1.DatadogAgent, - revList *appsv1.ControllerRevisionList, + revList []appsv1.ControllerRevision, + skipBump bool, ) (string, error) { logger := ctrl.LoggerFrom(ctx) @@ -99,8 +110,8 @@ func (r *Reconciler) ensureRevision( // Find any existing revision with identical data, and track the max Revision. var matchingRev *appsv1.ControllerRevision maxRevision := int64(0) - for i := range revList.Items { - existing := &revList.Items[i] + for i := range revList { + existing := &revList[i] if bytes.Equal(existing.Data.Raw, specBytes) { matchingRev = existing } @@ -112,7 +123,11 @@ func (r *Reconciler) ensureRevision( if matchingRev != nil { // Identical content already snapshotted. Bump Revision to max+1 if it // has been superseded (e.g. after a revert) so ordering stays correct. - if matchingRev.Revision < maxRevision { + // Skip the bump during experiment rollback: bumping the pre-experiment + // revision above the experiment revision would cause findRollbackTarget + // to select the experiment revision as the rollback target on the next + // stopped signal, reversing the rollback. + if matchingRev.Revision < maxRevision && !skipBump { objLogger := logger.WithValues( "object.kind", "ControllerRevision", "object.namespace", matchingRev.Namespace, @@ -135,9 +150,9 @@ func (r *Reconciler) ensureRevision( rev := controllerrevisions.NewControllerRevision(instance, gvks[0], labels, data, nextRevision, nil) // Check for a name conflict before creating. - existingByName := make(map[string][]byte, len(revList.Items)) - for i := range revList.Items { - existingByName[revList.Items[i].Name] = revList.Items[i].Data.Raw + existingByName := make(map[string][]byte, len(revList)) + for i := range revList { + existingByName[revList[i].Name] = revList[i].Data.Raw } if existingData, nameUsed := existingByName[rev.Name]; nameUsed { if bytes.Equal(existingData, specBytes) { @@ -181,30 +196,45 @@ func datadogAnnotations(all map[string]string) map[string]string { // gcOldRevisions deletes all but the two most recent ControllerRevisions: // the current and previous. +// +// Exception: after a rejected experiment (rollback, timeout, or abort) the +// experiment revision carries a stale CreationTimestamp. If that revision were +// kept and the same spec were re-applied as a new experiment, +// findMostRecentMatchingRevision would find it, compute an elapsed time based +// on the old timestamp, and immediately time out. To prevent this, all +// non-current revisions are deleted once the rejected phase is persisted in +// instance.Status. The check uses the persisted status (not the in-flight +// newStatus) so that the revision survives any status-write conflicts during +// the rollback itself — the deletion is intentionally deferred to the next +// reconcile cycle after the terminal phase is confirmed. func (r *Reconciler) gcOldRevisions( ctx context.Context, instance *v2alpha1.DatadogAgent, current string, - revList *appsv1.ControllerRevisionList, + revList []appsv1.ControllerRevision, ) error { logger := ctrl.LoggerFrom(ctx) // Identify the most recent non-current revision to keep as previous. + // Skip this when the persisted experiment phase is a rejected terminal + // phase — in that case we want to delete the stale experiment revision. previous := "" - previousRevision := int64(-1) - for i := range revList.Items { - rev := &revList.Items[i] - if rev.Name == current { - continue - } - if rev.Revision > previousRevision { - previousRevision = rev.Revision - previous = rev.Name + if !isRejectedExperimentPhase(instance) { + previousRevision := int64(-1) + for i := range revList { + rev := &revList[i] + if rev.Name == current { + continue + } + if rev.Revision > previousRevision { + previousRevision = rev.Revision + previous = rev.Name + } } } - for i := range revList.Items { - rev := &revList.Items[i] + for i := range revList { + rev := &revList[i] if rev.Name == current || rev.Name == previous { continue } @@ -221,3 +251,18 @@ func (r *Reconciler) gcOldRevisions( return nil } + +// isRejectedExperimentPhase returns true when the persisted experiment phase +// indicates the experiment spec was rejected — rolled back, timed out, or +// aborted. These are the cases where keeping the experiment revision would +// cause an immediate timeout if the same spec were re-applied. +func isRejectedExperimentPhase(instance *v2alpha1.DatadogAgent) bool { + if instance.Status.Experiment == nil { + return false + } + switch instance.Status.Experiment.Phase { + case v2alpha1.ExperimentPhaseRollback, v2alpha1.ExperimentPhaseTimeout, v2alpha1.ExperimentPhaseAborted: + return true + } + return false +} diff --git a/internal/controller/datadogagent/revision_test.go b/internal/controller/datadogagent/revision_test.go index 53a7ab035a..2689f6db70 100644 --- a/internal/controller/datadogagent/revision_test.go +++ b/internal/controller/datadogagent/revision_test.go @@ -53,23 +53,29 @@ func newRevisionTestReconciler(t *testing.T) (*Reconciler, client.Client) { return &Reconciler{client: c, scheme: scheme}, c } -func mustListRevisions(t *testing.T, r *Reconciler, instance *v2alpha1.DatadogAgent) *appsv1.ControllerRevisionList { +func mustListRevisions(t *testing.T, r *Reconciler, instance *v2alpha1.DatadogAgent) []appsv1.ControllerRevision { t.Helper() - revList, err := r.listRevisions(context.Background(), instance) + revisions, err := r.listRevisions(context.Background(), instance) require.NoError(t, err) - return revList + return revisions +} + +func fetchRevisionByName(t *testing.T, c client.Client, namespace, name string) *appsv1.ControllerRevision { + t.Helper() + rev := &appsv1.ControllerRevision{} + require.NoError(t, c.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: name}, rev)) + return rev } func TestEnsureRevision_CreatesOnFirstCall(t *testing.T) { r, c := newRevisionTestReconciler(t) instance := newRevisionTestOwner("test-dda", "default") - revName, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + name, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance), false) require.NoError(t, err) - assert.NotEmpty(t, revName) + assert.NotEmpty(t, name) - rev := &appsv1.ControllerRevision{} - require.NoError(t, c.Get(context.Background(), types.NamespacedName{Namespace: "default", Name: revName}, rev)) + rev := fetchRevisionByName(t, c, "default", name) assert.NotEmpty(t, rev.Data.Raw) assert.Equal(t, int64(1), rev.Revision) assert.Len(t, rev.OwnerReferences, 1) @@ -81,9 +87,9 @@ func TestEnsureRevision_Idempotent(t *testing.T) { instance := newRevisionTestOwner("test-dda", "default") instance.Annotations = map[string]string{"foo": "bar"} - name1, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + name1, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance), false) require.NoError(t, err) - name2, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + name2, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance), false) require.NoError(t, err) assert.Equal(t, name1, name2) @@ -100,9 +106,9 @@ func TestEnsureRevision_DifferentSpecsDifferentNames(t *testing.T) { instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - name1, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + name1, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) require.NoError(t, err) - name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) require.NoError(t, err) assert.NotEqual(t, name1, name2) @@ -115,9 +121,9 @@ func TestEnsureRevision_DifferentAnnotationsDifferentNames(t *testing.T) { instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Annotations = map[string]string{"feature.datadoghq.com/beta": "true"} - name1, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + name1, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) require.NoError(t, err) - name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) require.NoError(t, err) assert.NotEqual(t, name1, name2) @@ -137,9 +143,9 @@ func TestEnsureRevision_NonDatadogAnnotationsIgnored(t *testing.T) { "some-other-tool/annotation": "value", } - name1, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + name1, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) require.NoError(t, err) - name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) require.NoError(t, err) assert.Equal(t, name1, name2, "non-datadoghq annotations should not affect the revision snapshot") @@ -163,7 +169,7 @@ func TestGCOldRevisions_KeepsCurrentAndPrevious(t *testing.T) { } names := make([]string, len(instances)) for i, inst := range instances { - name, err := r.ensureRevision(context.Background(), inst, mustListRevisions(t, r, inst)) + name, err := r.ensureRevision(context.Background(), inst, mustListRevisions(t, r, inst), false) require.NoError(t, err) names[i] = name } @@ -191,18 +197,17 @@ func TestEnsureRevision_RevertBumpsRevision(t *testing.T) { instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - name1, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + name1, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) require.NoError(t, err) - _, err = r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + _, err = r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) require.NoError(t, err) // Revert to spec A — should reuse name1 but bump its Revision. - name3, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + name3, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) require.NoError(t, err) assert.Equal(t, name1, name3, "revert should reuse same CR name") - rev := &appsv1.ControllerRevision{} - require.NoError(t, c.Get(context.Background(), types.NamespacedName{Namespace: "default", Name: name1}, rev)) + rev := fetchRevisionByName(t, c, "default", name1) assert.Equal(t, int64(3), rev.Revision, "revision should be bumped to max+1") } @@ -213,9 +218,9 @@ func TestGCOldRevisions_KeepsTwoRevisions(t *testing.T) { instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - _, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA)) + _, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) require.NoError(t, err) - name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB)) + name2, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) require.NoError(t, err) err = r.gcOldRevisions(context.Background(), instanceB, name2, mustListRevisions(t, r, instanceB)) @@ -245,14 +250,13 @@ func TestEnsureRevision_RevisionNumbersMonotonic(t *testing.T) { names := make([]string, len(instances)) for i, inst := range instances { - name, err := r.ensureRevision(context.Background(), inst, mustListRevisions(t, r, inst)) + name, err := r.ensureRevision(context.Background(), inst, mustListRevisions(t, r, inst), false) require.NoError(t, err) names[i] = name } for i, name := range names { - rev := &appsv1.ControllerRevision{} - require.NoError(t, c.Get(context.Background(), types.NamespacedName{Namespace: "default", Name: name}, rev)) + rev := fetchRevisionByName(t, c, "default", name) assert.Equal(t, int64(i+1), rev.Revision, "revision %d should be %d", i, i+1) } } @@ -261,10 +265,10 @@ func TestGCOldRevisions_NoPreviousWhenOnlyCurrent(t *testing.T) { r, c := newRevisionTestReconciler(t) instance := newRevisionTestOwner("test-dda", "default") - name, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance)) + revName, err := r.ensureRevision(context.Background(), instance, mustListRevisions(t, r, instance), false) require.NoError(t, err) - err = r.gcOldRevisions(context.Background(), instance, name, mustListRevisions(t, r, instance)) + err = r.gcOldRevisions(context.Background(), instance, revName, mustListRevisions(t, r, instance)) require.NoError(t, err) revList := &appsv1.ControllerRevisionList{} @@ -281,7 +285,7 @@ func TestGCOldRevisions_DeletesMultipleOld(t *testing.T) { for i, site := range sites { inst := newRevisionTestOwner("test-dda", "default") inst.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{Site: ptr.To(site)}} - name, err := r.ensureRevision(context.Background(), inst, mustListRevisions(t, r, inst)) + name, err := r.ensureRevision(context.Background(), inst, mustListRevisions(t, r, inst), false) require.NoError(t, err) names[i] = name } @@ -309,30 +313,34 @@ func TestManageRevision_CreatesRevision(t *testing.T) { r, _ := newRevisionTestReconciler(t) instanceA := newRevisionTestOwner("test-dda", "default") - require.NoError(t, r.manageRevision(context.Background(), instanceA)) + err := r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil) + require.NoError(t, err) revList := mustListRevisions(t, r, instanceA) - assert.Len(t, revList.Items, 1, "one revision after first call") + assert.Len(t, revList, 1, "one revision after first call") // Change spec and call again — should now have current and previous. instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - require.NoError(t, r.manageRevision(context.Background(), instanceB)) + err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil) + require.NoError(t, err) revList = mustListRevisions(t, r, instanceB) - assert.Len(t, revList.Items, 2, "current and previous kept after spec change") + assert.Len(t, revList, 2, "current and previous kept after spec change") } func TestManageRevision_Idempotent(t *testing.T) { r, _ := newRevisionTestReconciler(t) instance := newRevisionTestOwner("test-dda", "default") - require.NoError(t, r.manageRevision(context.Background(), instance)) - require.NoError(t, r.manageRevision(context.Background(), instance)) + err := r.manageRevision(context.Background(), instance, mustListRevisions(t, r, instance), nil) + require.NoError(t, err) + err = r.manageRevision(context.Background(), instance, mustListRevisions(t, r, instance), nil) + require.NoError(t, err) revList := mustListRevisions(t, r, instance) - assert.Len(t, revList.Items, 1, "idempotent calls should not create duplicate revisions") + assert.Len(t, revList, 1, "idempotent calls should not create duplicate revisions") } // TestManageRevision_CurrentDeletedIsRecreated verifies that if the current @@ -342,24 +350,26 @@ func TestManageRevision_CurrentDeletedIsRecreated(t *testing.T) { r, c := newRevisionTestReconciler(t) instance := newRevisionTestOwner("test-dda", "default") - require.NoError(t, r.manageRevision(context.Background(), instance)) + err := r.manageRevision(context.Background(), instance, mustListRevisions(t, r, instance), nil) + require.NoError(t, err) revList := mustListRevisions(t, r, instance) - require.Len(t, revList.Items, 1) - deletedName := revList.Items[0].Name + require.Len(t, revList, 1) + deletedName := revList[0].Name // Simulate manual deletion. - require.NoError(t, c.Delete(context.Background(), &revList.Items[0])) + require.NoError(t, c.Delete(context.Background(), &revList[0])) // Next reconcile should re-create the revision. - require.NoError(t, r.manageRevision(context.Background(), instance)) + err = r.manageRevision(context.Background(), instance, mustListRevisions(t, r, instance), nil) + require.NoError(t, err) revList = mustListRevisions(t, r, instance) - require.Len(t, revList.Items, 1) + require.Len(t, revList, 1) // Content is identical so the name (hash-based) should be the same. - assert.Equal(t, deletedName, revList.Items[0].Name, "re-created revision should have the same name") - assert.Equal(t, int64(1), revList.Items[0].Revision) + assert.Equal(t, deletedName, revList[0].Name, "re-created revision should have the same name") + assert.Equal(t, int64(1), revList[0].Revision) } // TestManageRevision_PreviousDeletedContinuesNormally verifies that if the @@ -372,15 +382,17 @@ func TestManageRevision_PreviousDeletedContinuesNormally(t *testing.T) { instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} - require.NoError(t, r.manageRevision(context.Background(), instanceA)) - require.NoError(t, r.manageRevision(context.Background(), instanceB)) + err := r.manageRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), nil) + require.NoError(t, err) + err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil) + require.NoError(t, err) revList := mustListRevisions(t, r, instanceB) - require.Len(t, revList.Items, 2) + require.Len(t, revList, 2) // Identify and delete the previous (lower Revision number). var prev appsv1.ControllerRevision - for _, rev := range revList.Items { + for _, rev := range revList { if prev.Name == "" || rev.Revision < prev.Revision { prev = rev } @@ -388,11 +400,89 @@ func TestManageRevision_PreviousDeletedContinuesNormally(t *testing.T) { require.NoError(t, c.Delete(context.Background(), &prev)) // Next reconcile on the same spec should succeed and keep only the current. - require.NoError(t, r.manageRevision(context.Background(), instanceB)) + err = r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil) + require.NoError(t, err) revList = mustListRevisions(t, r, instanceB) - assert.Len(t, revList.Items, 1, "only current should remain after previous was deleted") - assert.Equal(t, int64(2), revList.Items[0].Revision) + assert.Len(t, revList, 1, "only current should remain after previous was deleted") + assert.Equal(t, int64(2), revList[0].Revision) +} + +// TestGCOldRevisions_DeletesPreviousAfterRejectedExperiment verifies that when +// the persisted experiment phase is a rejected terminal phase (Rollback, Timeout, +// or Aborted), gcOldRevisions deletes the stale experiment revision instead of +// keeping it as "previous". This prevents an immediate timeout if the same spec +// is re-applied as a new experiment. +func TestGCOldRevisions_DeletesPreviousAfterRejectedExperiment(t *testing.T) { + rejectedPhases := []v2alpha1.ExperimentPhase{ + v2alpha1.ExperimentPhaseRollback, + v2alpha1.ExperimentPhaseTimeout, + v2alpha1.ExperimentPhaseAborted, + } + + for _, phase := range rejectedPhases { + t.Run(string(phase), func(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + instanceA := newRevisionTestOwner("test-dda", "default") + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + + nameA, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) + require.NoError(t, err) + _, err = r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) + require.NoError(t, err) + + // Simulate the persisted status having a rejected terminal phase. + // current is instanceA (spec restored after rollback). + instanceA.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: phase} + + err = r.gcOldRevisions(context.Background(), instanceA, nameA, mustListRevisions(t, r, instanceA)) + require.NoError(t, err) + + revList := &appsv1.ControllerRevisionList{} + require.NoError(t, c.List(context.Background(), revList)) + assert.Len(t, revList.Items, 1, "experiment revision should be deleted after rejected phase") + assert.Equal(t, nameA, revList.Items[0].Name, "only the current (pre-experiment) revision should remain") + }) + } +} + +// TestGCOldRevisions_KeepsPreviousForNonRejectedPhases verifies that for +// non-rejected experiment phases (Running, Promoted, or nil), the normal +// "keep current + previous" behavior is preserved. +func TestGCOldRevisions_KeepsPreviousForNonRejectedPhases(t *testing.T) { + nonRejectedPhases := []v2alpha1.ExperimentPhase{ + v2alpha1.ExperimentPhaseRunning, + v2alpha1.ExperimentPhasePromoted, + "", // nil experiment + } + + for _, phase := range nonRejectedPhases { + t.Run(string(phase)+"_or_nil", func(t *testing.T) { + r, c := newRevisionTestReconciler(t) + + instanceA := newRevisionTestOwner("test-dda", "default") + instanceB := newRevisionTestOwner("test-dda", "default") + instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + + _, err := r.ensureRevision(context.Background(), instanceA, mustListRevisions(t, r, instanceA), false) + require.NoError(t, err) + nameB, err := r.ensureRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), false) + require.NoError(t, err) + + if phase != "" { + instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: phase} + } + + err = r.gcOldRevisions(context.Background(), instanceB, nameB, mustListRevisions(t, r, instanceB)) + require.NoError(t, err) + + revList := &appsv1.ControllerRevisionList{} + require.NoError(t, c.List(context.Background(), revList)) + assert.Len(t, revList.Items, 2, "current and previous should both be kept for non-rejected phase") + }) + } } // TestListRevisions_ExcludesForeignOwner verifies that a ControllerRevision @@ -428,5 +518,5 @@ func TestListRevisions_ExcludesForeignOwner(t *testing.T) { revList, err := r.listRevisions(context.Background(), current) require.NoError(t, err) - assert.Empty(t, revList.Items, "foreign revision should be excluded by UID filter") + assert.Empty(t, revList, "foreign revision should be excluded by UID filter") } diff --git a/internal/controller/datadogagent_controller.go b/internal/controller/datadogagent_controller.go index be385e1a9e..8a10f64efb 100644 --- a/internal/controller/datadogagent_controller.go +++ b/internal/controller/datadogagent_controller.go @@ -8,6 +8,7 @@ package controller import ( "context" "reflect" + "strings" edsdatadoghqv1alpha1 "github.com/DataDog/extendeddaemonset/api/v1alpha1" "github.com/go-logr/logr" @@ -308,7 +309,7 @@ func (r *DatadogAgentReconciler) SetupWithManager(mgr ctrl.Manager, metricForwar } or := reconcile.AsReconciler[*v2alpha1.DatadogAgent](r.Client, r) - if err := builder.For(&v2alpha1.DatadogAgent{}, builderOptions...).WithEventFilter(predicate.Or(predicate.GenerationChangedPredicate{}, predicate.AnnotationChangedPredicate{})).Complete(or); err != nil { + if err := builder.For(&v2alpha1.DatadogAgent{}, builderOptions...).WithEventFilter(predicate.Or(predicate.GenerationChangedPredicate{}, datadogAnnotationChangedPredicate(), experimentPhaseChangedPredicate())).Complete(or); err != nil { return err } @@ -321,6 +322,69 @@ func (r *DatadogAgentReconciler) SetupWithManager(mgr ctrl.Manager, metricForwar return nil } +// datadogAnnotationChangedPredicate returns a predicate that triggers reconciliation +// only when annotations containing ".datadoghq.com/" change, ignoring unrelated annotation updates. +func datadogAnnotationChangedPredicate() predicate.Predicate { + hasDDAnnotation := func(annotations map[string]string) bool { + for k := range annotations { + if strings.Contains(k, ".datadoghq.com/") { + return true + } + } + return false + } + return predicate.Funcs{ + UpdateFunc: func(e event.UpdateEvent) bool { + oldAnnotations := e.ObjectOld.GetAnnotations() + newAnnotations := e.ObjectNew.GetAnnotations() + for k, v := range newAnnotations { + if strings.Contains(k, ".datadoghq.com/") && oldAnnotations[k] != v { + return true + } + } + for k := range oldAnnotations { + if strings.Contains(k, ".datadoghq.com/") { + if _, exists := newAnnotations[k]; !exists { + return true + } + } + } + return false + }, + CreateFunc: func(e event.CreateEvent) bool { return hasDDAnnotation(e.Object.GetAnnotations()) }, + DeleteFunc: func(e event.DeleteEvent) bool { return true }, + GenericFunc: func(e event.GenericEvent) bool { return hasDDAnnotation(e.Object.GetAnnotations()) }, + } +} + +// experimentPhaseChangedPredicate returns a predicate that triggers reconciliation +// when the experiment phase in the DDA status changes. This allows the operator to +// react to RC signals (e.g. phase=stopped) which are status-only changes that do not +// bump metadata.generation. +func experimentPhaseChangedPredicate() predicate.Predicate { + return predicate.Funcs{ + UpdateFunc: func(e event.UpdateEvent) bool { + oldDDA, ok1 := e.ObjectOld.(*v2alpha1.DatadogAgent) + newDDA, ok2 := e.ObjectNew.(*v2alpha1.DatadogAgent) + if !ok1 || !ok2 { + return false + } + oldPhase := "" + if oldDDA.Status.Experiment != nil { + oldPhase = string(oldDDA.Status.Experiment.Phase) + } + newPhase := "" + if newDDA.Status.Experiment != nil { + newPhase = string(newDDA.Status.Experiment.Phase) + } + return oldPhase != newPhase + }, + CreateFunc: func(e event.CreateEvent) bool { return false }, + DeleteFunc: func(e event.DeleteEvent) bool { return false }, + GenericFunc: func(e event.GenericEvent) bool { return false }, + } +} + func enqueueIfOwnedByDatadogAgent(ctx context.Context, obj client.Object) []reconcile.Request { labels := obj.GetLabels()