From c9c2242c253d9fba6f6aa3af4c0bee6d437c2b58 Mon Sep 17 00:00:00 2001 From: Abir Abbas Date: Mon, 20 Apr 2026 09:31:19 -0400 Subject: [PATCH] fix(control-plane): rescue nodes stuck in lifecycle_status=starting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agents that register and then send heartbeats indefinitely with status="starting" (notably the Python SDK, whose _current_status is initialized to STARTING and only ever transitions to OFFLINE on shutdown) were left wedged in lifecycle_status="starting" forever: - needsReconciliation() only fired for stuck-starting agents when their heartbeat was ALSO stale, which never happens for a healthy agent heartbeating every 2s. - reconcileAgentStatus() only promoted empty/offline → ready; it preserved "starting" even when the heartbeat was fresh. - The UpdateAgentStatus auto-sync also only promoted offline/empty → ready when state flipped to Active, so a successful HTTP health check couldn't pull an agent out of "starting" either. - Every "starting" heartbeat from the SDK re-asserted lifecycle_status= "starting" via UpdateFromHeartbeat, clobbering any promotion. This patch: - Adds a reconciliation rule for agents stuck in "starting" past MaxTransitionTime since RegisteredAt with a FRESH heartbeat — the fresh heartbeat proves liveness, registration age proves startup is done. - Promotes "starting" → "ready" in reconcileAgentStatus when the heartbeat is fresh. - Promotes "starting" → "ready" in the UpdateAgentStatus auto-sync when state transitions to Active (e.g. successful HTTP health check). - Guards UpdateFromHeartbeat so "starting" heartbeats don't regress an already-promoted "ready"/"degraded" agent. Adds three tests covering the full scenario end-to-end: reconciliation rescues the stuck node, repeated "starting" heartbeats do not regress it, and health-check-driven Active state also promotes "starting" → "ready". Fixes #484 --- .../internal/services/status_manager.go | 51 +++++++- .../internal/services/status_manager_test.go | 118 +++++++++++++++++- 2 files changed, 163 insertions(+), 6 deletions(-) diff --git a/control-plane/internal/services/status_manager.go b/control-plane/internal/services/status_manager.go index 30c101e9..dbe1e645 100644 --- a/control-plane/internal/services/status_manager.go +++ b/control-plane/internal/services/status_manager.go @@ -353,8 +353,16 @@ func (sm *StatusManager) UpdateAgentStatus(ctx context.Context, nodeID string, u newStatus.LifecycleStatus = types.AgentStatusOffline } case types.AgentStateActive: - // Agent is coming online - set lifecycle to ready if it was offline - if newStatus.LifecycleStatus == types.AgentStatusOffline || newStatus.LifecycleStatus == "" { + // Agent is coming online - set lifecycle to ready if it was offline, + // empty, or stuck in "starting". Once the agent is confirmed active + // (via heartbeat priority or successful HTTP health check), we should + // advance it out of "starting" — otherwise SDKs that never explicitly + // transition to "ready" (e.g. the Python SDK, which only ever sends + // status="starting" in its enhanced heartbeats) leave the agent wedged + // in "starting" indefinitely. See issue #484. + if newStatus.LifecycleStatus == types.AgentStatusOffline || + newStatus.LifecycleStatus == "" || + newStatus.LifecycleStatus == types.AgentStatusStarting { newStatus.LifecycleStatus = types.AgentStatusReady } case types.AgentStateStarting: @@ -444,6 +452,21 @@ func (sm *StatusManager) UpdateFromHeartbeat(ctx context.Context, nodeID string, // The health monitor requires consecutive failures before marking inactive, // so there is no need to suppress heartbeats here. + // Don't let a "starting" heartbeat regress an agent that has already been + // promoted to "ready" or "degraded". Some SDKs (notably the Python SDK prior + // to 0.1.69) never transition their internal status out of "starting", and + // every enhanced heartbeat carries status="starting". Without this guard, + // each heartbeat would clobber the promoted lifecycle status and the agent + // would oscillate between "starting" and whatever reconciliation promoted it + // to. The heartbeat itself is still processed (LastSeen/State refresh), we + // just ignore the regressive lifecycle signal. See issue #484. + if lifecycleStatus != nil && *lifecycleStatus == types.AgentStatusStarting { + switch currentStatus.LifecycleStatus { + case types.AgentStatusReady, types.AgentStatusDegraded: + lifecycleStatus = nil + } + } + // Update from heartbeat currentStatus.UpdateFromHeartbeat(lifecycleStatus) @@ -741,6 +764,21 @@ func (sm *StatusManager) needsReconciliation(agent *types.AgentNode) bool { return true } + // Agents stuck in "starting" with a FRESH heartbeat past the startup grace period + // must also be reconciled. This is the common case behind issue #484: SDKs that + // send heartbeats but never explicitly transition out of "starting" (e.g. the + // Python SDK, whose enhanced heartbeats always carry status="starting"). The fresh + // heartbeat proves the agent is alive, and time-since-registration past the grace + // period proves it has finished starting up — without this rule, such agents stay + // wedged in "starting" indefinitely, and the staleness branch above never fires + // because the heartbeat is always fresh. + if agent.LifecycleStatus == types.AgentStatusStarting && + timeSinceHeartbeat <= sm.config.HeartbeatStaleThreshold && + !agent.RegisteredAt.IsZero() && + time.Since(agent.RegisteredAt) > sm.config.MaxTransitionTime { + return true + } + return false } @@ -757,7 +795,14 @@ func (sm *StatusManager) reconcileAgentStatus(ctx context.Context, agent *types. newLifecycleStatus = types.AgentStatusOffline } else { newHealthStatus = types.HealthStatusActive - if agent.LifecycleStatus == "" || agent.LifecycleStatus == types.AgentStatusOffline { + // Promote empty/offline/stuck-starting agents to ready. Before the fix for + // issue #484, "starting" was preserved here, which meant agents that + // reconciliation *should* have rescued (fresh heartbeat, registered past the + // grace period, still in "starting") got re-saved as "starting" and stayed + // stuck forever. + if agent.LifecycleStatus == "" || + agent.LifecycleStatus == types.AgentStatusOffline || + agent.LifecycleStatus == types.AgentStatusStarting { newLifecycleStatus = types.AgentStatusReady } else { newLifecycleStatus = agent.LifecycleStatus diff --git a/control-plane/internal/services/status_manager_test.go b/control-plane/internal/services/status_manager_test.go index b219bae7..d20a3996 100644 --- a/control-plane/internal/services/status_manager_test.go +++ b/control-plane/internal/services/status_manager_test.go @@ -568,13 +568,125 @@ func TestStatusManager_Reconciliation_UsesConfiguredThreshold(t *testing.T) { assert.True(t, sm.needsReconciliation(stuckStartingAgent), "Agent stuck in 'starting' beyond MaxTransitionTime should need reconciliation") - // Agent in "starting" with recent heartbeat — should NOT need reconciliation (still initializing) + // Agent recently registered and still in "starting" with a recent heartbeat — should NOT + // need reconciliation (still within the startup grace period). freshStartingAgent := &types.AgentNode{ ID: "node-fresh-starting", HealthStatus: types.HealthStatusUnknown, LifecycleStatus: types.AgentStatusStarting, - LastHeartbeat: time.Now().Add(-30 * time.Second), + RegisteredAt: time.Now().Add(-30 * time.Second), + LastHeartbeat: time.Now().Add(-2 * time.Second), } assert.False(t, sm.needsReconciliation(freshStartingAgent), - "Agent in 'starting' with recent heartbeat should not need reconciliation yet") + "Agent registered 30s ago in 'starting' with fresh heartbeat should be within startup grace") + + // Issue #484: Agent registered long ago, still in "starting", but sending fresh heartbeats. + // This is the SDK-never-transitions-to-ready case — reconciliation MUST rescue it. + stuckStartingFreshHeartbeat := &types.AgentNode{ + ID: "node-stuck-starting-fresh-hb", + HealthStatus: types.HealthStatusUnknown, + LifecycleStatus: types.AgentStatusStarting, + RegisteredAt: time.Now().Add(-10 * time.Minute), + LastHeartbeat: time.Now().Add(-2 * time.Second), + } + assert.True(t, sm.needsReconciliation(stuckStartingFreshHeartbeat), + "Agent past startup grace with fresh heartbeat but still 'starting' should need reconciliation (issue #484)") +} + +// TestStatusManager_StuckStartingIsReconciledToReady reproduces issue #484 end-to-end: +// an agent registers, sends heartbeats indefinitely with status="starting" (the Python SDK's +// default, since it never transitions _current_status to READY), and is expected to be +// promoted to "ready" by the reconciliation loop once past the startup grace period — then +// stay "ready" across subsequent "starting" heartbeats. +func TestStatusManager_StuckStartingIsReconciledToReady(t *testing.T) { + provider, ctx := setupStatusManagerStorage(t) + + // Register an agent that registered 10 minutes ago (long past any reasonable + // startup grace period) and is still in "starting" with a fresh heartbeat. + node := &types.AgentNode{ + ID: "stuck-starter", + TeamID: "team", + BaseURL: "http://localhost", + Version: "1.0.0", + HealthStatus: types.HealthStatusUnknown, + LifecycleStatus: types.AgentStatusStarting, + RegisteredAt: time.Now().Add(-10 * time.Minute), + LastHeartbeat: time.Now().Add(-1 * time.Second), + Reasoners: []types.ReasonerDefinition{}, + Skills: []types.SkillDefinition{{ID: "greet"}}, + } + require.NoError(t, provider.RegisterAgent(ctx, node)) + + // Use short timings so the test is deterministic. + sm := NewStatusManager(provider, StatusManagerConfig{ + ReconcileInterval: 30 * time.Second, + HeartbeatStaleThreshold: 60 * time.Second, + MaxTransitionTime: 2 * time.Minute, + }, nil, nil) + + // Sanity: the agent is indeed stuck and needs reconciliation. + persisted, err := provider.GetAgent(ctx, "stuck-starter") + require.NoError(t, err) + require.Equal(t, types.AgentStatusStarting, persisted.LifecycleStatus) + require.True(t, sm.needsReconciliation(persisted), + "Agent registered past grace period with fresh heartbeat should need reconciliation") + + // Reconciliation should promote "starting" → "ready". + sm.performReconciliation() + + promoted, err := provider.GetAgent(ctx, "stuck-starter") + require.NoError(t, err) + assert.Equal(t, types.AgentStatusReady, promoted.LifecycleStatus, + "Reconciliation must promote stuck 'starting' with fresh heartbeat to 'ready' (issue #484)") + + // Now simulate what the Python SDK does: keep sending heartbeats with + // status="starting". These must NOT regress the lifecycle status back to + // "starting" — otherwise the agent would oscillate forever. + starting := types.AgentStatusStarting + for i := 0; i < 5; i++ { + require.NoError(t, sm.UpdateFromHeartbeat(ctx, "stuck-starter", &starting, "")) + } + + stable, err := provider.GetAgent(ctx, "stuck-starter") + require.NoError(t, err) + assert.Equal(t, types.AgentStatusReady, stable.LifecycleStatus, + "Subsequent heartbeats carrying status='starting' must not regress a promoted agent (issue #484)") +} + +// TestStatusManager_UpdateAgentStatus_ActivePromotesStarting verifies the other half of the +// fix: when the health monitor marks an agent active (e.g. a successful HTTP /status check), +// the lifecycle status should be promoted out of "starting" too — not only out of +// offline/empty as before. +func TestStatusManager_UpdateAgentStatus_ActivePromotesStarting(t *testing.T) { + provider, ctx := setupStatusManagerStorage(t) + + node := &types.AgentNode{ + ID: "active-transition", + TeamID: "team", + BaseURL: "http://localhost", + Version: "1.0.0", + HealthStatus: types.HealthStatusUnknown, + LifecycleStatus: types.AgentStatusStarting, + RegisteredAt: time.Now().Add(-5 * time.Minute), + LastHeartbeat: time.Now(), + Reasoners: []types.ReasonerDefinition{}, + Skills: []types.SkillDefinition{}, + } + require.NoError(t, provider.RegisterAgent(ctx, node)) + + sm := NewStatusManager(provider, StatusManagerConfig{}, nil, nil) + + // Simulate the health monitor marking the agent active (what happens after a + // successful HTTP health check). + active := types.AgentStateActive + require.NoError(t, sm.UpdateAgentStatus(ctx, "active-transition", &types.AgentStatusUpdate{ + State: &active, + Source: types.StatusSourceHealthCheck, + Reason: "HTTP /status succeeded", + })) + + after, err := provider.GetAgent(ctx, "active-transition") + require.NoError(t, err) + assert.Equal(t, types.AgentStatusReady, after.LifecycleStatus, + "Transitioning to AgentStateActive must promote 'starting' → 'ready' (issue #484)") }