From c9c2242c253d9fba6f6aa3af4c0bee6d437c2b58 Mon Sep 17 00:00:00 2001
From: Abir Abbas <abirabbas1998@gmail.com>
Date: Mon, 20 Apr 2026 09:31:19 -0400
Subject: [PATCH] fix(control-plane): rescue nodes stuck in
 lifecycle_status=starting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Agents that register and then send heartbeats indefinitely with
status="starting" (notably the Python SDK, whose _current_status is
initialized to STARTING and only ever transitions to OFFLINE on shutdown)
were left wedged in lifecycle_status="starting" forever:

- needsReconciliation() only fired for stuck-starting agents when their
  heartbeat was ALSO stale, which never happens for a healthy agent
  heartbeating every 2s.
- reconcileAgentStatus() only promoted empty/offline → ready; it preserved
  "starting" even when the heartbeat was fresh.
- The UpdateAgentStatus auto-sync also only promoted offline/empty → ready
  when state flipped to Active, so a successful HTTP health check couldn't
  pull an agent out of "starting" either.
- Every "starting" heartbeat from the SDK re-asserted lifecycle_status=
  "starting" via UpdateFromHeartbeat, clobbering any promotion.

This patch:

- Adds a reconciliation rule for agents stuck in "starting" past
  MaxTransitionTime since RegisteredAt with a FRESH heartbeat — the
  fresh heartbeat proves liveness, registration age proves startup is done.
- Promotes "starting" → "ready" in reconcileAgentStatus when the heartbeat
  is fresh.
- Promotes "starting" → "ready" in the UpdateAgentStatus auto-sync when
  state transitions to Active (e.g. successful HTTP health check).
- Guards UpdateFromHeartbeat so "starting" heartbeats don't regress an
  already-promoted "ready"/"degraded" agent.

Adds three tests covering the full scenario end-to-end: reconciliation
rescues the stuck node, repeated "starting" heartbeats do not regress it,
and health-check-driven Active state also promotes "starting" → "ready".

Fixes #484
---
 .../internal/services/status_manager.go       |  51 +++++++-
 .../internal/services/status_manager_test.go  | 118 +++++++++++++++++-
 2 files changed, 163 insertions(+), 6 deletions(-)

diff --git a/control-plane/internal/services/status_manager.go b/control-plane/internal/services/status_manager.go
index 30c101e9..dbe1e645 100644
--- a/control-plane/internal/services/status_manager.go
+++ b/control-plane/internal/services/status_manager.go
@@ -353,8 +353,16 @@ func (sm *StatusManager) UpdateAgentStatus(ctx context.Context, nodeID string, u
 					newStatus.LifecycleStatus = types.AgentStatusOffline
 				}
 			case types.AgentStateActive:
-				// Agent is coming online - set lifecycle to ready if it was offline
-				if newStatus.LifecycleStatus == types.AgentStatusOffline || newStatus.LifecycleStatus == "" {
+				// Agent is coming online - set lifecycle to ready if it was offline,
+				// empty, or stuck in "starting". Once the agent is confirmed active
+				// (via heartbeat priority or successful HTTP health check), we should
+				// advance it out of "starting" — otherwise SDKs that never explicitly
+				// transition to "ready" (e.g. the Python SDK, which only ever sends
+				// status="starting" in its enhanced heartbeats) leave the agent wedged
+				// in "starting" indefinitely. See issue #484.
+				if newStatus.LifecycleStatus == types.AgentStatusOffline ||
+					newStatus.LifecycleStatus == "" ||
+					newStatus.LifecycleStatus == types.AgentStatusStarting {
 					newStatus.LifecycleStatus = types.AgentStatusReady
 				}
 			case types.AgentStateStarting:
@@ -444,6 +452,21 @@ func (sm *StatusManager) UpdateFromHeartbeat(ctx context.Context, nodeID string,
 	// The health monitor requires consecutive failures before marking inactive,
 	// so there is no need to suppress heartbeats here.
 
+	// Don't let a "starting" heartbeat regress an agent that has already been
+	// promoted to "ready" or "degraded". Some SDKs (notably the Python SDK prior
+	// to 0.1.69) never transition their internal status out of "starting", and
+	// every enhanced heartbeat carries status="starting". Without this guard,
+	// each heartbeat would clobber the promoted lifecycle status and the agent
+	// would oscillate between "starting" and whatever reconciliation promoted it
+	// to. The heartbeat itself is still processed (LastSeen/State refresh), we
+	// just ignore the regressive lifecycle signal. See issue #484.
+	if lifecycleStatus != nil && *lifecycleStatus == types.AgentStatusStarting {
+		switch currentStatus.LifecycleStatus {
+		case types.AgentStatusReady, types.AgentStatusDegraded:
+			lifecycleStatus = nil
+		}
+	}
+
 	// Update from heartbeat
 	currentStatus.UpdateFromHeartbeat(lifecycleStatus)
 
@@ -741,6 +764,21 @@ func (sm *StatusManager) needsReconciliation(agent *types.AgentNode) bool {
 		return true
 	}
 
+	// Agents stuck in "starting" with a FRESH heartbeat past the startup grace period
+	// must also be reconciled. This is the common case behind issue #484: SDKs that
+	// send heartbeats but never explicitly transition out of "starting" (e.g. the
+	// Python SDK, whose enhanced heartbeats always carry status="starting"). The fresh
+	// heartbeat proves the agent is alive, and time-since-registration past the grace
+	// period proves it has finished starting up — without this rule, such agents stay
+	// wedged in "starting" indefinitely, and the staleness branch above never fires
+	// because the heartbeat is always fresh.
+	if agent.LifecycleStatus == types.AgentStatusStarting &&
+		timeSinceHeartbeat <= sm.config.HeartbeatStaleThreshold &&
+		!agent.RegisteredAt.IsZero() &&
+		time.Since(agent.RegisteredAt) > sm.config.MaxTransitionTime {
+		return true
+	}
+
 	return false
 }
 
@@ -757,7 +795,14 @@ func (sm *StatusManager) reconcileAgentStatus(ctx context.Context, agent *types.
 		newLifecycleStatus = types.AgentStatusOffline
 	} else {
 		newHealthStatus = types.HealthStatusActive
-		if agent.LifecycleStatus == "" || agent.LifecycleStatus == types.AgentStatusOffline {
+		// Promote empty/offline/stuck-starting agents to ready. Before the fix for
+		// issue #484, "starting" was preserved here, which meant agents that
+		// reconciliation *should* have rescued (fresh heartbeat, registered past the
+		// grace period, still in "starting") got re-saved as "starting" and stayed
+		// stuck forever.
+		if agent.LifecycleStatus == "" ||
+			agent.LifecycleStatus == types.AgentStatusOffline ||
+			agent.LifecycleStatus == types.AgentStatusStarting {
 			newLifecycleStatus = types.AgentStatusReady
 		} else {
 			newLifecycleStatus = agent.LifecycleStatus
diff --git a/control-plane/internal/services/status_manager_test.go b/control-plane/internal/services/status_manager_test.go
index b219bae7..d20a3996 100644
--- a/control-plane/internal/services/status_manager_test.go
+++ b/control-plane/internal/services/status_manager_test.go
@@ -568,13 +568,125 @@ func TestStatusManager_Reconciliation_UsesConfiguredThreshold(t *testing.T) {
 	assert.True(t, sm.needsReconciliation(stuckStartingAgent),
 		"Agent stuck in 'starting' beyond MaxTransitionTime should need reconciliation")
 
-	// Agent in "starting" with recent heartbeat — should NOT need reconciliation (still initializing)
+	// Agent recently registered and still in "starting" with a recent heartbeat — should NOT
+	// need reconciliation (still within the startup grace period).
 	freshStartingAgent := &types.AgentNode{
 		ID:              "node-fresh-starting",
 		HealthStatus:    types.HealthStatusUnknown,
 		LifecycleStatus: types.AgentStatusStarting,
-		LastHeartbeat:   time.Now().Add(-30 * time.Second),
+		RegisteredAt:    time.Now().Add(-30 * time.Second),
+		LastHeartbeat:   time.Now().Add(-2 * time.Second),
 	}
 	assert.False(t, sm.needsReconciliation(freshStartingAgent),
-		"Agent in 'starting' with recent heartbeat should not need reconciliation yet")
+		"Agent registered 30s ago in 'starting' with fresh heartbeat should be within startup grace")
+
+	// Issue #484: Agent registered long ago, still in "starting", but sending fresh heartbeats.
+	// This is the SDK-never-transitions-to-ready case — reconciliation MUST rescue it.
+	stuckStartingFreshHeartbeat := &types.AgentNode{
+		ID:              "node-stuck-starting-fresh-hb",
+		HealthStatus:    types.HealthStatusUnknown,
+		LifecycleStatus: types.AgentStatusStarting,
+		RegisteredAt:    time.Now().Add(-10 * time.Minute),
+		LastHeartbeat:   time.Now().Add(-2 * time.Second),
+	}
+	assert.True(t, sm.needsReconciliation(stuckStartingFreshHeartbeat),
+		"Agent past startup grace with fresh heartbeat but still 'starting' should need reconciliation (issue #484)")
+}
+
+// TestStatusManager_StuckStartingIsReconciledToReady reproduces issue #484 end-to-end:
+// an agent registers, sends heartbeats indefinitely with status="starting" (the Python SDK's
+// default, since it never transitions _current_status to READY), and is expected to be
+// promoted to "ready" by the reconciliation loop once past the startup grace period — then
+// stay "ready" across subsequent "starting" heartbeats.
+func TestStatusManager_StuckStartingIsReconciledToReady(t *testing.T) {
+	provider, ctx := setupStatusManagerStorage(t)
+
+	// Register an agent that registered 10 minutes ago (long past any reasonable
+	// startup grace period) and is still in "starting" with a fresh heartbeat.
+	node := &types.AgentNode{
+		ID:              "stuck-starter",
+		TeamID:          "team",
+		BaseURL:         "http://localhost",
+		Version:         "1.0.0",
+		HealthStatus:    types.HealthStatusUnknown,
+		LifecycleStatus: types.AgentStatusStarting,
+		RegisteredAt:    time.Now().Add(-10 * time.Minute),
+		LastHeartbeat:   time.Now().Add(-1 * time.Second),
+		Reasoners:       []types.ReasonerDefinition{},
+		Skills:          []types.SkillDefinition{{ID: "greet"}},
+	}
+	require.NoError(t, provider.RegisterAgent(ctx, node))
+
+	// Use short timings so the test is deterministic.
+	sm := NewStatusManager(provider, StatusManagerConfig{
+		ReconcileInterval:       30 * time.Second,
+		HeartbeatStaleThreshold: 60 * time.Second,
+		MaxTransitionTime:       2 * time.Minute,
+	}, nil, nil)
+
+	// Sanity: the agent is indeed stuck and needs reconciliation.
+	persisted, err := provider.GetAgent(ctx, "stuck-starter")
+	require.NoError(t, err)
+	require.Equal(t, types.AgentStatusStarting, persisted.LifecycleStatus)
+	require.True(t, sm.needsReconciliation(persisted),
+		"Agent registered past grace period with fresh heartbeat should need reconciliation")
+
+	// Reconciliation should promote "starting" → "ready".
+	sm.performReconciliation()
+
+	promoted, err := provider.GetAgent(ctx, "stuck-starter")
+	require.NoError(t, err)
+	assert.Equal(t, types.AgentStatusReady, promoted.LifecycleStatus,
+		"Reconciliation must promote stuck 'starting' with fresh heartbeat to 'ready' (issue #484)")
+
+	// Now simulate what the Python SDK does: keep sending heartbeats with
+	// status="starting". These must NOT regress the lifecycle status back to
+	// "starting" — otherwise the agent would oscillate forever.
+	starting := types.AgentStatusStarting
+	for i := 0; i < 5; i++ {
+		require.NoError(t, sm.UpdateFromHeartbeat(ctx, "stuck-starter", &starting, ""))
+	}
+
+	stable, err := provider.GetAgent(ctx, "stuck-starter")
+	require.NoError(t, err)
+	assert.Equal(t, types.AgentStatusReady, stable.LifecycleStatus,
+		"Subsequent heartbeats carrying status='starting' must not regress a promoted agent (issue #484)")
+}
+
+// TestStatusManager_UpdateAgentStatus_ActivePromotesStarting verifies the other half of the
+// fix: when the health monitor marks an agent active (e.g. a successful HTTP /status check),
+// the lifecycle status should be promoted out of "starting" too — not only out of
+// offline/empty as before.
+func TestStatusManager_UpdateAgentStatus_ActivePromotesStarting(t *testing.T) {
+	provider, ctx := setupStatusManagerStorage(t)
+
+	node := &types.AgentNode{
+		ID:              "active-transition",
+		TeamID:          "team",
+		BaseURL:         "http://localhost",
+		Version:         "1.0.0",
+		HealthStatus:    types.HealthStatusUnknown,
+		LifecycleStatus: types.AgentStatusStarting,
+		RegisteredAt:    time.Now().Add(-5 * time.Minute),
+		LastHeartbeat:   time.Now(),
+		Reasoners:       []types.ReasonerDefinition{},
+		Skills:          []types.SkillDefinition{},
+	}
+	require.NoError(t, provider.RegisterAgent(ctx, node))
+
+	sm := NewStatusManager(provider, StatusManagerConfig{}, nil, nil)
+
+	// Simulate the health monitor marking the agent active (what happens after a
+	// successful HTTP health check).
+	active := types.AgentStateActive
+	require.NoError(t, sm.UpdateAgentStatus(ctx, "active-transition", &types.AgentStatusUpdate{
+		State:  &active,
+		Source: types.StatusSourceHealthCheck,
+		Reason: "HTTP /status succeeded",
+	}))
+
+	after, err := provider.GetAgent(ctx, "active-transition")
+	require.NoError(t, err)
+	assert.Equal(t, types.AgentStatusReady, after.LifecycleStatus,
+		"Transitioning to AgentStateActive must promote 'starting' → 'ready' (issue #484)")
 }