Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 48 additions & 3 deletions control-plane/internal/services/status_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -353,8 +353,16 @@ func (sm *StatusManager) UpdateAgentStatus(ctx context.Context, nodeID string, u
newStatus.LifecycleStatus = types.AgentStatusOffline
}
case types.AgentStateActive:
// Agent is coming online - set lifecycle to ready if it was offline
if newStatus.LifecycleStatus == types.AgentStatusOffline || newStatus.LifecycleStatus == "" {
// Agent is coming online - set lifecycle to ready if it was offline,
// empty, or stuck in "starting". Once the agent is confirmed active
// (via heartbeat priority or successful HTTP health check), we should
// advance it out of "starting" — otherwise SDKs that never explicitly
// transition to "ready" (e.g. the Python SDK, which only ever sends
// status="starting" in its enhanced heartbeats) leave the agent wedged
// in "starting" indefinitely. See issue #484.
if newStatus.LifecycleStatus == types.AgentStatusOffline ||
newStatus.LifecycleStatus == "" ||
newStatus.LifecycleStatus == types.AgentStatusStarting {
newStatus.LifecycleStatus = types.AgentStatusReady
}
case types.AgentStateStarting:
Expand Down Expand Up @@ -444,6 +452,21 @@ func (sm *StatusManager) UpdateFromHeartbeat(ctx context.Context, nodeID string,
// The health monitor requires consecutive failures before marking inactive,
// so there is no need to suppress heartbeats here.

// Don't let a "starting" heartbeat regress an agent that has already been
// promoted to "ready" or "degraded". Some SDKs (notably the Python SDK prior
// to 0.1.69) never transition their internal status out of "starting", and
// every enhanced heartbeat carries status="starting". Without this guard,
// each heartbeat would clobber the promoted lifecycle status and the agent
// would oscillate between "starting" and whatever reconciliation promoted it
// to. The heartbeat itself is still processed (LastSeen/State refresh), we
// just ignore the regressive lifecycle signal. See issue #484.
if lifecycleStatus != nil && *lifecycleStatus == types.AgentStatusStarting {
switch currentStatus.LifecycleStatus {
case types.AgentStatusReady, types.AgentStatusDegraded:
lifecycleStatus = nil
}
}

// Update from heartbeat
currentStatus.UpdateFromHeartbeat(lifecycleStatus)

Expand Down Expand Up @@ -741,6 +764,21 @@ func (sm *StatusManager) needsReconciliation(agent *types.AgentNode) bool {
return true
}

// Agents stuck in "starting" with a FRESH heartbeat past the startup grace period
// must also be reconciled. This is the common case behind issue #484: SDKs that
// send heartbeats but never explicitly transition out of "starting" (e.g. the
// Python SDK, whose enhanced heartbeats always carry status="starting"). The fresh
// heartbeat proves the agent is alive, and time-since-registration past the grace
// period proves it has finished starting up — without this rule, such agents stay
// wedged in "starting" indefinitely, and the staleness branch above never fires
// because the heartbeat is always fresh.
if agent.LifecycleStatus == types.AgentStatusStarting &&
timeSinceHeartbeat <= sm.config.HeartbeatStaleThreshold &&
!agent.RegisteredAt.IsZero() &&
time.Since(agent.RegisteredAt) > sm.config.MaxTransitionTime {
return true
}

return false
}

Expand All @@ -757,7 +795,14 @@ func (sm *StatusManager) reconcileAgentStatus(ctx context.Context, agent *types.
newLifecycleStatus = types.AgentStatusOffline
} else {
newHealthStatus = types.HealthStatusActive
if agent.LifecycleStatus == "" || agent.LifecycleStatus == types.AgentStatusOffline {
// Promote empty/offline/stuck-starting agents to ready. Before the fix for
// issue #484, "starting" was preserved here, which meant agents that
// reconciliation *should* have rescued (fresh heartbeat, registered past the
// grace period, still in "starting") got re-saved as "starting" and stayed
// stuck forever.
if agent.LifecycleStatus == "" ||
agent.LifecycleStatus == types.AgentStatusOffline ||
agent.LifecycleStatus == types.AgentStatusStarting {
newLifecycleStatus = types.AgentStatusReady
} else {
newLifecycleStatus = agent.LifecycleStatus
Expand Down
118 changes: 115 additions & 3 deletions control-plane/internal/services/status_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -568,13 +568,125 @@ func TestStatusManager_Reconciliation_UsesConfiguredThreshold(t *testing.T) {
assert.True(t, sm.needsReconciliation(stuckStartingAgent),
"Agent stuck in 'starting' beyond MaxTransitionTime should need reconciliation")

// Agent in "starting" with recent heartbeat — should NOT need reconciliation (still initializing)
// Agent recently registered and still in "starting" with a recent heartbeat — should NOT
// need reconciliation (still within the startup grace period).
freshStartingAgent := &types.AgentNode{
ID: "node-fresh-starting",
HealthStatus: types.HealthStatusUnknown,
LifecycleStatus: types.AgentStatusStarting,
LastHeartbeat: time.Now().Add(-30 * time.Second),
RegisteredAt: time.Now().Add(-30 * time.Second),
LastHeartbeat: time.Now().Add(-2 * time.Second),
}
assert.False(t, sm.needsReconciliation(freshStartingAgent),
"Agent in 'starting' with recent heartbeat should not need reconciliation yet")
"Agent registered 30s ago in 'starting' with fresh heartbeat should be within startup grace")

// Issue #484: Agent registered long ago, still in "starting", but sending fresh heartbeats.
// This is the SDK-never-transitions-to-ready case — reconciliation MUST rescue it.
stuckStartingFreshHeartbeat := &types.AgentNode{
ID: "node-stuck-starting-fresh-hb",
HealthStatus: types.HealthStatusUnknown,
LifecycleStatus: types.AgentStatusStarting,
RegisteredAt: time.Now().Add(-10 * time.Minute),
LastHeartbeat: time.Now().Add(-2 * time.Second),
}
assert.True(t, sm.needsReconciliation(stuckStartingFreshHeartbeat),
"Agent past startup grace with fresh heartbeat but still 'starting' should need reconciliation (issue #484)")
}

// TestStatusManager_StuckStartingIsReconciledToReady is the end-to-end regression
// test for issue #484. It stores an agent that registered long ago, is still in
// "starting", and has a fresh heartbeat; it then checks that
//
//  1. needsReconciliation flags the agent,
//  2. performReconciliation promotes its lifecycle status to "ready", and
//  3. further heartbeats that still carry status="starting" (what an SDK that
//     never reports "ready" keeps sending) do not undo that promotion.
func TestStatusManager_StuckStartingIsReconciledToReady(t *testing.T) {
	const agentID = "stuck-starter"

	store, ctx := setupStatusManagerStorage(t)

	// Registered 10 minutes ago, last heartbeat 1 second ago, lifecycle still
	// "starting": alive, but never advanced by the agent itself.
	agent := &types.AgentNode{
		ID:              agentID,
		TeamID:          "team",
		BaseURL:         "http://localhost",
		Version:         "1.0.0",
		HealthStatus:    types.HealthStatusUnknown,
		LifecycleStatus: types.AgentStatusStarting,
		RegisteredAt:    time.Now().Add(-10 * time.Minute),
		LastHeartbeat:   time.Now().Add(-1 * time.Second),
		Reasoners:       []types.ReasonerDefinition{},
		Skills:          []types.SkillDefinition{{ID: "greet"}},
	}
	require.NoError(t, store.RegisterAgent(ctx, agent))

	// Explicit thresholds: the 10-minute-old registration exceeds
	// MaxTransitionTime, while the 1-second-old heartbeat is well inside
	// HeartbeatStaleThreshold.
	cfg := StatusManagerConfig{
		ReconcileInterval:       30 * time.Second,
		HeartbeatStaleThreshold: 60 * time.Second,
		MaxTransitionTime:       2 * time.Minute,
	}
	manager := NewStatusManager(store, cfg, nil, nil)

	// loadAgent re-reads the agent from storage and aborts the test if the
	// lookup fails.
	loadAgent := func() *types.AgentNode {
		t.Helper()
		stored, err := store.GetAgent(ctx, agentID)
		require.NoError(t, err)
		return stored
	}

	// Precondition: the stored agent is in "starting" and is flagged for
	// reconciliation.
	before := loadAgent()
	require.Equal(t, types.AgentStatusStarting, before.LifecycleStatus)
	require.True(t, manager.needsReconciliation(before),
		"Agent registered past grace period with fresh heartbeat should need reconciliation")

	// One reconciliation pass must move "starting" to "ready".
	manager.performReconciliation()

	afterReconcile := loadAgent()
	assert.Equal(t, types.AgentStatusReady, afterReconcile.LifecycleStatus,
		"Reconciliation must promote stuck 'starting' with fresh heartbeat to 'ready' (issue #484)")

	// Replay several heartbeats that still report "starting"; none of them may
	// pull the lifecycle status back.
	startingStatus := types.AgentStatusStarting
	for remaining := 5; remaining > 0; remaining-- {
		require.NoError(t, manager.UpdateFromHeartbeat(ctx, agentID, &startingStatus, ""))
	}

	afterHeartbeats := loadAgent()
	assert.Equal(t, types.AgentStatusReady, afterHeartbeats.LifecycleStatus,
		"Subsequent heartbeats carrying status='starting' must not regress a promoted agent (issue #484)")
}

// TestStatusManager_UpdateAgentStatus_ActivePromotesStarting covers the
// UpdateAgentStatus side of the issue #484 fix: an update that sets the state to
// AgentStateActive (here attributed to a health check) must move an agent whose
// lifecycle status is "starting" to "ready", just as it already did for
// offline/empty lifecycle statuses.
func TestStatusManager_UpdateAgentStatus_ActivePromotesStarting(t *testing.T) {
	const agentID = "active-transition"

	store, ctx := setupStatusManagerStorage(t)

	// Seed an agent that is still in "starting" with a current heartbeat.
	agent := &types.AgentNode{
		ID:              agentID,
		TeamID:          "team",
		BaseURL:         "http://localhost",
		Version:         "1.0.0",
		HealthStatus:    types.HealthStatusUnknown,
		LifecycleStatus: types.AgentStatusStarting,
		RegisteredAt:    time.Now().Add(-5 * time.Minute),
		LastHeartbeat:   time.Now(),
		Reasoners:       []types.ReasonerDefinition{},
		Skills:          []types.SkillDefinition{},
	}
	require.NoError(t, store.RegisterAgent(ctx, agent))

	// A zero-value config is passed; this test does not set any thresholds.
	manager := NewStatusManager(store, StatusManagerConfig{}, nil, nil)

	// Build the update a successful HTTP health check would produce: state
	// becomes active, sourced from the health check.
	activeState := types.AgentStateActive
	update := &types.AgentStatusUpdate{
		State:  &activeState,
		Source: types.StatusSourceHealthCheck,
		Reason: "HTTP /status succeeded",
	}
	require.NoError(t, manager.UpdateAgentStatus(ctx, agentID, update))

	// The persisted lifecycle status must now be "ready".
	stored, err := store.GetAgent(ctx, agentID)
	require.NoError(t, err)
	assert.Equal(t, types.AgentStatusReady, stored.LifecycleStatus,
		"Transitioning to AgentStateActive must promote 'starting' → 'ready' (issue #484)")
}
Loading