From 4b5254c17da91914ab80f0102487e080b5ad4646 Mon Sep 17 00:00:00 2001 From: yishuiliunian Date: Thu, 16 Apr 2026 09:53:51 +0800 Subject: [PATCH] fix(ipc): retry initialize() on transient failures AgentClient::initialize() now retries up to 5 times with linearly increasing backoff (100 ms * attempt) when the agent process is slow to start. Fixes flaky cluster e2e tests on CI where the process isn't ready to handle JSON-RPC when initialize is called immediately after spawn. --- crates/loopal-agent-client/src/client.rs | 46 +++++++++++++++++++----- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/crates/loopal-agent-client/src/client.rs b/crates/loopal-agent-client/src/client.rs index 2b454a7..59f639d 100644 --- a/crates/loopal-agent-client/src/client.rs +++ b/crates/loopal-agent-client/src/client.rs @@ -46,17 +46,47 @@ impl AgentClient { } /// Send `initialize` and wait for response. + /// Retries on transient failures (e.g. agent process still starting up). pub async fn initialize(&self) -> anyhow::Result { - let result = self - .connection - .send_request( - methods::INITIALIZE.name, - serde_json::json!({"protocol_version": 1}), + use std::time::Duration; + const MAX_ATTEMPTS: u32 = 5; + const TIMEOUT: Duration = Duration::from_secs(2); + + for attempt in 1..=MAX_ATTEMPTS { + match tokio::time::timeout( + TIMEOUT, + self.connection.send_request( + methods::INITIALIZE.name, + serde_json::json!({"protocol_version": 1}), + ), ) .await - .map_err(|e| anyhow::anyhow!("initialize failed: {e}"))?; - info!("IPC initialized: {result}"); - Ok(result) + { + Ok(Ok(result)) => { + info!("IPC initialized: {result}"); + return Ok(result); + } + Ok(Err(e)) if attempt < MAX_ATTEMPTS => { + tracing::warn!(attempt, error = %e, "initialize failed, retrying"); + tokio::time::sleep(Duration::from_millis(100 * attempt as u64)).await; + } + Ok(Err(e)) => { + return Err(anyhow::anyhow!( + "initialize failed after {MAX_ATTEMPTS} attempts: {e}" + )); + } + Err(_) if attempt < MAX_ATTEMPTS => { 
tracing::warn!(attempt, "initialize timed out, retrying"); + tokio::time::sleep(Duration::from_millis(100 * attempt as u64)).await; + } + Err(_) => { + return Err(anyhow::anyhow!( + "initialize timed out after {MAX_ATTEMPTS} attempts" + )); + } + } + } + unreachable!() } /// Send `agent/start` to begin the agent loop.