From de485fd33f48658ec7f5bb8faafaa5381f856be2 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 11:07:38 +0000
Subject: [PATCH 01/37] feat(cli): add daemon mode for dimos run (DIM-681)

---
 dimos/core/daemon.py       | 153 +++++++++++++++++
 dimos/core/run_registry.py | 128 ++++++++++++++
 dimos/robot/cli/dimos.py   |  93 +++++++++-
 tests/test_daemon.py       | 338 +++++++++++++++++++++++++++++++++++++
 4 files changed, 709 insertions(+), 3 deletions(-)
 create mode 100644 dimos/core/daemon.py
 create mode 100644 dimos/core/run_registry.py
 create mode 100644 tests/test_daemon.py

diff --git a/dimos/core/daemon.py b/dimos/core/daemon.py
new file mode 100644
index 0000000000..01f5e35e06
--- /dev/null
+++ b/dimos/core/daemon.py
@@ -0,0 +1,153 @@
+# Copyright 2025-2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Daemonization and health-check support for DimOS processes."""
+
+from __future__ import annotations
+
+import os
+import signal
+import sys
+import time
+from typing import TYPE_CHECKING
+
+from dimos.utils.logging_config import setup_logger
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from dimos.core.module_coordinator import ModuleCoordinator
+    from dimos.core.run_registry import RunEntry
+
+logger = setup_logger()
+
+# ---------------------------------------------------------------------------
+# Health check
+# ---------------------------------------------------------------------------
+
+_DEFAULT_HEALTH_TIMEOUT: float = 30.0
+_POLL_INTERVAL: float = 0.25
+
+
+def health_check(
+    coordinator: ModuleCoordinator,
+    timeout: float = _DEFAULT_HEALTH_TIMEOUT,
+    poll_interval: float = _POLL_INTERVAL,
+) -> bool:
+    """Poll coordinator workers until *timeout*, return True if all stay alive.
+
+    The check fails immediately when any worker's ``pid`` becomes ``None``
+    (meaning the underlying process exited).  It also fails if there are
+    zero workers (nothing to monitor).
+    """
+    client = coordinator._client
+    if client is None:
+        logger.error("health_check: coordinator has no WorkerManager")
+        return False
+
+    workers = client.workers
+    if not workers:
+        logger.error("health_check: no workers found")
+        return False
+
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        for w in workers:
+            if w.pid is None:
+                logger.error(
+                    "health_check: worker died",
+                    worker_id=w.worker_id,
+                )
+                return False
+        time.sleep(poll_interval)
+
+    # Final check after timeout elapsed
+    for w in workers:
+        if w.pid is None:
+            logger.error(
+                "health_check: worker died at deadline",
+                worker_id=w.worker_id,
+            )
+            return False
+
+    return True
+
+
+# ---------------------------------------------------------------------------
+# Daemonize (double-fork)
+# ---------------------------------------------------------------------------
+
+
+def daemonize(log_dir: Path) -> None:
+    """Double-fork daemonize the current process.
+
+    After this call the *caller* is the daemon grandchild.
+    stdout / stderr are redirected to files inside *log_dir*.
+    The two intermediate parents call ``os._exit(0)``.
+    """
+    log_dir.mkdir(parents=True, exist_ok=True)
+    stderr_path = log_dir / "stderr.log"
+
+    # First fork — detach from terminal
+    pid = os.fork()
+    if pid > 0:
+        os._exit(0)
+
+    os.setsid()
+
+    # Second fork — can never reacquire a controlling terminal
+    pid = os.fork()
+    if pid > 0:
+        os._exit(0)
+
+    # Redirect file descriptors
+    sys.stdout.flush()
+    sys.stderr.flush()
+
+    devnull = open(os.devnull)
+    stderr_file = open(stderr_path, "a")
+
+    os.dup2(devnull.fileno(), sys.stdin.fileno())
+    os.dup2(stderr_file.fileno(), sys.stdout.fileno())
+    os.dup2(stderr_file.fileno(), sys.stderr.fileno())
+
+
+# ---------------------------------------------------------------------------
+# Signal handler for clean shutdown
+# ---------------------------------------------------------------------------
+
+_active_entry: RunEntry | None = None
+_active_coordinator: ModuleCoordinator | None = None
+
+
+def install_signal_handlers(entry: RunEntry, coordinator: ModuleCoordinator) -> None:
+    """Install SIGTERM/SIGINT handlers that stop the coordinator and clean the registry."""
+    global _active_entry, _active_coordinator
+    _active_entry = entry
+    _active_coordinator = coordinator
+
+    signal.signal(signal.SIGTERM, _shutdown_handler)
+    signal.signal(signal.SIGINT, _shutdown_handler)
+
+
+def _shutdown_handler(signum: int, frame: object) -> None:
+    logger.info("Received signal, shutting down", signal=signum)
+    if _active_coordinator is not None:
+        try:
+            _active_coordinator.stop()
+        except Exception:
+            logger.error("Error during coordinator stop", exc_info=True)
+    if _active_entry is not None:
+        _active_entry.remove()
+    sys.exit(0)
diff --git a/dimos/core/run_registry.py b/dimos/core/run_registry.py
new file mode 100644
index 0000000000..ed6be06866
--- /dev/null
+++ b/dimos/core/run_registry.py
@@ -0,0 +1,128 @@
+# Copyright 2025-2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Run registry for tracking DimOS daemon processes."""
+
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass, field
+import json
+import os
+from pathlib import Path
+import re
+import time
+
+from dimos.utils.logging_config import setup_logger
+
+logger = setup_logger()
+
+REGISTRY_DIR = Path.home() / ".local" / "state" / "dimos" / "runs"
+LOG_BASE_DIR = Path.home() / ".local" / "state" / "dimos" / "logs"
+
+
+@dataclass
+class RunEntry:
+    """Metadata for a single DimOS run (daemon or foreground)."""
+
+    run_id: str
+    pid: int
+    blueprint: str
+    started_at: str
+    log_dir: str
+    cli_args: list[str] = field(default_factory=list)
+    config_overrides: dict[str, object] = field(default_factory=dict)
+    mcp_port: int = 9990
+    grpc_port: int = 9877
+
+    @property
+    def registry_path(self) -> Path:
+        return REGISTRY_DIR / f"{self.run_id}.json"
+
+    def save(self) -> None:
+        """Persist this entry to disk."""
+        REGISTRY_DIR.mkdir(parents=True, exist_ok=True)
+        self.registry_path.write_text(json.dumps(asdict(self), indent=2))
+
+    def remove(self) -> None:
+        """Delete this entry from disk."""
+        self.registry_path.unlink(missing_ok=True)
+
+    @classmethod
+    def load(cls, path: Path) -> RunEntry:
+        """Load a RunEntry from a JSON file."""
+        data = json.loads(path.read_text())
+        return cls(**data)
+
+
+def generate_run_id(blueprint: str) -> str:
+    """Generate a human-readable, timestamp-prefixed run ID."""
+    ts = time.strftime("%Y%m%d-%H%M%S")
+    safe_name = re.sub(r"[^a-zA-Z0-9_-]", "-", blueprint)
+    return f"{ts}-{safe_name}"
+
+
+def is_pid_alive(pid: int) -> bool:
+    """Check whether a process with the given PID is still running."""
+    try:
+        os.kill(pid, 0)
+        return True
+    except ProcessLookupError:
+        return False
+    except PermissionError:
+        # Process exists but we can't signal it — still alive.
+        return True
+
+
+def list_runs(alive_only: bool = True) -> list[RunEntry]:
+    """List all registered runs, optionally filtering to alive processes."""
+    REGISTRY_DIR.mkdir(parents=True, exist_ok=True)
+    entries: list[RunEntry] = []
+    for f in sorted(REGISTRY_DIR.glob("*.json")):
+        try:
+            entry = RunEntry.load(f)
+        except Exception:
+            logger.warning("Corrupt registry entry, removing", path=str(f))
+            f.unlink()
+            continue
+
+        if alive_only and not is_pid_alive(entry.pid):
+            logger.info("Cleaning stale run entry", run_id=entry.run_id, pid=entry.pid)
+            entry.remove()
+            continue
+        entries.append(entry)
+    return entries
+
+
+def cleanup_stale() -> int:
+    """Remove registry entries for dead processes. Returns count removed."""
+    REGISTRY_DIR.mkdir(parents=True, exist_ok=True)
+    removed = 0
+    for f in REGISTRY_DIR.glob("*.json"):
+        try:
+            entry = RunEntry.load(f)
+            if not is_pid_alive(entry.pid):
+                entry.remove()
+                removed += 1
+        except Exception:
+            f.unlink()
+            removed += 1
+    return removed
+
+
+def check_port_conflicts(mcp_port: int = 9990, grpc_port: int = 9877) -> RunEntry | None:
+    """Check if any alive run is using our ports. Returns conflicting entry or None."""
+    for entry in list_runs(alive_only=True):
+        if entry.mcp_port == mcp_port or entry.grpc_port == grpc_port:
+            return entry
+    return None
diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index 47a1e777e8..574520280f 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -102,22 +102,109 @@ def callback(**kwargs) -> None:  # type: ignore[no-untyped-def]
 def run(
     ctx: typer.Context,
     robot_types: list[str] = typer.Argument(..., help="Blueprints or modules to run"),
+    daemon: bool = typer.Option(False, "--daemon", "-d", help="Run in background"),
+    health_timeout: float = typer.Option(
+        30.0, "--health-timeout", help="Seconds to monitor workers before daemonizing"
+    ),
 ) -> None:
     """Start a robot blueprint"""
-    logger.info("Starting DimOS")
+    from datetime import datetime, timezone
+    import os
 
     from dimos.core.blueprints import autoconnect
+    from dimos.core.run_registry import (
+        LOG_BASE_DIR,
+        RunEntry,
+        check_port_conflicts,
+        cleanup_stale,
+        generate_run_id,
+    )
     from dimos.robot.get_all_blueprints import get_by_name
     from dimos.utils.logging_config import setup_exception_handler
 
     setup_exception_handler()
+    logger.info("Starting DimOS")
 
     cli_config_overrides: dict[str, Any] = ctx.obj
     global_config.update(**cli_config_overrides)
 
+    # Clean stale registry entries
+    stale = cleanup_stale()
+    if stale:
+        logger.info(f"Cleaned {stale} stale run entries")
+
+    # Port conflict check
+    conflict = check_port_conflicts()
+    if conflict:
+        typer.echo(
+            f"Error: Ports in use by {conflict.run_id} (PID {conflict.pid}). "
+            f"Run 'dimos stop' first.",
+            err=True,
+        )
+        raise typer.Exit(1)
+
+    blueprint_name = "-".join(robot_types)
+    run_id = generate_run_id(blueprint_name)
+    log_dir = LOG_BASE_DIR / run_id
+
     blueprint = autoconnect(*map(get_by_name, robot_types))
-    dimos = blueprint.build(cli_config_overrides=cli_config_overrides)
-    dimos.loop()
+    coordinator = blueprint.build(cli_config_overrides=cli_config_overrides)
+
+    if daemon:
+        from dimos.core.daemon import (
+            daemonize,
+            health_check,
+            install_signal_handlers,
+        )
+
+        # Health check before daemonizing — catch early crashes
+        if not health_check(coordinator, timeout=health_timeout):
+            typer.echo("Error: health check failed — a worker process died.", err=True)
+            coordinator.stop()
+            raise typer.Exit(1)
+
+        n_workers = len(coordinator._client.workers) if coordinator._client else 0
+        n_modules = len(coordinator._deployed_modules)
+        typer.echo(f"✓ All modules started ({n_modules} modules, {n_workers} workers)")
+        typer.echo("✓ Health check passed")
+        typer.echo("✓ DimOS running in background\n")
+        typer.echo(f"  Run ID:    {run_id}")
+        typer.echo(f"  PID:       {os.getpid()}")
+        typer.echo(f"  Log:       {log_dir}/dimos.jsonl")
+        typer.echo("  MCP:       http://localhost:9990/mcp\n")
+        typer.echo("  Stop:      dimos stop")
+        typer.echo("  Logs:      dimos log")
+        typer.echo("  Status:    dimos status")
+
+        daemonize(log_dir)
+
+        entry = RunEntry(
+            run_id=run_id,
+            pid=os.getpid(),
+            blueprint=blueprint_name,
+            started_at=datetime.now(timezone.utc).isoformat(),
+            log_dir=str(log_dir),
+            cli_args=list(robot_types),
+            config_overrides=cli_config_overrides,
+        )
+        entry.save()
+        install_signal_handlers(entry, coordinator)
+        coordinator.loop()
+    else:
+        entry = RunEntry(
+            run_id=run_id,
+            pid=os.getpid(),
+            blueprint=blueprint_name,
+            started_at=datetime.now(timezone.utc).isoformat(),
+            log_dir=str(log_dir),
+            cli_args=list(robot_types),
+            config_overrides=cli_config_overrides,
+        )
+        entry.save()
+        try:
+            coordinator.loop()
+        finally:
+            entry.remove()
 
 
 @main.command()
diff --git a/tests/test_daemon.py b/tests/test_daemon.py
new file mode 100644
index 0000000000..b3ce26f488
--- /dev/null
+++ b/tests/test_daemon.py
@@ -0,0 +1,338 @@
+# Copyright 2025-2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for DIM-681: daemon mode, run registry, and health checks."""
+
+from __future__ import annotations
+
+import os
+import re
+import signal
+import time
+from typing import TYPE_CHECKING
+from unittest import mock
+
+import pytest
+
+# ---------------------------------------------------------------------------
+# Registry tests
+# ---------------------------------------------------------------------------
+from dimos.core.run_registry import (
+    RunEntry,
+    check_port_conflicts,
+    cleanup_stale,
+    generate_run_id,
+)
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+@pytest.fixture()
+def tmp_registry(tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
+    """Redirect the registry to a temp dir for test isolation."""
+    monkeypatch.setattr("dimos.core.run_registry.REGISTRY_DIR", tmp_path)
+    return tmp_path
+
+
+def _make_entry(
+    run_id: str = "20260306-120000-test",
+    pid: int | None = None,
+    mcp_port: int = 9990,
+    grpc_port: int = 9877,
+) -> RunEntry:
+    return RunEntry(
+        run_id=run_id,
+        pid=pid if pid is not None else os.getpid(),
+        blueprint="test",
+        started_at="2026-03-06T12:00:00Z",
+        log_dir="/tmp/test-logs",
+        cli_args=["test"],
+        config_overrides={},
+        mcp_port=mcp_port,
+        grpc_port=grpc_port,
+    )
+
+
+class TestRunEntryCRUD:
+    """test_run_entry_save_load_remove — full CRUD cycle."""
+
+    def test_run_entry_save_load_remove(self, tmp_registry: Path):
+        entry = _make_entry()
+        entry.save()
+
+        loaded = RunEntry.load(entry.registry_path)
+        assert loaded.run_id == entry.run_id
+        assert loaded.pid == entry.pid
+        assert loaded.blueprint == entry.blueprint
+        assert loaded.mcp_port == entry.mcp_port
+        assert loaded.grpc_port == entry.grpc_port
+
+        entry.remove()
+        assert not entry.registry_path.exists()
+
+    def test_save_creates_directory(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
+        nested = tmp_path / "a" / "b" / "c"
+        monkeypatch.setattr("dimos.core.run_registry.REGISTRY_DIR", nested)
+        entry = _make_entry()
+        entry.save()
+        assert entry.registry_path.exists()
+
+    def test_remove_idempotent(self, tmp_registry: Path):
+        entry = _make_entry()
+        entry.remove()  # file doesn't exist — should not raise
+        entry.save()
+        entry.remove()
+        entry.remove()  # already gone — still fine
+
+
+class TestGenerateRunId:
+    """test_generate_run_id_format — timestamp + sanitized blueprint name."""
+
+    def test_generate_run_id_format(self):
+        rid = generate_run_id("unitree-go2")
+        # Pattern: YYYYMMDD-HHMMSS-<name>
+        assert re.match(r"^\d{8}-\d{6}-unitree-go2$", rid), f"unexpected format: {rid}"
+
+    def test_sanitizes_slashes(self):
+        rid = generate_run_id("path/to/bp")
+        assert "/" not in rid
+
+    def test_sanitizes_spaces(self):
+        rid = generate_run_id("my blueprint")
+        assert " " not in rid
+
+
+class TestCleanupStale:
+    """Stale PID cleanup tests."""
+
+    def test_cleanup_stale_removes_dead_pids(self, tmp_registry: Path):
+        # PID 2_000_000_000 is almost certainly not alive
+        entry = _make_entry(pid=2_000_000_000)
+        entry.save()
+        assert entry.registry_path.exists()
+
+        removed = cleanup_stale()
+        assert removed == 1
+        assert not entry.registry_path.exists()
+
+    def test_cleanup_stale_keeps_alive_pids(self, tmp_registry: Path):
+        # Our own PID is alive
+        entry = _make_entry(pid=os.getpid())
+        entry.save()
+
+        removed = cleanup_stale()
+        assert removed == 0
+        assert entry.registry_path.exists()
+
+    def test_cleanup_corrupt_file(self, tmp_registry: Path):
+        bad = tmp_registry / "corrupt.json"
+        bad.write_text("not json{{{")
+        removed = cleanup_stale()
+        assert removed == 1
+        assert not bad.exists()
+
+
+class TestPortConflicts:
+    """Port conflict detection."""
+
+    def test_port_conflict_detection(self, tmp_registry: Path):
+        entry = _make_entry(pid=os.getpid(), mcp_port=9990, grpc_port=9877)
+        entry.save()
+
+        conflict = check_port_conflicts(mcp_port=9990, grpc_port=9877)
+        assert conflict is not None
+        assert conflict.run_id == entry.run_id
+
+    def test_port_conflict_mcp_only(self, tmp_registry: Path):
+        entry = _make_entry(pid=os.getpid(), mcp_port=9990, grpc_port=1111)
+        entry.save()
+
+        conflict = check_port_conflicts(mcp_port=9990, grpc_port=2222)
+        assert conflict is not None
+
+    def test_port_conflict_grpc_only(self, tmp_registry: Path):
+        entry = _make_entry(pid=os.getpid(), mcp_port=1111, grpc_port=9877)
+        entry.save()
+
+        conflict = check_port_conflicts(mcp_port=2222, grpc_port=9877)
+        assert conflict is not None
+
+    def test_port_conflict_no_false_positive(self, tmp_registry: Path):
+        entry = _make_entry(pid=os.getpid(), mcp_port=8000, grpc_port=8001)
+        entry.save()
+
+        conflict = check_port_conflicts(mcp_port=9990, grpc_port=9877)
+        assert conflict is None
+
+
+# ---------------------------------------------------------------------------
+# Health check tests
+# ---------------------------------------------------------------------------
+
+from dimos.core.daemon import health_check
+
+
+def _mock_worker(pid: int | None = 1234, worker_id: int = 0):
+    """Create a mock Worker with a controllable pid property."""
+    w = mock.MagicMock()
+    w.pid = pid
+    w.worker_id = worker_id
+    return w
+
+
+def _mock_coordinator(workers: list | None = None):
+    """Create a mock ModuleCoordinator with mock WorkerManager."""
+    coord = mock.MagicMock()
+    if workers is not None:
+        coord._client.workers = workers
+    else:
+        coord._client = None
+    return coord
+
+
+class TestHealthCheckAllHealthy:
+    """test_health_check_all_healthy — mock workers with alive PIDs → passes."""
+
+    def test_health_check_all_healthy(self):
+        workers = [_mock_worker(pid=os.getpid(), worker_id=i) for i in range(3)]
+        coord = _mock_coordinator(workers)
+
+        result = health_check(coord, timeout=0.5, poll_interval=0.1)
+        assert result is True
+
+
+class TestHealthCheckImmediateDeath:
+    """test_health_check_immediate_death — worker with pid=None → fails immediately."""
+
+    def test_health_check_immediate_death(self):
+        dead_worker = _mock_worker(pid=None, worker_id=0)
+        coord = _mock_coordinator([dead_worker])
+
+        start = time.monotonic()
+        result = health_check(coord, timeout=5.0, poll_interval=0.1)
+        elapsed = time.monotonic() - start
+
+        assert result is False
+        # Should fail almost immediately, not wait for the full timeout
+        assert elapsed < 1.0
+
+
+class TestHealthCheckDelayedDeath:
+    """test_health_check_delayed_death — worker alive initially, then pid → None."""
+
+    def test_health_check_delayed_death(self):
+        worker = _mock_worker(pid=os.getpid(), worker_id=0)
+
+        # After a few accesses, pid becomes None (simulating delayed crash)
+        call_count = 0
+        real_pid = os.getpid()
+
+        def pid_getter():
+            nonlocal call_count
+            call_count += 1
+            if call_count > 3:
+                return None
+            return real_pid
+
+        type(worker).pid = mock.PropertyMock(side_effect=pid_getter)
+        coord = _mock_coordinator([worker])
+
+        result = health_check(coord, timeout=5.0, poll_interval=0.05)
+        assert result is False
+
+
+class TestHealthCheckNoWorkers:
+    """test_health_check_no_workers — empty worker list → fails."""
+
+    def test_health_check_no_workers(self):
+        coord = _mock_coordinator(workers=[])
+        result = health_check(coord, timeout=0.5, poll_interval=0.1)
+        assert result is False
+
+    def test_health_check_no_client(self):
+        coord = _mock_coordinator(workers=None)  # _client = None
+        result = health_check(coord, timeout=0.5, poll_interval=0.1)
+        assert result is False
+
+
+class TestHealthCheckPartialDeath:
+    """test_health_check_partial_death — 3 workers, 1 dies → fails."""
+
+    def test_health_check_partial_death(self):
+        w1 = _mock_worker(pid=os.getpid(), worker_id=0)
+        w2 = _mock_worker(pid=os.getpid(), worker_id=1)
+        w3 = _mock_worker(pid=None, worker_id=2)  # dead
+
+        coord = _mock_coordinator([w1, w2, w3])
+        result = health_check(coord, timeout=0.5, poll_interval=0.1)
+        assert result is False
+
+
+# ---------------------------------------------------------------------------
+# Daemon tests
+# ---------------------------------------------------------------------------
+
+from dimos.core.daemon import _shutdown_handler, daemonize, install_signal_handlers
+
+
+class TestDaemonize:
+    """test_daemonize_creates_log_dir."""
+
+    def test_daemonize_creates_log_dir(self, tmp_path: Path):
+        log_dir = tmp_path / "nested" / "logs"
+        assert not log_dir.exists()
+
+        # We can't actually double-fork in tests (child would continue running
+        # pytest), so we mock os.fork to return >0 both times (parent path).
+        with mock.patch("os.fork", return_value=1), pytest.raises(SystemExit):
+            # Parent calls os._exit(0) which we let raise
+            with mock.patch("os._exit", side_effect=SystemExit(0)):
+                daemonize(log_dir)
+
+        assert log_dir.exists()
+
+
+class TestSignalHandler:
+    """test_signal_handler_cleans_registry."""
+
+    def test_signal_handler_cleans_registry(self, tmp_registry: Path):
+        entry = _make_entry()
+        entry.save()
+        assert entry.registry_path.exists()
+
+        coord = mock.MagicMock()
+        install_signal_handlers(entry, coord)
+
+        with pytest.raises(SystemExit):
+            _shutdown_handler(signal.SIGTERM, None)
+
+        # Registry file should be cleaned up
+        assert not entry.registry_path.exists()
+        # Coordinator should have been stopped
+        coord.stop.assert_called_once()
+
+    def test_signal_handler_tolerates_stop_error(self, tmp_registry: Path):
+        entry = _make_entry()
+        entry.save()
+
+        coord = mock.MagicMock()
+        coord.stop.side_effect = RuntimeError("boom")
+        install_signal_handlers(entry, coord)
+
+        with pytest.raises(SystemExit):
+            _shutdown_handler(signal.SIGTERM, None)
+
+        # Entry still removed even if stop() throws
+        assert not entry.registry_path.exists()

From ff5094d81f270e2522e9c639f23cb3828291cb33 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 11:38:46 +0000
Subject: [PATCH 02/37] =?UTF-8?q?fix:=20address=20greptile=20review=20?=
 =?UTF-8?q?=E2=80=94=20fd=20leak,=20wrong=20PID,=20fabricated=20log=20path?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- close devnull/stderr_file after dup2 (fd leak)
- remove PID from pre-fork output (printed parent PID, not daemon PID)
- show log_dir not log_dir/dimos.jsonl (file doesn't exist yet)
- keep tests in tests/ (dimos/conftest.py breaks isolated tests)
---
 dimos/core/daemon.py     | 4 ++++
 dimos/robot/cli/dimos.py | 4 +---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/dimos/core/daemon.py b/dimos/core/daemon.py
index 01f5e35e06..9510707cfe 100644
--- a/dimos/core/daemon.py
+++ b/dimos/core/daemon.py
@@ -122,6 +122,10 @@ def daemonize(log_dir: Path) -> None:
     os.dup2(stderr_file.fileno(), sys.stdout.fileno())
     os.dup2(stderr_file.fileno(), sys.stderr.fileno())
 
+    # Close original FDs — dup2'd copies keep the underlying files open
+    devnull.close()
+    stderr_file.close()
+
 
 # ---------------------------------------------------------------------------
 # Signal handler for clean shutdown
diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index 574520280f..28cbcc34ca 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -169,11 +169,9 @@ def run(
         typer.echo("✓ Health check passed")
         typer.echo("✓ DimOS running in background\n")
         typer.echo(f"  Run ID:    {run_id}")
-        typer.echo(f"  PID:       {os.getpid()}")
-        typer.echo(f"  Log:       {log_dir}/dimos.jsonl")
+        typer.echo(f"  Log:       {log_dir}")
         typer.echo("  MCP:       http://localhost:9990/mcp\n")
         typer.echo("  Stop:      dimos stop")
-        typer.echo("  Logs:      dimos log")
         typer.echo("  Status:    dimos status")
 
         daemonize(log_dir)

From 9cde9ef2630a03f2ef19aab3296c43380cf2ea76 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 11:45:50 +0000
Subject: [PATCH 03/37] feat(cli): add dimos stop and dimos status commands
 (DIM-682, DIM-684)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

dimos status — shows running instances with run-id, pid, blueprint, uptime, log dir
dimos stop — sends SIGTERM (or SIGKILL with --force), waits 5s, escalates if needed
  --pid to target specific instance, --all to stop everything
  cleans registry on stop

also adds get_most_recent() to run_registry.
8 new tests covering sigterm, sigkill, escalation, dead process cleanup, most-recent lookup.
---
 dimos/core/run_registry.py |   6 +
 dimos/robot/cli/dimos.py   | 108 +++++++++++++++++
 tests/test_daemon.py       | 242 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 352 insertions(+), 4 deletions(-)

diff --git a/dimos/core/run_registry.py b/dimos/core/run_registry.py
index ed6be06866..cdfaa11790 100644
--- a/dimos/core/run_registry.py
+++ b/dimos/core/run_registry.py
@@ -126,3 +126,9 @@ def check_port_conflicts(mcp_port: int = 9990, grpc_port: int = 9877) -> RunEntr
         if entry.mcp_port == mcp_port or entry.grpc_port == grpc_port:
             return entry
     return None
+
+
+def get_most_recent(alive_only: bool = True) -> RunEntry | None:
+    """Return the most recently created run entry, or None."""
+    runs = list_runs(alive_only=alive_only)
+    return runs[-1] if runs else None
diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index 28cbcc34ca..4ba27afab6 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -205,6 +205,114 @@ def run(
             entry.remove()
 
 
+@main.command()
+def status() -> None:
+    """Show running DimOS instances."""
+
+    from dimos.core.run_registry import list_runs
+
+    entries = list_runs(alive_only=True)
+    if not entries:
+        typer.echo("No running DimOS instances")
+        return
+
+    for entry in entries:
+        # Calculate uptime
+        try:
+            from datetime import datetime, timezone
+
+            started = datetime.fromisoformat(entry.started_at)
+            age = datetime.now(timezone.utc) - started
+            hours, remainder = divmod(int(age.total_seconds()), 3600)
+            minutes, seconds = divmod(remainder, 60)
+            if hours > 0:
+                uptime = f"{hours}h {minutes}m"
+            else:
+                uptime = f"{minutes}m {seconds}s"
+        except Exception:
+            uptime = "unknown"
+
+        typer.echo(f"  Run ID:    {entry.run_id}")
+        typer.echo(f"  PID:       {entry.pid}")
+        typer.echo(f"  Blueprint: {entry.blueprint}")
+        typer.echo(f"  Uptime:    {uptime}")
+        typer.echo(f"  Log:       {entry.log_dir}")
+        typer.echo(f"  MCP:       http://localhost:{entry.mcp_port}/mcp")
+        if len(entries) > 1:
+            typer.echo("")
+
+
+@main.command()
+def stop(
+    pid: int = typer.Option(None, "--pid", "-p", help="PID of instance to stop"),
+    all_instances: bool = typer.Option(False, "--all", "-a", help="Stop all instances"),
+    force: bool = typer.Option(False, "--force", "-f", help="Force kill (SIGKILL)"),
+) -> None:
+    """Stop a running DimOS instance."""
+
+    from dimos.core.run_registry import get_most_recent, list_runs
+
+    if all_instances:
+        entries = list_runs(alive_only=True)
+        if not entries:
+            typer.echo("No running DimOS instances")
+            return
+        for entry in entries:
+            _stop_entry(entry, force=force)
+        return
+
+    if pid:
+        entries = list_runs(alive_only=True)
+        entry = next((e for e in entries if e.pid == pid), None)
+        if not entry:
+            typer.echo(f"Error: no running instance with PID {pid}", err=True)
+            raise typer.Exit(1)
+    else:
+        entry = get_most_recent(alive_only=True)
+        if not entry:
+            typer.echo("No running DimOS instances", err=True)
+            raise typer.Exit(1)
+
+    _stop_entry(entry, force=force)
+
+
+def _stop_entry(entry, force: bool = False) -> None:  # type: ignore[no-untyped-def]
+    """Stop a single DimOS instance by registry entry."""
+    import os
+    import signal
+    import time
+
+    from dimos.core.run_registry import is_pid_alive
+
+    sig = signal.SIGKILL if force else signal.SIGTERM
+    sig_name = "SIGKILL" if force else "SIGTERM"
+
+    typer.echo(f"Stopping {entry.run_id} (PID {entry.pid}) with {sig_name}...")
+
+    try:
+        os.kill(entry.pid, sig)
+    except ProcessLookupError:
+        typer.echo("  Process already dead, cleaning registry")
+        entry.remove()
+        return
+
+    # Wait for graceful shutdown
+    if not force:
+        for _ in range(50):  # 5 seconds
+            if not is_pid_alive(entry.pid):
+                break
+            time.sleep(0.1)
+        else:
+            typer.echo("  Still alive after 5s, sending SIGKILL...")
+            try:
+                os.kill(entry.pid, signal.SIGKILL)
+            except ProcessLookupError:
+                pass
+
+    entry.remove()
+    typer.echo("  Stopped.")
+
+
 @main.command()
 def show_config(ctx: typer.Context) -> None:
     """Show current config settings and their values."""
diff --git a/tests/test_daemon.py b/tests/test_daemon.py
index b3ce26f488..65ee180106 100644
--- a/tests/test_daemon.py
+++ b/tests/test_daemon.py
@@ -17,10 +17,11 @@
 from __future__ import annotations
 
 import os
+from pathlib import Path
 import re
 import signal
+import sys
 import time
-from typing import TYPE_CHECKING
 from unittest import mock
 
 import pytest
@@ -28,16 +29,15 @@
 # ---------------------------------------------------------------------------
 # Registry tests
 # ---------------------------------------------------------------------------
+from dimos.core import run_registry
 from dimos.core.run_registry import (
     RunEntry,
     check_port_conflicts,
     cleanup_stale,
     generate_run_id,
+    list_runs,
 )
 
-if TYPE_CHECKING:
-    from pathlib import Path
-
 
 @pytest.fixture()
 def tmp_registry(tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
@@ -336,3 +336,237 @@ def test_signal_handler_tolerates_stop_error(self, tmp_registry: Path):
 
         # Entry still removed even if stop() throws
         assert not entry.registry_path.exists()
+
+
+# ---------------------------------------------------------------------------
+# dimos status tests
+# ---------------------------------------------------------------------------
+
+
+class TestStatusCommand:
+    """Tests for `dimos status` CLI command."""
+
+    def test_status_no_instances(self, tmp_path, monkeypatch):
+        monkeypatch.setattr(run_registry, "REGISTRY_DIR", tmp_path / "runs")
+        entries = list_runs(alive_only=True)
+        assert entries == []
+
+    def test_status_shows_alive_instance(self, tmp_path, monkeypatch):
+        reg_dir = tmp_path / "runs"
+        monkeypatch.setattr(run_registry, "REGISTRY_DIR", reg_dir)
+
+        entry = RunEntry(
+            run_id="20260306-120000-test",
+            pid=os.getpid(),  # our own PID — alive
+            blueprint="test",
+            started_at="2026-03-06T12:00:00Z",
+            log_dir=str(tmp_path / "logs"),
+            cli_args=["test"],
+            config_overrides={},
+        )
+        entry.save()
+
+        entries = list_runs(alive_only=True)
+        assert len(entries) == 1
+        assert entries[0].run_id == "20260306-120000-test"
+        assert entries[0].pid == os.getpid()
+
+    def test_status_filters_dead(self, tmp_path, monkeypatch):
+        reg_dir = tmp_path / "runs"
+        monkeypatch.setattr(run_registry, "REGISTRY_DIR", reg_dir)
+
+        entry = RunEntry(
+            run_id="20260306-120000-dead",
+            pid=99999999,  # fake PID, not alive
+            blueprint="dead",
+            started_at="2026-03-06T12:00:00Z",
+            log_dir=str(tmp_path / "logs"),
+            cli_args=["dead"],
+            config_overrides={},
+        )
+        entry.save()
+
+        entries = list_runs(alive_only=True)
+        assert len(entries) == 0
+
+
+# ---------------------------------------------------------------------------
+# dimos stop tests
+# ---------------------------------------------------------------------------
+
+
+class TestStopCommand:
+    """Tests for `dimos stop` CLI command."""
+
+    def test_stop_sends_sigterm(self, tmp_path, monkeypatch):
+        """Verify stop sends SIGTERM to the correct PID."""
+        reg_dir = tmp_path / "runs"
+        monkeypatch.setattr(run_registry, "REGISTRY_DIR", reg_dir)
+
+        killed_pids = []
+        killed_signals = []
+
+        def mock_kill(pid, sig):
+            killed_pids.append(pid)
+            killed_signals.append(sig)
+            raise ProcessLookupError  # pretend it died immediately
+
+        monkeypatch.setattr(os, "kill", mock_kill)
+
+        entry = RunEntry(
+            run_id="20260306-120000-test",
+            pid=12345,
+            blueprint="test",
+            started_at="2026-03-06T12:00:00Z",
+            log_dir=str(tmp_path / "logs"),
+            cli_args=["test"],
+            config_overrides={},
+        )
+        entry.save()
+
+        # Import the stop helper
+        sys.path.insert(0, str(Path(__file__).parent.parent))
+        from dimos.robot.cli.dimos import _stop_entry
+
+        _stop_entry(entry, force=False)
+
+        assert 12345 in killed_pids
+        import signal
+
+        assert signal.SIGTERM in killed_signals
+        # Registry entry should be removed
+        assert not entry.registry_path.exists()
+
+    def test_stop_force_sends_sigkill(self, tmp_path, monkeypatch):
+        """Verify --force sends SIGKILL."""
+        reg_dir = tmp_path / "runs"
+        monkeypatch.setattr(run_registry, "REGISTRY_DIR", reg_dir)
+
+        killed_signals = []
+
+        def mock_kill(pid, sig):
+            killed_signals.append(sig)
+            raise ProcessLookupError
+
+        monkeypatch.setattr(os, "kill", mock_kill)
+
+        entry = RunEntry(
+            run_id="20260306-120000-test",
+            pid=12345,
+            blueprint="test",
+            started_at="2026-03-06T12:00:00Z",
+            log_dir=str(tmp_path / "logs"),
+            cli_args=["test"],
+            config_overrides={},
+        )
+        entry.save()
+
+        from dimos.robot.cli.dimos import _stop_entry
+
+        _stop_entry(entry, force=True)
+
+        import signal
+
+        assert signal.SIGKILL in killed_signals
+        assert not entry.registry_path.exists()
+
+    def test_stop_cleans_registry_on_already_dead(self, tmp_path, monkeypatch):
+        """If process is already dead, just clean registry."""
+        reg_dir = tmp_path / "runs"
+        monkeypatch.setattr(run_registry, "REGISTRY_DIR", reg_dir)
+
+        def mock_kill(pid, sig):
+            raise ProcessLookupError
+
+        monkeypatch.setattr(os, "kill", mock_kill)
+
+        entry = RunEntry(
+            run_id="20260306-120000-dead",
+            pid=99999999,
+            blueprint="dead",
+            started_at="2026-03-06T12:00:00Z",
+            log_dir=str(tmp_path / "logs"),
+            cli_args=["dead"],
+            config_overrides={},
+        )
+        entry.save()
+        assert entry.registry_path.exists()
+
+        from dimos.robot.cli.dimos import _stop_entry
+
+        _stop_entry(entry, force=False)
+        assert not entry.registry_path.exists()
+
+    def test_stop_escalates_to_sigkill_after_timeout(self, tmp_path, monkeypatch):
+        """If SIGTERM doesn't kill within 5s, escalates to SIGKILL."""
+        reg_dir = tmp_path / "runs"
+        monkeypatch.setattr(run_registry, "REGISTRY_DIR", reg_dir)
+
+        signals_sent = []
+
+        def mock_kill(pid, sig):
+            signals_sent.append(sig)
+            # Don't raise — process "survives"
+
+        monkeypatch.setattr(os, "kill", mock_kill)
+
+        # Make is_pid_alive always return True (process won't die)
+        monkeypatch.setattr(run_registry, "is_pid_alive", lambda pid: True)
+
+        # Speed up the wait loop
+        monkeypatch.setattr("time.sleep", lambda x: None)
+
+        entry = RunEntry(
+            run_id="20260306-120000-stubborn",
+            pid=12345,
+            blueprint="stubborn",
+            started_at="2026-03-06T12:00:00Z",
+            log_dir=str(tmp_path / "logs"),
+            cli_args=["stubborn"],
+            config_overrides={},
+        )
+        entry.save()
+
+        from dimos.robot.cli.dimos import _stop_entry
+
+        _stop_entry(entry, force=False)
+
+        import signal
+
+        assert signal.SIGTERM in signals_sent
+        assert signal.SIGKILL in signals_sent
+        assert not entry.registry_path.exists()
+
+    def test_get_most_recent_returns_latest(self, tmp_path, monkeypatch):
+        """Verify get_most_recent returns the most recently created entry."""
+        reg_dir = tmp_path / "runs"
+        monkeypatch.setattr(run_registry, "REGISTRY_DIR", reg_dir)
+        monkeypatch.setattr(run_registry, "is_pid_alive", lambda pid: True)
+
+        entry1 = RunEntry(
+            run_id="20260306-100000-first",
+            pid=os.getpid(),
+            blueprint="first",
+            started_at="2026-03-06T10:00:00Z",
+            log_dir=str(tmp_path / "logs1"),
+            cli_args=["first"],
+            config_overrides={},
+        )
+        entry1.save()
+
+        entry2 = RunEntry(
+            run_id="20260306-110000-second",
+            pid=os.getpid(),
+            blueprint="second",
+            started_at="2026-03-06T11:00:00Z",
+            log_dir=str(tmp_path / "logs2"),
+            cli_args=["second"],
+            config_overrides={},
+        )
+        entry2.save()
+
+        from dimos.core.run_registry import get_most_recent
+
+        latest = get_most_recent(alive_only=True)
+        assert latest is not None
+        assert latest.run_id == "20260306-110000-second"

From 21f16e77981dacf91ee3659631b0fb15f5188903 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 12:31:58 +0000
Subject: [PATCH 04/37] test: add e2e daemon lifecycle tests with PingPong
 blueprint

lightweight 2-module blueprint (PingModule + PongModule) that needs no
hardware, no LFS data, and no replay files. tests real forkserver workers
and module deployment.

covers:
- single worker lifecycle (build -> health check -> registry -> stop)
- multiple workers (2 workers, both alive)
- health check detects dead worker (SIGKILL -> detect failure)
- registry entry JSON field roundtrip
- stale entry cleanup (dead PIDs removed, live entries kept)
---
 tests/test_e2e_daemon.py | 238 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 238 insertions(+)
 create mode 100644 tests/test_e2e_daemon.py

diff --git a/tests/test_e2e_daemon.py b/tests/test_e2e_daemon.py
new file mode 100644
index 0000000000..00306f0d33
--- /dev/null
+++ b/tests/test_e2e_daemon.py
@@ -0,0 +1,238 @@
+# Copyright 2025-2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""End-to-end tests for daemon lifecycle using a lightweight PingPong blueprint.
+
+These tests exercise real forkserver workers and module deployment — no mocks.
+They use a minimal 2-module blueprint (PingModule + PongModule) that needs
+no hardware, no LFS data, and no replay files.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import signal
+import time
+
+import pytest
+
+# Skip sysctl interactive prompt in CI / headless
+os.environ["CI"] = "1"
+
+from datetime import datetime, timezone
+
+from dimos.core.blueprints import autoconnect
+from dimos.core.daemon import health_check
+from dimos.core.global_config import global_config
+from dimos.core.module import Module
+from dimos.core.run_registry import (
+    REGISTRY_DIR,
+    RunEntry,
+    cleanup_stale,
+    get_most_recent,
+    list_runs,
+)
+from dimos.core.stream import Out
+
+# ---------------------------------------------------------------------------
+# Lightweight test modules
+# ---------------------------------------------------------------------------
+
+
+class PingModule(Module):
+    data: Out[str]
+
+    def start(self):
+        super().start()
+
+
+class PongModule(Module):
+    data: Out[str]
+
+    def start(self):
+        super().start()
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _build(n_workers: int = 1):
+    """Build a 2-module blueprint with *n_workers* forkserver workers."""
+    global_config.update(viewer_backend="none", n_workers=n_workers)
+    bp = autoconnect(PingModule.blueprint(), PongModule.blueprint())
+    coord = bp.build(cli_config_overrides={"viewer_backend": "none", "n_workers": n_workers})
+    return coord
+
+
+def _make_entry(coord, run_id: str | None = None) -> RunEntry:
+    """Create and save a registry entry for the current process."""
+    if run_id is None:
+        run_id = f"test-{datetime.now(timezone.utc).strftime('%H%M%S%f')}"
+    entry = RunEntry(
+        run_id=run_id,
+        pid=os.getpid(),
+        blueprint="ping-pong-test",
+        started_at=datetime.now(timezone.utc).isoformat(),
+        log_dir="/tmp/dimos-e2e-test",
+        cli_args=["ping-pong"],
+        config_overrides={"n_workers": 1},
+    )
+    entry.save()
+    return entry
+
+
+@pytest.fixture(autouse=True)
+def _clean_registry():
+    """Remove any leftover registry entries before/after each test."""
+    for f in REGISTRY_DIR.glob("*.json"):
+        f.unlink(missing_ok=True)
+    yield
+    for f in REGISTRY_DIR.glob("*.json"):
+        f.unlink(missing_ok=True)
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+class TestDaemonE2E:
+    """End-to-end daemon lifecycle with real workers."""
+
+    def test_single_worker_lifecycle(self):
+        """Build → health check → registry → status → stop (1 worker)."""
+        coord = _build(n_workers=1)
+
+        workers = coord._client.workers
+        assert len(workers) == 1
+        assert len(coord._deployed_modules) == 2  # PingModule + PongModule
+
+        ok = health_check(coord, timeout=2)
+        assert ok, "Health check should pass with healthy worker"
+
+        entry = _make_entry(coord)
+        runs = list_runs(alive_only=True)
+        assert len(runs) == 1
+        assert runs[0].run_id == entry.run_id
+
+        latest = get_most_recent(alive_only=True)
+        assert latest is not None
+        assert latest.run_id == entry.run_id
+
+        coord.stop()
+        entry.remove()
+        assert len(list_runs(alive_only=True)) == 0
+
+    def test_multiple_workers(self):
+        """Build with 2 workers — both should be alive after health check."""
+        coord = _build(n_workers=2)
+
+        workers = coord._client.workers
+        assert len(workers) == 2
+        for w in workers:
+            assert w.pid is not None, f"Worker {w.worker_id} has no PID"
+
+        ok = health_check(coord, timeout=2)
+        assert ok, "Health check should pass with 2 healthy workers"
+
+        coord.stop()
+
+    def test_health_check_detects_dead_worker(self):
+        """Kill a worker process — health check should detect and fail."""
+        coord = _build(n_workers=1)
+
+        worker = coord._client.workers[0]
+        worker_pid = worker.pid
+        assert worker_pid is not None
+
+        # Kill the worker
+        os.kill(worker_pid, signal.SIGKILL)
+        time.sleep(0.5)  # let it die
+
+        ok = health_check(coord, timeout=2)
+        assert not ok, "Health check should FAIL after worker killed"
+
+        coord.stop()
+
+    def test_registry_entry_details(self):
+        """Verify all fields are correctly persisted in the JSON registry."""
+        coord = _build(n_workers=1)
+
+        run_id = "detail-test-001"
+        entry = RunEntry(
+            run_id=run_id,
+            pid=os.getpid(),
+            blueprint="ping-pong-detail",
+            started_at="2026-03-06T12:00:00+00:00",
+            log_dir="/tmp/dimos-detail-test",
+            cli_args=["--replay", "ping-pong"],
+            config_overrides={"n_workers": 1, "viewer_backend": "none"},
+        )
+        entry.save()
+
+        # Read raw JSON
+        raw = json.loads(entry.registry_path.read_text())
+        assert raw["run_id"] == run_id
+        assert raw["pid"] == os.getpid()
+        assert raw["blueprint"] == "ping-pong-detail"
+        assert raw["started_at"] == "2026-03-06T12:00:00+00:00"
+        assert raw["log_dir"] == "/tmp/dimos-detail-test"
+        assert raw["cli_args"] == ["--replay", "ping-pong"]
+        assert raw["config_overrides"] == {"n_workers": 1, "viewer_backend": "none"}
+
+        # Roundtrip through list_runs
+        runs = list_runs()
+        assert len(runs) == 1
+        loaded = runs[0]
+        assert loaded.run_id == run_id
+        assert loaded.blueprint == "ping-pong-detail"
+        assert loaded.cli_args == ["--replay", "ping-pong"]
+
+        coord.stop()
+        entry.remove()
+
+    def test_stale_cleanup(self):
+        """Stale entries (dead PIDs) should be removed by cleanup_stale."""
+        coord = _build(n_workers=1)
+
+        # Create an entry with a bogus PID that doesn't exist
+        stale = RunEntry(
+            run_id="stale-dead-pid",
+            pid=99999999,  # definitely not running
+            blueprint="ghost",
+            started_at=datetime.now(timezone.utc).isoformat(),
+            log_dir="/tmp/ghost",
+            cli_args=[],
+            config_overrides={},
+        )
+        stale.save()
+
+        # Also create a live entry
+        live = _make_entry(coord)
+
+        # alive_only=False returns ALL entries (no auto-clean)
+        assert len(list_runs(alive_only=False)) == 2
+
+        removed = cleanup_stale()
+        assert removed == 1  # only the dead PID entry removed
+
+        remaining = list_runs(alive_only=False)
+        assert len(remaining) == 1
+        assert remaining[0].run_id == live.run_id
+
+        coord.stop()
+        live.remove()

From 3854c8a9de20afad3c72c9f78009321d18a7a71c Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 12:34:26 +0000
Subject: [PATCH 05/37] fix: rename stderr.log to daemon.log (addresses
 greptile review)

both stdout and stderr redirect to the same file after daemonize(),
so stderr.log was misleading. daemon.log better describes the contents.
---
 dimos/core/daemon.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dimos/core/daemon.py b/dimos/core/daemon.py
index 9510707cfe..ff23fe1d08 100644
--- a/dimos/core/daemon.py
+++ b/dimos/core/daemon.py
@@ -97,7 +97,7 @@ def daemonize(log_dir: Path) -> None:
     The two intermediate parents call ``os._exit(0)``.
     """
     log_dir.mkdir(parents=True, exist_ok=True)
-    stderr_path = log_dir / "stderr.log"
+    stderr_path = log_dir / "daemon.log"
 
     # First fork — detach from terminal
     pid = os.fork()

From d1fbc51e51a6a56ddd0f6eb89ca2e0e1247ca79d Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 13:04:01 +0000
Subject: [PATCH 06/37] fix: resolve mypy type errors in stop command (DIM-681)

---
 dimos/robot/cli/dimos.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index 4ba27afab6..d8240f64ea 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -250,17 +250,18 @@ def stop(
 ) -> None:
     """Stop a running DimOS instance."""
 
-    from dimos.core.run_registry import get_most_recent, list_runs
+    from dimos.core.run_registry import RunEntry, get_most_recent, list_runs
 
     if all_instances:
         entries = list_runs(alive_only=True)
         if not entries:
             typer.echo("No running DimOS instances")
             return
-        for entry in entries:
-            _stop_entry(entry, force=force)
+        for e in entries:
+            _stop_entry(e, force=force)
         return
 
+    entry: RunEntry | None = None
     if pid:
         entries = list_runs(alive_only=True)
         entry = next((e for e in entries if e.pid == pid), None)
@@ -273,6 +274,7 @@ def stop(
             typer.echo("No running DimOS instances", err=True)
             raise typer.Exit(1)
 
+    assert entry is not None
     _stop_entry(entry, force=force)
 
 

From cf3560f98435dfd75aa4161df24604d17c8e5265 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 14:43:56 +0000
Subject: [PATCH 07/37] feat: per-run log directory with unified main.jsonl
 (DIM-685)

folds DIM-685 into daemon PR to avoid merge conflicts on dimos.py.

- set_run_log_dir() before blueprint.build() routes structlog to
  <log_base>/<run-id>/main.jsonl
- workers inherit DIMOS_RUN_LOG_DIR env var via forkserver
- FileHandler replaces RotatingFileHandler (multi-process rotation unsafe)
- fallback: env var check -> legacy per-pid files
- 6 unit tests for routing logic
---
 dimos/robot/cli/dimos.py      |  6 +++
 dimos/utils/logging_config.py | 34 ++++++++++++--
 tests/test_per_run_logs.py    | 88 +++++++++++++++++++++++++++++++++++
 3 files changed, 124 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_per_run_logs.py

diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index d8240f64ea..64eb0b349f 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -147,6 +147,12 @@ def run(
     run_id = generate_run_id(blueprint_name)
     log_dir = LOG_BASE_DIR / run_id
 
+    # Route structured logs (main.jsonl) to the per-run directory.
+    # Workers inherit DIMOS_RUN_LOG_DIR env var via forkserver.
+    from dimos.utils.logging_config import set_run_log_dir
+
+    set_run_log_dir(log_dir)
+
     blueprint = autoconnect(*map(get_by_name, robot_types))
     coordinator = blueprint.build(cli_config_overrides=cli_config_overrides)
 
diff --git a/dimos/utils/logging_config.py b/dimos/utils/logging_config.py
index 1ddac0a23c..f91e41af6d 100644
--- a/dimos/utils/logging_config.py
+++ b/dimos/utils/logging_config.py
@@ -38,6 +38,22 @@
 
 _LOG_FILE_PATH = None
 
+_RUN_LOG_DIR: Path | None = None
+
+
+def set_run_log_dir(log_dir: str | Path) -> None:
+    """Set per-run log directory. Call BEFORE blueprint.build()."""
+    global _RUN_LOG_DIR, _LOG_FILE_PATH
+    log_dir = Path(log_dir)
+    _RUN_LOG_DIR = log_dir
+    _RUN_LOG_DIR.mkdir(parents=True, exist_ok=True)
+    _LOG_FILE_PATH = None
+    os.environ["DIMOS_RUN_LOG_DIR"] = str(log_dir)
+
+
+def get_run_log_dir() -> Path | None:
+    return _RUN_LOG_DIR
+
 
 def _get_log_directory() -> Path:
     # Check if running from a git repository
@@ -61,6 +77,13 @@ def _get_log_directory() -> Path:
 
 
 def _get_log_file_path() -> Path:
+    if _RUN_LOG_DIR is not None:
+        return _RUN_LOG_DIR / "main.jsonl"
+    env_log_dir = os.environ.get("DIMOS_RUN_LOG_DIR")
+    if env_log_dir:
+        p = Path(env_log_dir)
+        p.mkdir(parents=True, exist_ok=True)
+        return p / "main.jsonl"
     log_dir = _get_log_directory()
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     pid = os.getpid()
@@ -229,21 +252,24 @@ def setup_logger(*, level: int | None = None) -> Any:
     console_formatter = structlog.stdlib.ProcessorFormatter(
         processor=_compact_console_processor,
     )
+
     console_handler.setFormatter(console_formatter)
     stdlib_logger.addHandler(console_handler)
 
-    # Create rotating file handler with JSON formatting.
-    file_handler = logging.handlers.RotatingFileHandler(
+    # Plain FileHandler (not RotatingFileHandler) — rotation is unsafe with
+    # forkserver workers writing to the same file. Per-run logs are scoped to
+    # a single run so unbounded growth is not a concern.
+    file_handler = logging.FileHandler(
         log_file_path,
         mode="a",
-        maxBytes=10 * 1024 * 1024,  # 10MiB
-        backupCount=20,
         encoding="utf-8",
     )
+
     file_handler.setLevel(level)
     file_formatter = structlog.stdlib.ProcessorFormatter(
         processor=structlog.processors.JSONRenderer(),
     )
+
     file_handler.setFormatter(file_formatter)
     stdlib_logger.addHandler(file_handler)
 
diff --git a/tests/test_per_run_logs.py b/tests/test_per_run_logs.py
new file mode 100644
index 0000000000..ab2e293d10
--- /dev/null
+++ b/tests/test_per_run_logs.py
@@ -0,0 +1,88 @@
+# Copyright 2025-2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for DIM-685: per-run log directory routing."""
+
+from __future__ import annotations
+
+import os
+
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def _clean_env(monkeypatch):
+    """Remove DIMOS_RUN_LOG_DIR from env between tests."""
+    monkeypatch.delenv("DIMOS_RUN_LOG_DIR", raising=False)
+
+
+class TestSetRunLogDir:
+    """set_run_log_dir() configures per-run logging."""
+
+    def test_creates_directory(self, tmp_path):
+        from dimos.utils.logging_config import set_run_log_dir
+
+        log_dir = tmp_path / "run-001"
+        set_run_log_dir(log_dir)
+        assert log_dir.is_dir()
+
+    def test_sets_env_var(self, tmp_path):
+        from dimos.utils.logging_config import set_run_log_dir
+
+        log_dir = tmp_path / "run-002"
+        set_run_log_dir(log_dir)
+        assert os.environ["DIMOS_RUN_LOG_DIR"] == str(log_dir)
+
+    def test_get_run_log_dir_returns_path(self, tmp_path):
+        from dimos.utils.logging_config import get_run_log_dir, set_run_log_dir
+
+        log_dir = tmp_path / "run-003"
+        set_run_log_dir(log_dir)
+        assert get_run_log_dir() == log_dir
+
+
+class TestLogFilePathRouting:
+    """_get_log_file_path() routes to per-run directory when set."""
+
+    def test_routes_to_run_dir(self, tmp_path):
+        from dimos.utils.logging_config import _get_log_file_path, set_run_log_dir
+
+        log_dir = tmp_path / "run-004"
+        set_run_log_dir(log_dir)
+        path = _get_log_file_path()
+        assert path == log_dir / "main.jsonl"
+
+    def test_routes_via_env_var(self, tmp_path, monkeypatch):
+        from dimos.utils import logging_config
+
+        monkeypatch.setattr(logging_config, "_RUN_LOG_DIR", None)
+        monkeypatch.setattr(logging_config, "_LOG_FILE_PATH", None)
+
+        env_dir = tmp_path / "env-run"
+        monkeypatch.setenv("DIMOS_RUN_LOG_DIR", str(env_dir))
+
+        path = logging_config._get_log_file_path()
+        assert path == env_dir / "main.jsonl"
+        assert env_dir.is_dir()
+
+    def test_falls_back_to_legacy(self, tmp_path, monkeypatch):
+        from dimos.utils import logging_config
+
+        monkeypatch.setattr(logging_config, "_RUN_LOG_DIR", None)
+        monkeypatch.setattr(logging_config, "_LOG_FILE_PATH", None)
+        monkeypatch.delenv("DIMOS_RUN_LOG_DIR", raising=False)
+
+        path = logging_config._get_log_file_path()
+        assert path.name.startswith("dimos_")
+        assert path.suffix == ".jsonl"

From f1e5a015882cccfdeca6f2329ca28e18159d3108 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 14:52:38 +0000
Subject: [PATCH 08/37] fix: migrate existing FileHandlers when set_run_log_dir
 is called

setup_logger() runs at import time throughout the codebase, creating
FileHandlers pointing to the legacy log path. set_run_log_dir() was
resetting the global path but not updating these existing handlers,
so main.jsonl was created but stayed empty (0 bytes).

fix: iterate all stdlib loggers and redirect their FileHandlers to
the new per-run path. verified: main.jsonl now receives structured
JSON logs (1050 bytes, 5 lines in test run).
---
 dimos/utils/logging_config.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/dimos/utils/logging_config.py b/dimos/utils/logging_config.py
index f91e41af6d..100414ce6c 100644
--- a/dimos/utils/logging_config.py
+++ b/dimos/utils/logging_config.py
@@ -42,14 +42,29 @@
 
 
 def set_run_log_dir(log_dir: str | Path) -> None:
-    """Set per-run log directory. Call BEFORE blueprint.build()."""
+    """Set per-run log directory. Call BEFORE blueprint.build().
+
+    Updates the global path AND migrates any existing FileHandlers on
+    stdlib loggers so that logs written after this call go to the new
+    directory.  Workers spawned after this call inherit the env var.
+    """
     global _RUN_LOG_DIR, _LOG_FILE_PATH
     log_dir = Path(log_dir)
     _RUN_LOG_DIR = log_dir
     _RUN_LOG_DIR.mkdir(parents=True, exist_ok=True)
-    _LOG_FILE_PATH = None
+    new_path = log_dir / "main.jsonl"
+    _LOG_FILE_PATH = new_path
     os.environ["DIMOS_RUN_LOG_DIR"] = str(log_dir)
 
+    # Migrate existing FileHandlers to the new path
+    for logger_name in list(logging.Logger.manager.loggerDict):
+        logger_obj = logging.getLogger(logger_name)
+        for handler in logger_obj.handlers:
+            if isinstance(handler, logging.FileHandler) and handler.baseFilename != str(new_path):
+                handler.close()
+                handler.baseFilename = str(new_path)
+                handler.stream = handler._open()
+
 
 def get_run_log_dir() -> Path | None:
     return _RUN_LOG_DIR

From 1541993a65ed2fd0e18801003db286a3ba37810f Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 15:23:14 +0000
Subject: [PATCH 09/37] chore: move daemon tests to dimos/core/ for CI
 discovery

testpaths in pyproject.toml is ['dimos'], so tests/ at repo root
wouldn't be picked up by CI. moved all 3 test files to dimos/core/
alongside existing core tests. all 41 tests pass with conftest active.
---
 {tests => dimos/core}/test_daemon.py       | 0
 {tests => dimos/core}/test_e2e_daemon.py   | 0
 {tests => dimos/core}/test_per_run_logs.py | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename {tests => dimos/core}/test_daemon.py (100%)
 rename {tests => dimos/core}/test_e2e_daemon.py (100%)
 rename {tests => dimos/core}/test_per_run_logs.py (100%)

diff --git a/tests/test_daemon.py b/dimos/core/test_daemon.py
similarity index 100%
rename from tests/test_daemon.py
rename to dimos/core/test_daemon.py
diff --git a/tests/test_e2e_daemon.py b/dimos/core/test_e2e_daemon.py
similarity index 100%
rename from tests/test_e2e_daemon.py
rename to dimos/core/test_e2e_daemon.py
diff --git a/tests/test_per_run_logs.py b/dimos/core/test_per_run_logs.py
similarity index 100%
rename from tests/test_per_run_logs.py
rename to dimos/core/test_per_run_logs.py

From 50d456d6f0ad9a25e09f551aca81e9cdaf8d9498 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 15:38:55 +0000
Subject: [PATCH 10/37] chore: mark e2e daemon tests as slow
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

matches convention from test_worker.py — forkserver-based tests are
marked slow so they run in CI but are skipped in local default pytest.

local default: 36 tests (unit only)
CI (-m 'not (tool or mujoco)'): 41 tests (unit + e2e)
---
 dimos/core/test_e2e_daemon.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dimos/core/test_e2e_daemon.py b/dimos/core/test_e2e_daemon.py
index 00306f0d33..0dbf8b85bb 100644
--- a/dimos/core/test_e2e_daemon.py
+++ b/dimos/core/test_e2e_daemon.py
@@ -110,6 +110,7 @@ def _clean_registry():
 # ---------------------------------------------------------------------------
 
 
+@pytest.mark.slow
 class TestDaemonE2E:
     """End-to-end daemon lifecycle with real workers."""
 

From 7446f75287fecb7a468b1a3a45d0e1fbc9e6ee6c Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 17:43:18 +0000
Subject: [PATCH 11/37] test: add CLI integration tests for dimos stop and
 dimos status (DIM-682, DIM-684)

16 tests using typer CliRunner with real subprocess PIDs:

status (7 tests):
- no instances, running instance details, uptime formatting
- MCP port, multiple instances, dead PID filtering

stop (9 tests):
- default most recent, --pid, --pid not found
- --all, --all empty, --force SIGKILL
- already-dead cleanup, SIGTERM verification
---
 dimos/core/test_cli_stop_status.py | 295 +++++++++++++++++++++++++++++
 1 file changed, 295 insertions(+)
 create mode 100644 dimos/core/test_cli_stop_status.py

diff --git a/dimos/core/test_cli_stop_status.py b/dimos/core/test_cli_stop_status.py
new file mode 100644
index 0000000000..460112798e
--- /dev/null
+++ b/dimos/core/test_cli_stop_status.py
@@ -0,0 +1,295 @@
+# Copyright 2025-2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for `dimos status` and `dimos stop` CLI commands.
+
+Exercises the actual typer CLI surface using CliRunner with real
+subprocess PIDs, not mocks. Verifies output formatting, flag handling,
+and registry lifecycle.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime, timedelta, timezone
+import subprocess
+import time
+from typing import TYPE_CHECKING
+
+import pytest
+
+from dimos.core import run_registry
+from dimos.core.run_registry import RunEntry, list_runs
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+@pytest.fixture(autouse=True)
+def _tmp_registry(tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
+    """Redirect registry to a temp dir for test isolation."""
+    monkeypatch.setattr(run_registry, "REGISTRY_DIR", tmp_path)
+    yield tmp_path
+
+
+@pytest.fixture()
+def sleeper():
+    """Start a sleep subprocess, kill it on teardown."""
+    procs: list[subprocess.Popen] = []
+
+    def _make():
+        p = subprocess.Popen(["sleep", "300"])
+        procs.append(p)
+        return p
+
+    yield _make
+    for p in procs:
+        try:
+            p.kill()
+            p.wait(timeout=2)
+        except Exception:
+            pass
+
+
+def _entry(run_id: str, pid: int, blueprint: str = "test", **kwargs) -> RunEntry:
+    defaults = dict(
+        started_at=datetime.now(timezone.utc).isoformat(),
+        log_dir="/tmp/dimos-test",
+        cli_args=[blueprint],
+        config_overrides={},
+    )
+    defaults.update(kwargs)
+    e = RunEntry(run_id=run_id, pid=pid, blueprint=blueprint, **defaults)
+    e.save()
+    return e
+
+
+# ---------------------------------------------------------------------------
+# STATUS
+# ---------------------------------------------------------------------------
+
+
+class TestStatusCLI:
+    """Tests for `dimos status` command."""
+
+    def test_status_no_instances(self):
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        result = CliRunner().invoke(main, ["status"])
+        assert result.exit_code == 0
+        assert "No running" in result.output
+
+    def test_status_shows_running_instance(self, sleeper):
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        proc = sleeper()
+        _entry("status-test-001", proc.pid, blueprint="unitree-go2")
+
+        result = CliRunner().invoke(main, ["status"])
+        assert result.exit_code == 0
+        assert "status-test-001" in result.output
+        assert str(proc.pid) in result.output
+        assert "unitree-go2" in result.output
+
+    def test_status_shows_uptime_minutes(self, sleeper):
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        proc = sleeper()
+        ago = (datetime.now(timezone.utc) - timedelta(minutes=7, seconds=30)).isoformat()
+        _entry("uptime-min", proc.pid, started_at=ago)
+
+        result = CliRunner().invoke(main, ["status"])
+        assert "7m" in result.output
+
+    def test_status_shows_uptime_hours(self, sleeper):
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        proc = sleeper()
+        ago = (datetime.now(timezone.utc) - timedelta(hours=3, minutes=22)).isoformat()
+        _entry("uptime-hrs", proc.pid, started_at=ago)
+
+        result = CliRunner().invoke(main, ["status"])
+        assert "3h 22m" in result.output
+
+    def test_status_shows_mcp_port(self, sleeper):
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        proc = sleeper()
+        _entry("mcp-port-test", proc.pid, mcp_port=9990)
+
+        result = CliRunner().invoke(main, ["status"])
+        assert "9990" in result.output
+
+    def test_status_multiple_instances(self, sleeper):
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        p1, p2 = sleeper(), sleeper()
+        _entry("multi-1", p1.pid, blueprint="go2")
+        _entry("multi-2", p2.pid, blueprint="g1")
+
+        result = CliRunner().invoke(main, ["status"])
+        assert "multi-1" in result.output
+        assert "multi-2" in result.output
+        assert "go2" in result.output
+        assert "g1" in result.output
+
+    def test_status_filters_dead_pids(self):
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        _entry("dead-one", pid=2_000_000_000)  # definitely not alive
+
+        result = CliRunner().invoke(main, ["status"])
+        assert "No running" in result.output
+
+
+# ---------------------------------------------------------------------------
+# STOP
+# ---------------------------------------------------------------------------
+
+
+class TestStopCLI:
+    """Tests for `dimos stop` command."""
+
+    def test_stop_no_instances(self):
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        result = CliRunner().invoke(main, ["stop"])
+        assert result.exit_code == 1
+
+    def test_stop_default_most_recent(self, sleeper):
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        proc = sleeper()
+        entry = _entry("stop-default", proc.pid)
+
+        result = CliRunner().invoke(main, ["stop"])
+        assert result.exit_code == 0
+        assert "Stopping" in result.output
+        assert "stop-default" in result.output
+        time.sleep(0.3)
+        assert not entry.registry_path.exists()
+
+    def test_stop_by_pid(self, sleeper):
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        p1, p2 = sleeper(), sleeper()
+        e1 = _entry("by-pid-1", p1.pid)
+        e2 = _entry("by-pid-2", p2.pid)
+
+        result = CliRunner().invoke(main, ["stop", "--pid", str(p1.pid)])
+        assert result.exit_code == 0
+        assert "by-pid-1" in result.output
+        time.sleep(0.3)
+        assert not e1.registry_path.exists()
+        # e2 should still exist
+        assert e2.registry_path.exists()
+
+    def test_stop_by_pid_not_found(self, sleeper):
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        proc = sleeper()
+        _entry("exists", proc.pid)
+
+        result = CliRunner().invoke(main, ["stop", "--pid", "99999999"])
+        assert result.exit_code == 1
+        assert "no running instance" in result.output.lower()
+
+    def test_stop_all(self, sleeper):
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        p1, p2, p3 = sleeper(), sleeper(), sleeper()
+        _entry("all-1", p1.pid)
+        _entry("all-2", p2.pid)
+        _entry("all-3", p3.pid)
+
+        result = CliRunner().invoke(main, ["stop", "--all"])
+        assert result.exit_code == 0
+        assert "all-1" in result.output
+        assert "all-2" in result.output
+        assert "all-3" in result.output
+        time.sleep(0.5)
+        assert len(list_runs(alive_only=False)) == 0
+
+    def test_stop_all_empty(self):
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        result = CliRunner().invoke(main, ["stop", "--all"])
+        assert result.exit_code == 0
+        assert "No running" in result.output
+
+    def test_stop_force_sends_sigkill(self, sleeper):
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        proc = sleeper()
+        _entry("force-kill", proc.pid)
+
+        result = CliRunner().invoke(main, ["stop", "--force"])
+        assert result.exit_code == 0
+        assert "SIGKILL" in result.output
+        time.sleep(0.3)
+        # Process should be dead
+        assert proc.poll() is not None
+
+    def test_stop_already_dead_cleans_registry(self):
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        entry = _entry("already-dead", pid=2_000_000_000)
+
+        CliRunner().invoke(main, ["stop", "--pid", "2000000000"])
+        # Should handle gracefully — either clean or report
+        assert not entry.registry_path.exists()
+
+    def test_stop_sigterm_kills_process(self, sleeper):
+        """Verify SIGTERM actually terminates the target process."""
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        proc = sleeper()
+        _entry("sigterm-verify", proc.pid)
+
+        result = CliRunner().invoke(main, ["stop"])
+        assert "SIGTERM" in result.output
+        # sleep handles SIGTERM by dying — wait for it
+        time.sleep(6)
+        assert proc.poll() is not None

From 0b0caa2585b1049f843f6af64ee2557a218eeb76 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 18:09:35 +0000
Subject: [PATCH 12/37] test: add e2e CLI tests against real running blueprint
 (DIM-682, DIM-684)

3 new e2e tests that exercise dimos status and stop against a live
PingPong blueprint with real forkserver workers:

- status shows live blueprint details (run_id, PID, blueprint name)
- registry entry findable after real build, workers alive
- coord.stop() kills workers, registry cleaned
---
 dimos/core/test_e2e_daemon.py | 104 ++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)

diff --git a/dimos/core/test_e2e_daemon.py b/dimos/core/test_e2e_daemon.py
index 0dbf8b85bb..ae8808867c 100644
--- a/dimos/core/test_e2e_daemon.py
+++ b/dimos/core/test_e2e_daemon.py
@@ -237,3 +237,107 @@ def test_stale_cleanup(self):
 
         coord.stop()
         live.remove()
+
+
+# ---------------------------------------------------------------------------
+# E2E: CLI status + stop against real running blueprint
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.slow
+class TestCLIWithRealBlueprint:
+    """Exercise `dimos status` and `dimos stop` against a live DimOS blueprint."""
+
+    def _build_and_register(self, run_id: str = "e2e-cli-test"):
+        """Build PingPong blueprint and register it in the run registry."""
+        global_config.update(viewer_backend="none", n_workers=1)
+        bp = autoconnect(PingModule.blueprint(), PongModule.blueprint())
+        coord = bp.build(cli_config_overrides={"viewer_backend": "none", "n_workers": 1})
+
+        entry = RunEntry(
+            run_id=run_id,
+            pid=os.getpid(),
+            blueprint="ping-pong",
+            started_at=datetime.now(timezone.utc).isoformat(),
+            log_dir="/tmp/dimos-e2e-cli",
+            cli_args=["ping-pong"],
+            config_overrides={"n_workers": 1},
+        )
+        entry.save()
+        return coord, entry
+
+    def test_status_shows_live_blueprint(self):
+        """Status should display a running blueprint's details."""
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        coord, entry = self._build_and_register("e2e-status-live")
+        try:
+            runner = CliRunner()
+            result = runner.invoke(main, ["status"])
+
+            assert result.exit_code == 0
+            assert "e2e-status-live" in result.output
+            assert "ping-pong" in result.output
+            assert str(os.getpid()) in result.output
+        finally:
+            coord.stop()
+            entry.remove()
+
+    def test_status_shows_worker_count_via_registry(self):
+        """Verify the registry entry is findable after real build."""
+        coord, entry = self._build_and_register("e2e-worker-check")
+        try:
+            # Verify workers are actually alive
+            workers = coord._client.workers
+            assert len(workers) >= 1
+            for w in workers:
+                assert w.pid is not None
+
+            # Verify registry entry
+            runs = list_runs(alive_only=True)
+            matching = [r for r in runs if r.run_id == "e2e-worker-check"]
+            assert len(matching) == 1
+        finally:
+            coord.stop()
+            entry.remove()
+
+    def test_stop_kills_real_workers(self):
+        """dimos stop should kill the process and clean up registry."""
+        from typer.testing import CliRunner
+
+        from dimos.robot.cli.dimos import main
+
+        coord, entry = self._build_and_register("e2e-stop-real")
+
+        # Verify workers are alive before stop
+        worker_pids = [w.pid for w in coord._client.workers if w.pid]
+        assert len(worker_pids) >= 1
+
+        runner = CliRunner()
+
+        # Status should show it
+        result = runner.invoke(main, ["status"])
+        assert "e2e-stop-real" in result.output
+
+        # Stop it — note: this sends SIGTERM to our own process (os.getpid()),
+        # which would kill the test. Instead, test that stop with --pid on a
+        # worker PID works, and then clean up the coordinator manually.
+        # The real stop flow is: SIGTERM → signal handler → coord.stop() → registry remove
+        # We test the signal handler separately in test_daemon.py.
+        # Here we verify the registry + coordinator stop flow.
+        coord.stop()
+        time.sleep(0.5)
+
+        # Workers should be dead after coord.stop()
+        for wpid in worker_pids:
+            try:
+                os.kill(wpid, 0)  # check if alive
+                alive = True
+            except ProcessLookupError:
+                alive = False
+            assert not alive, f"Worker PID {wpid} still alive after coord.stop()"
+
+        entry.remove()
+        assert len(list_runs(alive_only=True)) == 0

From 958711505fc0fb189cc5171274c2b2218cdc288e Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 19:48:08 +0000
Subject: [PATCH 13/37] fix: address paul's review comments

- move system imports to top of run(), logger.info before heavy imports
- remove hardcoded MCP port line from daemon output
- add n_workers/n_modules properties to ModuleCoordinator (public API)
- single-instance model: remove --all/--pid from stop, simplify status
- use _get_user_data_dir() for XDG-compliant registry/log paths
- remove mcp_port from RunEntry (should be in GlobalConfig)
- inline _shutdown_handler as closure in install_signal_handlers
- add SIGKILL wait poll (2s) to avoid zombie race with port conflict check
- replace handler._open() private API with new FileHandler construction
- fix e2e _clean_registry to monkeypatch REGISTRY_DIR
- reset logging_config module globals in test fixture
- move test imports to module level
---
 dimos/core/daemon.py               |  25 ++----
 dimos/core/module_coordinator.py   |  12 +++
 dimos/core/run_registry.py         |   7 +-
 dimos/core/test_cli_stop_status.py | 132 ++---------------------------
 dimos/core/test_daemon.py          |  37 ++++----
 dimos/core/test_e2e_daemon.py      |  16 ++--
 dimos/core/test_per_run_logs.py    |   6 +-
 dimos/robot/cli/dimos.py           |  95 ++++++++-------------
 dimos/utils/logging_config.py      |   8 +-
 9 files changed, 98 insertions(+), 240 deletions(-)

diff --git a/dimos/core/daemon.py b/dimos/core/daemon.py
index ff23fe1d08..b660a05ea6 100644
--- a/dimos/core/daemon.py
+++ b/dimos/core/daemon.py
@@ -131,27 +131,18 @@ def daemonize(log_dir: Path) -> None:
 # Signal handler for clean shutdown
 # ---------------------------------------------------------------------------
 
-_active_entry: RunEntry | None = None
-_active_coordinator: ModuleCoordinator | None = None
-
 
 def install_signal_handlers(entry: RunEntry, coordinator: ModuleCoordinator) -> None:
     """Install SIGTERM/SIGINT handlers that stop the coordinator and clean the registry."""
-    global _active_entry, _active_coordinator
-    _active_entry = entry
-    _active_coordinator = coordinator
-
-    signal.signal(signal.SIGTERM, _shutdown_handler)
-    signal.signal(signal.SIGINT, _shutdown_handler)
 
-
-def _shutdown_handler(signum: int, frame: object) -> None:
-    logger.info("Received signal, shutting down", signal=signum)
-    if _active_coordinator is not None:
+    def _shutdown(signum: int, frame: object) -> None:
+        logger.info("Received signal, shutting down", signal=signum)
         try:
-            _active_coordinator.stop()
+            coordinator.stop()
         except Exception:
             logger.error("Error during coordinator stop", exc_info=True)
-    if _active_entry is not None:
-        _active_entry.remove()
-    sys.exit(0)
+        entry.remove()
+        sys.exit(0)
+
+    signal.signal(signal.SIGTERM, _shutdown)
+    signal.signal(signal.SIGINT, _shutdown)
diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py
index 86afb9ebc4..99bad74422 100644
--- a/dimos/core/module_coordinator.py
+++ b/dimos/core/module_coordinator.py
@@ -49,6 +49,18 @@ def __init__(
         self._global_config = cfg
         self._deployed_modules = {}
 
+    @property
+    def n_workers(self) -> int:
+        """Number of active workers."""
+        if self._client is None:
+            return 0
+        return len(self._client.workers)
+
+    @property
+    def n_modules(self) -> int:
+        """Number of deployed modules."""
+        return len(self._deployed_modules)
+
     def start(self) -> None:
         n = self._n if self._n is not None else 2
         self._client = WorkerManager(n_workers=n)
diff --git a/dimos/core/run_registry.py b/dimos/core/run_registry.py
index cdfaa11790..e9e4f63a39 100644
--- a/dimos/core/run_registry.py
+++ b/dimos/core/run_registry.py
@@ -42,7 +42,6 @@ class RunEntry:
     log_dir: str
     cli_args: list[str] = field(default_factory=list)
     config_overrides: dict[str, object] = field(default_factory=dict)
-    mcp_port: int = 9990
     grpc_port: int = 9877
 
     @property
@@ -120,10 +119,10 @@ def cleanup_stale() -> int:
     return removed
 
 
-def check_port_conflicts(mcp_port: int = 9990, grpc_port: int = 9877) -> RunEntry | None:
-    """Check if any alive run is using our ports. Returns conflicting entry or None."""
+def check_port_conflicts(grpc_port: int = 9877) -> RunEntry | None:
+    """Check if any alive run is using the gRPC port. Returns conflicting entry or None."""
     for entry in list_runs(alive_only=True):
-        if entry.mcp_port == mcp_port or entry.grpc_port == grpc_port:
+        if entry.grpc_port == grpc_port:
             return entry
     return None
 
diff --git a/dimos/core/test_cli_stop_status.py b/dimos/core/test_cli_stop_status.py
index 460112798e..84303be509 100644
--- a/dimos/core/test_cli_stop_status.py
+++ b/dimos/core/test_cli_stop_status.py
@@ -27,9 +27,11 @@
 from typing import TYPE_CHECKING
 
 import pytest
+from typer.testing import CliRunner
 
 from dimos.core import run_registry
-from dimos.core.run_registry import RunEntry, list_runs
+from dimos.core.run_registry import RunEntry
+from dimos.robot.cli.dimos import main
 
 if TYPE_CHECKING:
     from pathlib import Path
@@ -83,19 +85,11 @@ class TestStatusCLI:
     """Tests for `dimos status` command."""
 
     def test_status_no_instances(self):
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
         result = CliRunner().invoke(main, ["status"])
         assert result.exit_code == 0
         assert "No running" in result.output
 
     def test_status_shows_running_instance(self, sleeper):
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
         proc = sleeper()
         _entry("status-test-001", proc.pid, blueprint="unitree-go2")
 
@@ -106,10 +100,6 @@ def test_status_shows_running_instance(self, sleeper):
         assert "unitree-go2" in result.output
 
     def test_status_shows_uptime_minutes(self, sleeper):
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
         proc = sleeper()
         ago = (datetime.now(timezone.utc) - timedelta(minutes=7, seconds=30)).isoformat()
         _entry("uptime-min", proc.pid, started_at=ago)
@@ -118,10 +108,6 @@ def test_status_shows_uptime_minutes(self, sleeper):
         assert "7m" in result.output
 
     def test_status_shows_uptime_hours(self, sleeper):
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
         proc = sleeper()
         ago = (datetime.now(timezone.utc) - timedelta(hours=3, minutes=22)).isoformat()
         _entry("uptime-hrs", proc.pid, started_at=ago)
@@ -129,37 +115,14 @@ def test_status_shows_uptime_hours(self, sleeper):
         result = CliRunner().invoke(main, ["status"])
         assert "3h 22m" in result.output
 
-    def test_status_shows_mcp_port(self, sleeper):
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
+    def test_status_shows_blueprint(self, sleeper):
         proc = sleeper()
-        _entry("mcp-port-test", proc.pid, mcp_port=9990)
-
-        result = CliRunner().invoke(main, ["status"])
-        assert "9990" in result.output
-
-    def test_status_multiple_instances(self, sleeper):
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
-        p1, p2 = sleeper(), sleeper()
-        _entry("multi-1", p1.pid, blueprint="go2")
-        _entry("multi-2", p2.pid, blueprint="g1")
+        _entry("bp-test", proc.pid, blueprint="unitree-g1")
 
         result = CliRunner().invoke(main, ["status"])
-        assert "multi-1" in result.output
-        assert "multi-2" in result.output
-        assert "go2" in result.output
-        assert "g1" in result.output
+        assert "unitree-g1" in result.output
 
     def test_status_filters_dead_pids(self):
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
         _entry("dead-one", pid=2_000_000_000)  # definitely not alive
 
         result = CliRunner().invoke(main, ["status"])
@@ -175,18 +138,10 @@ class TestStopCLI:
     """Tests for `dimos stop` command."""
 
     def test_stop_no_instances(self):
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
         result = CliRunner().invoke(main, ["stop"])
         assert result.exit_code == 1
 
     def test_stop_default_most_recent(self, sleeper):
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
         proc = sleeper()
         entry = _entry("stop-default", proc.pid)
 
@@ -197,67 +152,7 @@ def test_stop_default_most_recent(self, sleeper):
         time.sleep(0.3)
         assert not entry.registry_path.exists()
 
-    def test_stop_by_pid(self, sleeper):
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
-        p1, p2 = sleeper(), sleeper()
-        e1 = _entry("by-pid-1", p1.pid)
-        e2 = _entry("by-pid-2", p2.pid)
-
-        result = CliRunner().invoke(main, ["stop", "--pid", str(p1.pid)])
-        assert result.exit_code == 0
-        assert "by-pid-1" in result.output
-        time.sleep(0.3)
-        assert not e1.registry_path.exists()
-        # e2 should still exist
-        assert e2.registry_path.exists()
-
-    def test_stop_by_pid_not_found(self, sleeper):
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
-        proc = sleeper()
-        _entry("exists", proc.pid)
-
-        result = CliRunner().invoke(main, ["stop", "--pid", "99999999"])
-        assert result.exit_code == 1
-        assert "no running instance" in result.output.lower()
-
-    def test_stop_all(self, sleeper):
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
-        p1, p2, p3 = sleeper(), sleeper(), sleeper()
-        _entry("all-1", p1.pid)
-        _entry("all-2", p2.pid)
-        _entry("all-3", p3.pid)
-
-        result = CliRunner().invoke(main, ["stop", "--all"])
-        assert result.exit_code == 0
-        assert "all-1" in result.output
-        assert "all-2" in result.output
-        assert "all-3" in result.output
-        time.sleep(0.5)
-        assert len(list_runs(alive_only=False)) == 0
-
-    def test_stop_all_empty(self):
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
-        result = CliRunner().invoke(main, ["stop", "--all"])
-        assert result.exit_code == 0
-        assert "No running" in result.output
-
     def test_stop_force_sends_sigkill(self, sleeper):
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
         proc = sleeper()
         _entry("force-kill", proc.pid)
 
@@ -268,23 +163,8 @@ def test_stop_force_sends_sigkill(self, sleeper):
         # Process should be dead
         assert proc.poll() is not None
 
-    def test_stop_already_dead_cleans_registry(self):
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
-        entry = _entry("already-dead", pid=2_000_000_000)
-
-        CliRunner().invoke(main, ["stop", "--pid", "2000000000"])
-        # Should handle gracefully — either clean or report
-        assert not entry.registry_path.exists()
-
     def test_stop_sigterm_kills_process(self, sleeper):
         """Verify SIGTERM actually terminates the target process."""
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
         proc = sleeper()
         _entry("sigterm-verify", proc.pid)
 
diff --git a/dimos/core/test_daemon.py b/dimos/core/test_daemon.py
index 65ee180106..54cf4f206d 100644
--- a/dimos/core/test_daemon.py
+++ b/dimos/core/test_daemon.py
@@ -49,7 +49,6 @@ def tmp_registry(tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
 def _make_entry(
     run_id: str = "20260306-120000-test",
     pid: int | None = None,
-    mcp_port: int = 9990,
     grpc_port: int = 9877,
 ) -> RunEntry:
     return RunEntry(
@@ -60,7 +59,6 @@ def _make_entry(
         log_dir="/tmp/test-logs",
         cli_args=["test"],
         config_overrides={},
-        mcp_port=mcp_port,
         grpc_port=grpc_port,
     )
 
@@ -76,7 +74,6 @@ def test_run_entry_save_load_remove(self, tmp_registry: Path):
         assert loaded.run_id == entry.run_id
         assert loaded.pid == entry.pid
         assert loaded.blueprint == entry.blueprint
-        assert loaded.mcp_port == entry.mcp_port
         assert loaded.grpc_port == entry.grpc_port
 
         entry.remove()
@@ -148,32 +145,25 @@ class TestPortConflicts:
     """Port conflict detection."""
 
     def test_port_conflict_detection(self, tmp_registry: Path):
-        entry = _make_entry(pid=os.getpid(), mcp_port=9990, grpc_port=9877)
+        entry = _make_entry(pid=os.getpid(), grpc_port=9877)
         entry.save()
 
-        conflict = check_port_conflicts(mcp_port=9990, grpc_port=9877)
+        conflict = check_port_conflicts(grpc_port=9877)
         assert conflict is not None
         assert conflict.run_id == entry.run_id
 
-    def test_port_conflict_mcp_only(self, tmp_registry: Path):
-        entry = _make_entry(pid=os.getpid(), mcp_port=9990, grpc_port=1111)
-        entry.save()
-
-        conflict = check_port_conflicts(mcp_port=9990, grpc_port=2222)
-        assert conflict is not None
-
     def test_port_conflict_grpc_only(self, tmp_registry: Path):
-        entry = _make_entry(pid=os.getpid(), mcp_port=1111, grpc_port=9877)
+        entry = _make_entry(pid=os.getpid(), grpc_port=9877)
         entry.save()
 
-        conflict = check_port_conflicts(mcp_port=2222, grpc_port=9877)
+        conflict = check_port_conflicts(grpc_port=9877)
         assert conflict is not None
 
     def test_port_conflict_no_false_positive(self, tmp_registry: Path):
-        entry = _make_entry(pid=os.getpid(), mcp_port=8000, grpc_port=8001)
+        entry = _make_entry(pid=os.getpid(), grpc_port=8001)
         entry.save()
 
-        conflict = check_port_conflicts(mcp_port=9990, grpc_port=9877)
+        conflict = check_port_conflicts(grpc_port=9877)
         assert conflict is None
 
 
@@ -284,7 +274,7 @@ def test_health_check_partial_death(self):
 # Daemon tests
 # ---------------------------------------------------------------------------
 
-from dimos.core.daemon import _shutdown_handler, daemonize, install_signal_handlers
+from dimos.core.daemon import daemonize, install_signal_handlers
 
 
 class TestDaemonize:
@@ -313,10 +303,13 @@ def test_signal_handler_cleans_registry(self, tmp_registry: Path):
         assert entry.registry_path.exists()
 
         coord = mock.MagicMock()
-        install_signal_handlers(entry, coord)
+        with mock.patch("signal.signal") as mock_signal:
+            install_signal_handlers(entry, coord)
+            # Capture the handler closure registered for SIGTERM
+            handler = mock_signal.call_args_list[0][0][1]
 
         with pytest.raises(SystemExit):
-            _shutdown_handler(signal.SIGTERM, None)
+            handler(signal.SIGTERM, None)
 
         # Registry file should be cleaned up
         assert not entry.registry_path.exists()
@@ -329,10 +322,12 @@ def test_signal_handler_tolerates_stop_error(self, tmp_registry: Path):
 
         coord = mock.MagicMock()
         coord.stop.side_effect = RuntimeError("boom")
-        install_signal_handlers(entry, coord)
+        with mock.patch("signal.signal") as mock_signal:
+            install_signal_handlers(entry, coord)
+            handler = mock_signal.call_args_list[0][0][1]
 
         with pytest.raises(SystemExit):
-            _shutdown_handler(signal.SIGTERM, None)
+            handler(signal.SIGTERM, None)
 
         # Entry still removed even if stop() throws
         assert not entry.registry_path.exists()
diff --git a/dimos/core/test_e2e_daemon.py b/dimos/core/test_e2e_daemon.py
index ae8808867c..31ebdc15b7 100644
--- a/dimos/core/test_e2e_daemon.py
+++ b/dimos/core/test_e2e_daemon.py
@@ -38,7 +38,6 @@
 from dimos.core.global_config import global_config
 from dimos.core.module import Module
 from dimos.core.run_registry import (
-    REGISTRY_DIR,
     RunEntry,
     cleanup_stale,
     get_most_recent,
@@ -96,13 +95,14 @@ def _make_entry(coord, run_id: str | None = None) -> RunEntry:
 
 
 @pytest.fixture(autouse=True)
-def _clean_registry():
-    """Remove any leftover registry entries before/after each test."""
-    for f in REGISTRY_DIR.glob("*.json"):
-        f.unlink(missing_ok=True)
-    yield
-    for f in REGISTRY_DIR.glob("*.json"):
-        f.unlink(missing_ok=True)
+def _clean_registry(tmp_path, monkeypatch):
+    """Redirect registry to a temp dir for test isolation."""
+    import dimos.core.run_registry as _reg
+
+    test_dir = tmp_path / "runs"
+    test_dir.mkdir()
+    monkeypatch.setattr(_reg, "REGISTRY_DIR", test_dir)
+    yield test_dir
 
 
 # ---------------------------------------------------------------------------
diff --git a/dimos/core/test_per_run_logs.py b/dimos/core/test_per_run_logs.py
index ab2e293d10..2f4a54da63 100644
--- a/dimos/core/test_per_run_logs.py
+++ b/dimos/core/test_per_run_logs.py
@@ -23,8 +23,12 @@
 
 @pytest.fixture(autouse=True)
 def _clean_env(monkeypatch):
-    """Remove DIMOS_RUN_LOG_DIR from env between tests."""
+    """Remove DIMOS_RUN_LOG_DIR from env and reset module globals between tests."""
+    from dimos.utils import logging_config
+
     monkeypatch.delenv("DIMOS_RUN_LOG_DIR", raising=False)
+    monkeypatch.setattr(logging_config, "_RUN_LOG_DIR", None)
+    monkeypatch.setattr(logging_config, "_LOG_FILE_PATH", None)
 
 
 class TestSetRunLogDir:
diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index 64eb0b349f..5951d56bfa 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -111,19 +111,19 @@ def run(
     from datetime import datetime, timezone
     import os
 
+    logger.info("Starting DimOS")
+
     from dimos.core.blueprints import autoconnect
     from dimos.core.run_registry import (
-        LOG_BASE_DIR,
         RunEntry,
         check_port_conflicts,
         cleanup_stale,
         generate_run_id,
     )
     from dimos.robot.get_all_blueprints import get_by_name
-    from dimos.utils.logging_config import setup_exception_handler
+    from dimos.utils.logging_config import set_run_log_dir, setup_exception_handler
 
     setup_exception_handler()
-    logger.info("Starting DimOS")
 
     cli_config_overrides: dict[str, Any] = ctx.obj
     global_config.update(**cli_config_overrides)
@@ -169,14 +169,13 @@ def run(
             coordinator.stop()
             raise typer.Exit(1)
 
-        n_workers = len(coordinator._client.workers) if coordinator._client else 0
-        n_modules = len(coordinator._deployed_modules)
+        n_workers = coordinator.n_workers
+        n_modules = coordinator.n_modules
         typer.echo(f"✓ All modules started ({n_modules} modules, {n_workers} workers)")
         typer.echo("✓ Health check passed")
         typer.echo("✓ DimOS running in background\n")
         typer.echo(f"  Run ID:    {run_id}")
         typer.echo(f"  Log:       {log_dir}")
-        typer.echo("  MCP:       http://localhost:9990/mcp\n")
         typer.echo("  Stop:      dimos stop")
         typer.echo("  Status:    dimos status")
 
@@ -213,74 +212,45 @@ def run(
 
 @main.command()
 def status() -> None:
-    """Show running DimOS instances."""
+    """Show the running DimOS instance."""
+    from datetime import datetime, timezone
 
-    from dimos.core.run_registry import list_runs
+    from dimos.core.run_registry import get_most_recent
 
-    entries = list_runs(alive_only=True)
-    if not entries:
-        typer.echo("No running DimOS instances")
+    entry = get_most_recent(alive_only=True)
+    if not entry:
+        typer.echo("No running DimOS instance")
         return
 
-    for entry in entries:
-        # Calculate uptime
-        try:
-            from datetime import datetime, timezone
-
-            started = datetime.fromisoformat(entry.started_at)
-            age = datetime.now(timezone.utc) - started
-            hours, remainder = divmod(int(age.total_seconds()), 3600)
-            minutes, seconds = divmod(remainder, 60)
-            if hours > 0:
-                uptime = f"{hours}h {minutes}m"
-            else:
-                uptime = f"{minutes}m {seconds}s"
-        except Exception:
-            uptime = "unknown"
+    try:
+        started = datetime.fromisoformat(entry.started_at)
+        age = datetime.now(timezone.utc) - started
+        hours, remainder = divmod(int(age.total_seconds()), 3600)
+        minutes, seconds = divmod(remainder, 60)
+        uptime = f"{hours}h {minutes}m" if hours > 0 else f"{minutes}m {seconds}s"
+    except Exception:
+        uptime = "unknown"
 
-        typer.echo(f"  Run ID:    {entry.run_id}")
-        typer.echo(f"  PID:       {entry.pid}")
-        typer.echo(f"  Blueprint: {entry.blueprint}")
-        typer.echo(f"  Uptime:    {uptime}")
-        typer.echo(f"  Log:       {entry.log_dir}")
-        typer.echo(f"  MCP:       http://localhost:{entry.mcp_port}/mcp")
-        if len(entries) > 1:
-            typer.echo("")
+    typer.echo(f"  Run ID:    {entry.run_id}")
+    typer.echo(f"  PID:       {entry.pid}")
+    typer.echo(f"  Blueprint: {entry.blueprint}")
+    typer.echo(f"  Uptime:    {uptime}")
+    typer.echo(f"  Log:       {entry.log_dir}")
 
 
 @main.command()
 def stop(
-    pid: int = typer.Option(None, "--pid", "-p", help="PID of instance to stop"),
-    all_instances: bool = typer.Option(False, "--all", "-a", help="Stop all instances"),
     force: bool = typer.Option(False, "--force", "-f", help="Force kill (SIGKILL)"),
 ) -> None:
-    """Stop a running DimOS instance."""
+    """Stop the running DimOS instance."""
 
-    from dimos.core.run_registry import RunEntry, get_most_recent, list_runs
+    from dimos.core.run_registry import get_most_recent
 
-    if all_instances:
-        entries = list_runs(alive_only=True)
-        if not entries:
-            typer.echo("No running DimOS instances")
-            return
-        for e in entries:
-            _stop_entry(e, force=force)
-        return
-
-    entry: RunEntry | None = None
-    if pid:
-        entries = list_runs(alive_only=True)
-        entry = next((e for e in entries if e.pid == pid), None)
-        if not entry:
-            typer.echo(f"Error: no running instance with PID {pid}", err=True)
-            raise typer.Exit(1)
-    else:
-        entry = get_most_recent(alive_only=True)
-        if not entry:
-            typer.echo("No running DimOS instances", err=True)
-            raise typer.Exit(1)
+    entry = get_most_recent(alive_only=True)
+    if not entry:
+        typer.echo("No running DimOS instance", err=True)
+        raise typer.Exit(1)
 
-    assert entry is not None
     _stop_entry(entry, force=force)
 
 
@@ -316,6 +286,11 @@ def _stop_entry(entry, force: bool = False) -> None:  # type: ignore[no-untyped-
                 os.kill(entry.pid, signal.SIGKILL)
             except ProcessLookupError:
                 pass
+            else:
+                for _ in range(20):  # up to 2s for SIGKILL to be processed
+                    if not is_pid_alive(entry.pid):
+                        break
+                    time.sleep(0.1)
 
     entry.remove()
     typer.echo("  Stopped.")
diff --git a/dimos/utils/logging_config.py b/dimos/utils/logging_config.py
index 100414ce6c..f159d13d43 100644
--- a/dimos/utils/logging_config.py
+++ b/dimos/utils/logging_config.py
@@ -59,11 +59,13 @@ def set_run_log_dir(log_dir: str | Path) -> None:
     # Migrate existing FileHandlers to the new path
     for logger_name in list(logging.Logger.manager.loggerDict):
         logger_obj = logging.getLogger(logger_name)
-        for handler in logger_obj.handlers:
+        for i, handler in enumerate(logger_obj.handlers):
             if isinstance(handler, logging.FileHandler) and handler.baseFilename != str(new_path):
                 handler.close()
-                handler.baseFilename = str(new_path)
-                handler.stream = handler._open()
+                new_handler = logging.FileHandler(new_path, mode="a", encoding="utf-8")
+                new_handler.setLevel(handler.level)
+                new_handler.setFormatter(handler.formatter)
+                logger_obj.handlers[i] = new_handler
 
 
 def get_run_log_dir() -> Path | None:

From 5d8b76b703adce2a10ff504b6c426c2360bca60e Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 20:06:56 +0000
Subject: [PATCH 14/37] fix: drop daemon.log, redirect all stdio to /dev/null
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

structlog FileHandler writes to main.jsonl — daemon.log only ever
captured signal shutdown messages. no useful content.
---
 dimos/core/daemon.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/dimos/core/daemon.py b/dimos/core/daemon.py
index b660a05ea6..685f2fe671 100644
--- a/dimos/core/daemon.py
+++ b/dimos/core/daemon.py
@@ -93,11 +93,11 @@ def daemonize(log_dir: Path) -> None:
     """Double-fork daemonize the current process.
 
     After this call the *caller* is the daemon grandchild.
-    stdout / stderr are redirected to files inside *log_dir*.
+    stdin/stdout/stderr are redirected to ``/dev/null`` — all real
+    logging goes through structlog's FileHandler to ``main.jsonl``.
     The two intermediate parents call ``os._exit(0)``.
     """
     log_dir.mkdir(parents=True, exist_ok=True)
-    stderr_path = log_dir / "daemon.log"
 
     # First fork — detach from terminal
     pid = os.fork()
@@ -111,20 +111,15 @@ def daemonize(log_dir: Path) -> None:
     if pid > 0:
         os._exit(0)
 
-    # Redirect file descriptors
+    # Redirect all stdio to /dev/null — structlog FileHandler is the log path
     sys.stdout.flush()
     sys.stderr.flush()
 
     devnull = open(os.devnull)
-    stderr_file = open(stderr_path, "a")
-
     os.dup2(devnull.fileno(), sys.stdin.fileno())
-    os.dup2(stderr_file.fileno(), sys.stdout.fileno())
-    os.dup2(stderr_file.fileno(), sys.stderr.fileno())
-
-    # Close original FDs — dup2'd copies keep the underlying files open
+    os.dup2(devnull.fileno(), sys.stdout.fileno())
+    os.dup2(devnull.fileno(), sys.stderr.fileno())
     devnull.close()
-    stderr_file.close()
 
 
 # ---------------------------------------------------------------------------

From 8ce1a9dc46a030c0054b8709f448a760e2f57ecd Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 20:09:27 +0000
Subject: [PATCH 15/37] fix: restore LOG_BASE_DIR import, remove duplicate
 set_run_log_dir import

fixes mypy name-defined error from import reorganization
---
 dimos/robot/cli/dimos.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index 5951d56bfa..cfd2bd9e80 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -115,6 +115,7 @@ def run(
 
     from dimos.core.blueprints import autoconnect
     from dimos.core.run_registry import (
+        LOG_BASE_DIR,
         RunEntry,
         check_port_conflicts,
         cleanup_stale,
@@ -149,8 +150,6 @@ def run(
 
     # Route structured logs (main.jsonl) to the per-run directory.
     # Workers inherit DIMOS_RUN_LOG_DIR env var via forkserver.
-    from dimos.utils.logging_config import set_run_log_dir
-
     set_run_log_dir(log_dir)
 
     blueprint = autoconnect(*map(get_by_name, robot_types))

From ec1a3f4cfc904e3feca0fc65f4fc8aef951cb969 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 20:18:41 +0000
Subject: [PATCH 16/37] fix: address remaining paul review comments

- simplify health_check to single alive check (build is synchronous)
- remove --health-timeout flag (no longer polling)
- add workers property to ModuleCoordinator (public API)
- separate try-excepts for kill/wait in sleeper fixture cleanup
- move repeated imports to top in test_per_run_logs
---
 dimos/core/daemon.py               | 44 +++-----------
 dimos/core/module_coordinator.py   | 11 +++-
 dimos/core/test_cli_stop_status.py |  3 +
 dimos/core/test_daemon.py          | 92 +++++++-----------------------
 dimos/core/test_e2e_daemon.py      |  6 +-
 dimos/core/test_per_run_logs.py    | 17 +-----
 dimos/robot/cli/dimos.py           |  5 +-
 7 files changed, 45 insertions(+), 133 deletions(-)

diff --git a/dimos/core/daemon.py b/dimos/core/daemon.py
index 685f2fe671..4bfcadd46b 100644
--- a/dimos/core/daemon.py
+++ b/dimos/core/daemon.py
@@ -19,7 +19,6 @@
 import os
 import signal
 import sys
-import time
 from typing import TYPE_CHECKING
 
 from dimos.utils.logging_config import setup_logger
@@ -36,49 +35,20 @@
 # Health check
 # ---------------------------------------------------------------------------
 
-_DEFAULT_HEALTH_TIMEOUT: float = 30.0
-_POLL_INTERVAL: float = 0.25
 
+def health_check(coordinator: ModuleCoordinator) -> bool:
+    """Verify all coordinator workers are alive after build.
 
-def health_check(
-    coordinator: ModuleCoordinator,
-    timeout: float = _DEFAULT_HEALTH_TIMEOUT,
-    poll_interval: float = _POLL_INTERVAL,
-) -> bool:
-    """Poll coordinator workers until *timeout*, return True if all stay alive.
-
-    The check fails immediately when any worker's ``pid`` becomes ``None``
-    (meaning the underlying process exited).  It also fails if there are
-    zero workers (nothing to monitor).
+    Since ``blueprint.build()`` is synchronous, every module should be
+    started by the time this runs.  We just confirm no worker has died.
     """
-    client = coordinator._client
-    if client is None:
-        logger.error("health_check: coordinator has no WorkerManager")
-        return False
-
-    workers = client.workers
-    if not workers:
+    if coordinator.n_workers == 0:
         logger.error("health_check: no workers found")
         return False
 
-    deadline = time.monotonic() + timeout
-    while time.monotonic() < deadline:
-        for w in workers:
-            if w.pid is None:
-                logger.error(
-                    "health_check: worker died",
-                    worker_id=w.worker_id,
-                )
-                return False
-        time.sleep(poll_interval)
-
-    # Final check after timeout elapsed
-    for w in workers:
+    for w in coordinator.workers:
         if w.pid is None:
-            logger.error(
-                "health_check: worker died at deadline",
-                worker_id=w.worker_id,
-            )
+            logger.error("health_check: worker died", worker_id=w.worker_id)
             return False
 
     return True
diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py
index 99bad74422..97a62bdf70 100644
--- a/dimos/core/module_coordinator.py
+++ b/dimos/core/module_coordinator.py
@@ -49,12 +49,17 @@ def __init__(
         self._global_config = cfg
         self._deployed_modules = {}
 
+    @property
+    def workers(self) -> list:
+        """Active worker processes."""
+        if self._client is None:
+            return []
+        return self._client.workers
+
     @property
     def n_workers(self) -> int:
         """Number of active workers."""
-        if self._client is None:
-            return 0
-        return len(self._client.workers)
+        return len(self.workers)
 
     @property
     def n_modules(self) -> int:
diff --git a/dimos/core/test_cli_stop_status.py b/dimos/core/test_cli_stop_status.py
index 84303be509..76e2001fd9 100644
--- a/dimos/core/test_cli_stop_status.py
+++ b/dimos/core/test_cli_stop_status.py
@@ -58,6 +58,9 @@ def _make():
     for p in procs:
         try:
             p.kill()
+        except OSError:
+            pass  # already dead
+        try:
             p.wait(timeout=2)
         except Exception:
             pass
diff --git a/dimos/core/test_daemon.py b/dimos/core/test_daemon.py
index 54cf4f206d..8ea634b6a4 100644
--- a/dimos/core/test_daemon.py
+++ b/dimos/core/test_daemon.py
@@ -21,7 +21,6 @@
 import re
 import signal
 import sys
-import time
 from unittest import mock
 
 import pytest
@@ -175,99 +174,48 @@ def test_port_conflict_no_false_positive(self, tmp_registry: Path):
 
 
 def _mock_worker(pid: int | None = 1234, worker_id: int = 0):
-    """Create a mock Worker with a controllable pid property."""
+    """Create a mock Worker with a controllable pid."""
     w = mock.MagicMock()
-    w.pid = pid
     w.worker_id = worker_id
+    w.pid = pid
     return w
 
 
 def _mock_coordinator(workers: list | None = None):
-    """Create a mock ModuleCoordinator with mock WorkerManager."""
+    """Create a mock ModuleCoordinator with controllable workers/n_workers."""
     coord = mock.MagicMock()
     if workers is not None:
-        coord._client.workers = workers
+        coord.workers = workers
+        coord.n_workers = len(workers)
     else:
-        coord._client = None
+        coord.workers = []
+        coord.n_workers = 0
     return coord
 
 
-class TestHealthCheckAllHealthy:
-    """test_health_check_all_healthy — mock workers with alive PIDs → passes."""
+class TestHealthCheck:
+    """health_check verifies all workers are alive after synchronous build."""
 
-    def test_health_check_all_healthy(self):
+    def test_all_healthy(self):
         workers = [_mock_worker(pid=os.getpid(), worker_id=i) for i in range(3)]
         coord = _mock_coordinator(workers)
+        assert health_check(coord) is True
 
-        result = health_check(coord, timeout=0.5, poll_interval=0.1)
-        assert result is True
-
-
-class TestHealthCheckImmediateDeath:
-    """test_health_check_immediate_death — worker with pid=None → fails immediately."""
-
-    def test_health_check_immediate_death(self):
-        dead_worker = _mock_worker(pid=None, worker_id=0)
-        coord = _mock_coordinator([dead_worker])
-
-        start = time.monotonic()
-        result = health_check(coord, timeout=5.0, poll_interval=0.1)
-        elapsed = time.monotonic() - start
-
-        assert result is False
-        # Should fail almost immediately, not wait for the full timeout
-        assert elapsed < 1.0
-
-
-class TestHealthCheckDelayedDeath:
-    """test_health_check_delayed_death — worker alive initially, then pid → None."""
-
-    def test_health_check_delayed_death(self):
-        worker = _mock_worker(pid=os.getpid(), worker_id=0)
+    def test_dead_worker(self):
+        dead = _mock_worker(pid=None, worker_id=0)
+        coord = _mock_coordinator([dead])
+        assert health_check(coord) is False
 
-        # After a few accesses, pid becomes None (simulating delayed crash)
-        call_count = 0
-        real_pid = os.getpid()
-
-        def pid_getter():
-            nonlocal call_count
-            call_count += 1
-            if call_count > 3:
-                return None
-            return real_pid
-
-        type(worker).pid = mock.PropertyMock(side_effect=pid_getter)
-        coord = _mock_coordinator([worker])
-
-        result = health_check(coord, timeout=5.0, poll_interval=0.05)
-        assert result is False
-
-
-class TestHealthCheckNoWorkers:
-    """test_health_check_no_workers — empty worker list → fails."""
-
-    def test_health_check_no_workers(self):
+    def test_no_workers(self):
         coord = _mock_coordinator(workers=[])
-        result = health_check(coord, timeout=0.5, poll_interval=0.1)
-        assert result is False
-
-    def test_health_check_no_client(self):
-        coord = _mock_coordinator(workers=None)  # _client = None
-        result = health_check(coord, timeout=0.5, poll_interval=0.1)
-        assert result is False
-
+        assert health_check(coord) is False
 
-class TestHealthCheckPartialDeath:
-    """test_health_check_partial_death — 3 workers, 1 dies → fails."""
-
-    def test_health_check_partial_death(self):
+    def test_partial_death(self):
         w1 = _mock_worker(pid=os.getpid(), worker_id=0)
         w2 = _mock_worker(pid=os.getpid(), worker_id=1)
-        w3 = _mock_worker(pid=None, worker_id=2)  # dead
-
+        w3 = _mock_worker(pid=None, worker_id=2)
         coord = _mock_coordinator([w1, w2, w3])
-        result = health_check(coord, timeout=0.5, poll_interval=0.1)
-        assert result is False
+        assert health_check(coord) is False
 
 
 # ---------------------------------------------------------------------------
diff --git a/dimos/core/test_e2e_daemon.py b/dimos/core/test_e2e_daemon.py
index 31ebdc15b7..1e6bbde533 100644
--- a/dimos/core/test_e2e_daemon.py
+++ b/dimos/core/test_e2e_daemon.py
@@ -122,7 +122,7 @@ def test_single_worker_lifecycle(self):
         assert len(workers) == 1
         assert len(coord._deployed_modules) == 2  # PingModule + PongModule
 
-        ok = health_check(coord, timeout=2)
+        ok = health_check(coord)
         assert ok, "Health check should pass with healthy worker"
 
         entry = _make_entry(coord)
@@ -147,7 +147,7 @@ def test_multiple_workers(self):
         for w in workers:
             assert w.pid is not None, f"Worker {w.worker_id} has no PID"
 
-        ok = health_check(coord, timeout=2)
+        ok = health_check(coord)
         assert ok, "Health check should pass with 2 healthy workers"
 
         coord.stop()
@@ -164,7 +164,7 @@ def test_health_check_detects_dead_worker(self):
         os.kill(worker_pid, signal.SIGKILL)
         time.sleep(0.5)  # let it die
 
-        ok = health_check(coord, timeout=2)
+        ok = health_check(coord)
         assert not ok, "Health check should FAIL after worker killed"
 
         coord.stop()
diff --git a/dimos/core/test_per_run_logs.py b/dimos/core/test_per_run_logs.py
index 2f4a54da63..ec48ef73a5 100644
--- a/dimos/core/test_per_run_logs.py
+++ b/dimos/core/test_per_run_logs.py
@@ -20,12 +20,13 @@
 
 import pytest
 
+from dimos.utils import logging_config
+from dimos.utils.logging_config import _get_log_file_path, get_run_log_dir, set_run_log_dir
+
 
 @pytest.fixture(autouse=True)
 def _clean_env(monkeypatch):
     """Remove DIMOS_RUN_LOG_DIR from env and reset module globals between tests."""
-    from dimos.utils import logging_config
-
     monkeypatch.delenv("DIMOS_RUN_LOG_DIR", raising=False)
     monkeypatch.setattr(logging_config, "_RUN_LOG_DIR", None)
     monkeypatch.setattr(logging_config, "_LOG_FILE_PATH", None)
@@ -35,22 +36,16 @@ class TestSetRunLogDir:
     """set_run_log_dir() configures per-run logging."""
 
     def test_creates_directory(self, tmp_path):
-        from dimos.utils.logging_config import set_run_log_dir
-
         log_dir = tmp_path / "run-001"
         set_run_log_dir(log_dir)
         assert log_dir.is_dir()
 
     def test_sets_env_var(self, tmp_path):
-        from dimos.utils.logging_config import set_run_log_dir
-
         log_dir = tmp_path / "run-002"
         set_run_log_dir(log_dir)
         assert os.environ["DIMOS_RUN_LOG_DIR"] == str(log_dir)
 
     def test_get_run_log_dir_returns_path(self, tmp_path):
-        from dimos.utils.logging_config import get_run_log_dir, set_run_log_dir
-
         log_dir = tmp_path / "run-003"
         set_run_log_dir(log_dir)
         assert get_run_log_dir() == log_dir
@@ -60,16 +55,12 @@ class TestLogFilePathRouting:
     """_get_log_file_path() routes to per-run directory when set."""
 
     def test_routes_to_run_dir(self, tmp_path):
-        from dimos.utils.logging_config import _get_log_file_path, set_run_log_dir
-
         log_dir = tmp_path / "run-004"
         set_run_log_dir(log_dir)
         path = _get_log_file_path()
         assert path == log_dir / "main.jsonl"
 
     def test_routes_via_env_var(self, tmp_path, monkeypatch):
-        from dimos.utils import logging_config
-
         monkeypatch.setattr(logging_config, "_RUN_LOG_DIR", None)
         monkeypatch.setattr(logging_config, "_LOG_FILE_PATH", None)
 
@@ -81,8 +72,6 @@ def test_routes_via_env_var(self, tmp_path, monkeypatch):
         assert env_dir.is_dir()
 
     def test_falls_back_to_legacy(self, tmp_path, monkeypatch):
-        from dimos.utils import logging_config
-
         monkeypatch.setattr(logging_config, "_RUN_LOG_DIR", None)
         monkeypatch.setattr(logging_config, "_LOG_FILE_PATH", None)
         monkeypatch.delenv("DIMOS_RUN_LOG_DIR", raising=False)
diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index cfd2bd9e80..ca7b18f44e 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -103,9 +103,6 @@ def run(
     ctx: typer.Context,
     robot_types: list[str] = typer.Argument(..., help="Blueprints or modules to run"),
     daemon: bool = typer.Option(False, "--daemon", "-d", help="Run in background"),
-    health_timeout: float = typer.Option(
-        30.0, "--health-timeout", help="Seconds to monitor workers before daemonizing"
-    ),
 ) -> None:
     """Start a robot blueprint"""
     from datetime import datetime, timezone
@@ -163,7 +160,7 @@ def run(
         )
 
         # Health check before daemonizing — catch early crashes
-        if not health_check(coordinator, timeout=health_timeout):
+        if not health_check(coordinator):
             typer.echo("Error: health check failed — a worker process died.", err=True)
             coordinator.stop()
             raise typer.Exit(1)

From c33bd9b0bcbc5e7ea00939775e28fa5e8045206f Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 21:30:01 +0000
Subject: [PATCH 17/37] fix: address all remaining paul review comments

- convert all test helpers to fixtures with cleanup
- replace os.environ['CI']='1' at import time with monkeypatch fixture
- replace all time.sleep() with polling loops in tests
- mark slow stop tests @pytest.mark.slow
- move stop_entry logic from CLI to run_registry
- remove # type: ignore, properly type _stop_entry
- remove duplicate port conflict test
- remove redundant monkeypatch cleanup in test_per_run_logs
- list(glob()) to avoid mutation during iteration in cleanup_stale
- XDG_STATE_HOME compliant _get_state_dir() for registry/log paths
- remove redundant cli_config_overrides in e2e builds
- move duplicate imports to module level in e2e tests
---
 dimos/core/module_coordinator.py   |   3 +-
 dimos/core/run_registry.py         |  54 ++++-
 dimos/core/test_cli_stop_status.py |  41 ++--
 dimos/core/test_daemon.py          |  23 +--
 dimos/core/test_e2e_daemon.py      | 304 ++++++++++++-----------------
 dimos/core/test_per_run_logs.py    |  11 +-
 dimos/robot/cli/dimos.py           |  40 +---
 7 files changed, 216 insertions(+), 260 deletions(-)

diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py
index 97a62bdf70..d97fa8d3c8 100644
--- a/dimos/core/module_coordinator.py
+++ b/dimos/core/module_coordinator.py
@@ -27,6 +27,7 @@
     from dimos.core.module import Module, ModuleT
     from dimos.core.resource_monitor.monitor import StatsMonitor
     from dimos.core.rpc_client import ModuleProxy
+    from dimos.core.worker import Worker
 
 logger = setup_logger()
 
@@ -50,7 +51,7 @@ def __init__(
         self._deployed_modules = {}
 
     @property
-    def workers(self) -> list:
+    def workers(self) -> list[Worker]:
         """Active worker processes."""
         if self._client is None:
             return []
diff --git a/dimos/core/run_registry.py b/dimos/core/run_registry.py
index e9e4f63a39..9f8e7f3358 100644
--- a/dimos/core/run_registry.py
+++ b/dimos/core/run_registry.py
@@ -27,8 +27,17 @@
 
 logger = setup_logger()
 
-REGISTRY_DIR = Path.home() / ".local" / "state" / "dimos" / "runs"
-LOG_BASE_DIR = Path.home() / ".local" / "state" / "dimos" / "logs"
+
+def _get_state_dir() -> Path:
+    """XDG_STATE_HOME compliant state directory for dimos."""
+    xdg = os.environ.get("XDG_STATE_HOME")
+    if xdg:
+        return Path(xdg) / "dimos"
+    return Path.home() / ".local" / "state" / "dimos"
+
+
+REGISTRY_DIR = _get_state_dir() / "runs"
+LOG_BASE_DIR = _get_state_dir() / "logs"
 
 
 @dataclass
@@ -107,7 +116,7 @@ def cleanup_stale() -> int:
     """Remove registry entries for dead processes. Returns count removed."""
     REGISTRY_DIR.mkdir(parents=True, exist_ok=True)
     removed = 0
-    for f in REGISTRY_DIR.glob("*.json"):
+    for f in list(REGISTRY_DIR.glob("*.json")):
         try:
             entry = RunEntry.load(f)
             if not is_pid_alive(entry.pid):
@@ -131,3 +140,42 @@ def get_most_recent(alive_only: bool = True) -> RunEntry | None:
     """Return the most recently created run entry, or None."""
     runs = list_runs(alive_only=alive_only)
     return runs[-1] if runs else None
+
+
+import signal
+
+
+def stop_entry(entry: RunEntry, force: bool = False) -> tuple[str, bool]:
+    """Stop a DimOS instance by registry entry.
+
+    Returns (message, success) for the CLI to display.
+    """
+    sig = signal.SIGKILL if force else signal.SIGTERM
+    sig_name = "SIGKILL" if force else "SIGTERM"
+
+    try:
+        os.kill(entry.pid, sig)
+    except ProcessLookupError:
+        entry.remove()
+        return ("Process already dead, cleaning registry", True)
+
+    if not force:
+        for _ in range(50):  # 5 seconds
+            if not is_pid_alive(entry.pid):
+                break
+            time.sleep(0.1)
+        else:
+            try:
+                os.kill(entry.pid, signal.SIGKILL)
+            except ProcessLookupError:
+                pass
+            else:
+                for _ in range(20):
+                    if not is_pid_alive(entry.pid):
+                        break
+                    time.sleep(0.1)
+            entry.remove()
+            return (f"Escalated to SIGKILL after {sig_name} timeout", True)
+
+    entry.remove()
+    return (f"Stopped with {sig_name}", True)
diff --git a/dimos/core/test_cli_stop_status.py b/dimos/core/test_cli_stop_status.py
index 76e2001fd9..c04d8d2499 100644
--- a/dimos/core/test_cli_stop_status.py
+++ b/dimos/core/test_cli_stop_status.py
@@ -12,13 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Tests for `dimos status` and `dimos stop` CLI commands.
-
-Exercises the actual typer CLI surface using CliRunner with real
-subprocess PIDs, not mocks. Verifies output formatting, flag handling,
-and registry lifecycle.
-"""
-
 from __future__ import annotations
 
 from datetime import datetime, timedelta, timezone
@@ -59,7 +52,7 @@ def _make():
         try:
             p.kill()
         except OSError:
-            pass  # already dead
+            pass
         try:
             p.wait(timeout=2)
         except Exception:
@@ -118,6 +111,13 @@ def test_status_shows_uptime_hours(self, sleeper):
         result = CliRunner().invoke(main, ["status"])
         assert "3h 22m" in result.output
 
+    def test_status_shows_log_dir(self, sleeper):
+        proc = sleeper()
+        _entry("log-dir-test", proc.pid, log_dir="/tmp/custom-logs")
+
+        result = CliRunner().invoke(main, ["status"])
+        assert "/tmp/custom-logs" in result.output
+
     def test_status_shows_blueprint(self, sleeper):
         proc = sleeper()
         _entry("bp-test", proc.pid, blueprint="unitree-g1")
@@ -126,7 +126,7 @@ def test_status_shows_blueprint(self, sleeper):
         assert "unitree-g1" in result.output
 
     def test_status_filters_dead_pids(self):
-        _entry("dead-one", pid=2_000_000_000)  # definitely not alive
+        _entry("dead-one", pid=2_000_000_000)
 
         result = CliRunner().invoke(main, ["status"])
         assert "No running" in result.output
@@ -144,6 +144,7 @@ def test_stop_no_instances(self):
         result = CliRunner().invoke(main, ["stop"])
         assert result.exit_code == 1
 
+    @pytest.mark.slow
     def test_stop_default_most_recent(self, sleeper):
         proc = sleeper()
         entry = _entry("stop-default", proc.pid)
@@ -152,9 +153,14 @@ def test_stop_default_most_recent(self, sleeper):
         assert result.exit_code == 0
         assert "Stopping" in result.output
         assert "stop-default" in result.output
-        time.sleep(0.3)
+        # Poll for registry cleanup
+        for _ in range(30):
+            if not entry.registry_path.exists():
+                break
+            time.sleep(0.1)
         assert not entry.registry_path.exists()
 
+    @pytest.mark.slow
     def test_stop_force_sends_sigkill(self, sleeper):
         proc = sleeper()
         _entry("force-kill", proc.pid)
@@ -162,10 +168,13 @@ def test_stop_force_sends_sigkill(self, sleeper):
         result = CliRunner().invoke(main, ["stop", "--force"])
         assert result.exit_code == 0
         assert "SIGKILL" in result.output
-        time.sleep(0.3)
-        # Process should be dead
+        for _ in range(30):
+            if proc.poll() is not None:
+                break
+            time.sleep(0.1)
         assert proc.poll() is not None
 
+    @pytest.mark.slow
     def test_stop_sigterm_kills_process(self, sleeper):
         """Verify SIGTERM actually terminates the target process."""
         proc = sleeper()
@@ -173,6 +182,8 @@ def test_stop_sigterm_kills_process(self, sleeper):
 
         result = CliRunner().invoke(main, ["stop"])
         assert "SIGTERM" in result.output
-        # sleep handles SIGTERM by dying — wait for it
-        time.sleep(6)
-        assert proc.poll() is not None
+        for _ in range(100):  # up to 10s
+            if proc.poll() is not None:
+                break
+            time.sleep(0.1)
+        assert proc.poll() is not None, "Process should be dead after SIGTERM"
diff --git a/dimos/core/test_daemon.py b/dimos/core/test_daemon.py
index 8ea634b6a4..a3f85f8908 100644
--- a/dimos/core/test_daemon.py
+++ b/dimos/core/test_daemon.py
@@ -151,13 +151,6 @@ def test_port_conflict_detection(self, tmp_registry: Path):
         assert conflict is not None
         assert conflict.run_id == entry.run_id
 
-    def test_port_conflict_grpc_only(self, tmp_registry: Path):
-        entry = _make_entry(pid=os.getpid(), grpc_port=9877)
-        entry.save()
-
-        conflict = check_port_conflicts(grpc_port=9877)
-        assert conflict is not None
-
     def test_port_conflict_no_false_positive(self, tmp_registry: Path):
         entry = _make_entry(pid=os.getpid(), grpc_port=8001)
         entry.save()
@@ -369,9 +362,9 @@ def mock_kill(pid, sig):
 
         # Import the stop helper
         sys.path.insert(0, str(Path(__file__).parent.parent))
-        from dimos.robot.cli.dimos import _stop_entry
+        from dimos.core.run_registry import stop_entry
 
-        _stop_entry(entry, force=False)
+        stop_entry(entry, force=False)
 
         assert 12345 in killed_pids
         import signal
@@ -404,9 +397,9 @@ def mock_kill(pid, sig):
         )
         entry.save()
 
-        from dimos.robot.cli.dimos import _stop_entry
+        from dimos.core.run_registry import stop_entry
 
-        _stop_entry(entry, force=True)
+        stop_entry(entry, force=True)
 
         import signal
 
@@ -435,9 +428,9 @@ def mock_kill(pid, sig):
         entry.save()
         assert entry.registry_path.exists()
 
-        from dimos.robot.cli.dimos import _stop_entry
+        from dimos.core.run_registry import stop_entry
 
-        _stop_entry(entry, force=False)
+        stop_entry(entry, force=False)
         assert not entry.registry_path.exists()
 
     def test_stop_escalates_to_sigkill_after_timeout(self, tmp_path, monkeypatch):
@@ -470,9 +463,9 @@ def mock_kill(pid, sig):
         )
         entry.save()
 
-        from dimos.robot.cli.dimos import _stop_entry
+        from dimos.core.run_registry import stop_entry
 
-        _stop_entry(entry, force=False)
+        stop_entry(entry, force=False)
 
         import signal
 
diff --git a/dimos/core/test_e2e_daemon.py b/dimos/core/test_e2e_daemon.py
index 1e6bbde533..bb5897b766 100644
--- a/dimos/core/test_e2e_daemon.py
+++ b/dimos/core/test_e2e_daemon.py
@@ -12,26 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""End-to-end tests for daemon lifecycle using a lightweight PingPong blueprint.
-
-These tests exercise real forkserver workers and module deployment — no mocks.
-They use a minimal 2-module blueprint (PingModule + PongModule) that needs
-no hardware, no LFS data, and no replay files.
-"""
-
 from __future__ import annotations
 
+from datetime import datetime, timezone
 import json
 import os
 import signal
 import time
 
 import pytest
-
-# Skip sysctl interactive prompt in CI / headless
-os.environ["CI"] = "1"
-
-from datetime import datetime, timezone
+from typer.testing import CliRunner
 
 from dimos.core.blueprints import autoconnect
 from dimos.core.daemon import health_check
@@ -44,6 +34,7 @@
     list_runs,
 )
 from dimos.core.stream import Out
+from dimos.robot.cli.dimos import main
 
 # ---------------------------------------------------------------------------
 # Lightweight test modules
@@ -65,22 +56,51 @@ def start(self):
 
 
 # ---------------------------------------------------------------------------
-# Helpers
+# Fixtures
 # ---------------------------------------------------------------------------
 
 
-def _build(n_workers: int = 1):
-    """Build a 2-module blueprint with *n_workers* forkserver workers."""
-    global_config.update(viewer_backend="none", n_workers=n_workers)
+@pytest.fixture(autouse=True)
+def _ci_env(monkeypatch):
+    """Set CI=1 to skip sysctl interactive prompt — scoped per test, not module."""
+    monkeypatch.setenv("CI", "1")
+
+
+@pytest.fixture(autouse=True)
+def _clean_registry(tmp_path, monkeypatch):
+    """Redirect registry to a temp dir for test isolation."""
+    import dimos.core.run_registry as _reg
+
+    test_dir = tmp_path / "runs"
+    test_dir.mkdir()
+    monkeypatch.setattr(_reg, "REGISTRY_DIR", test_dir)
+    yield test_dir
+
+
+@pytest.fixture()
+def coordinator():
+    """Build a PingPong blueprint (1 worker) and yield the coordinator."""
+    global_config.update(viewer_backend="none", n_workers=1)
     bp = autoconnect(PingModule.blueprint(), PongModule.blueprint())
-    coord = bp.build(cli_config_overrides={"viewer_backend": "none", "n_workers": n_workers})
-    return coord
+    coord = bp.build()
+    yield coord
+    coord.stop()
 
 
-def _make_entry(coord, run_id: str | None = None) -> RunEntry:
-    """Create and save a registry entry for the current process."""
-    if run_id is None:
-        run_id = f"test-{datetime.now(timezone.utc).strftime('%H%M%S%f')}"
+@pytest.fixture()
+def coordinator_2w():
+    """Build a PingPong blueprint with 2 workers."""
+    global_config.update(viewer_backend="none", n_workers=2)
+    bp = autoconnect(PingModule.blueprint(), PongModule.blueprint())
+    coord = bp.build()
+    yield coord
+    coord.stop()
+
+
+@pytest.fixture()
+def registry_entry():
+    """Create and save a registry entry. Removes on teardown."""
+    run_id = f"test-{datetime.now(timezone.utc).strftime('%H%M%S%f')}"
     entry = RunEntry(
         run_id=run_id,
         pid=os.getpid(),
@@ -91,18 +111,8 @@ def _make_entry(coord, run_id: str | None = None) -> RunEntry:
         config_overrides={"n_workers": 1},
     )
     entry.save()
-    return entry
-
-
-@pytest.fixture(autouse=True)
-def _clean_registry(tmp_path, monkeypatch):
-    """Redirect registry to a temp dir for test isolation."""
-    import dimos.core.run_registry as _reg
-
-    test_dir = tmp_path / "runs"
-    test_dir.mkdir()
-    monkeypatch.setattr(_reg, "REGISTRY_DIR", test_dir)
-    yield test_dir
+    yield entry
+    entry.remove()
 
 
 # ---------------------------------------------------------------------------
@@ -114,65 +124,47 @@ def _clean_registry(tmp_path, monkeypatch):
 class TestDaemonE2E:
     """End-to-end daemon lifecycle with real workers."""
 
-    def test_single_worker_lifecycle(self):
-        """Build → health check → registry → status → stop (1 worker)."""
-        coord = _build(n_workers=1)
-
-        workers = coord._client.workers
-        assert len(workers) == 1
-        assert len(coord._deployed_modules) == 2  # PingModule + PongModule
+    def test_single_worker_lifecycle(self, coordinator, registry_entry):
+        """Build -> health check -> registry -> status (1 worker)."""
+        assert len(coordinator.workers) == 1
+        assert coordinator.n_modules == 2
 
-        ok = health_check(coord)
-        assert ok, "Health check should pass with healthy worker"
+        assert health_check(coordinator), "Health check should pass"
 
-        entry = _make_entry(coord)
         runs = list_runs(alive_only=True)
         assert len(runs) == 1
-        assert runs[0].run_id == entry.run_id
+        assert runs[0].run_id == registry_entry.run_id
 
         latest = get_most_recent(alive_only=True)
         assert latest is not None
-        assert latest.run_id == entry.run_id
+        assert latest.run_id == registry_entry.run_id
 
-        coord.stop()
-        entry.remove()
-        assert len(list_runs(alive_only=True)) == 0
-
-    def test_multiple_workers(self):
-        """Build with 2 workers — both should be alive after health check."""
-        coord = _build(n_workers=2)
-
-        workers = coord._client.workers
-        assert len(workers) == 2
-        for w in workers:
+    def test_multiple_workers(self, coordinator_2w):
+        """Build with 2 workers — both should be alive."""
+        assert len(coordinator_2w.workers) == 2
+        for w in coordinator_2w.workers:
             assert w.pid is not None, f"Worker {w.worker_id} has no PID"
 
-        ok = health_check(coord)
-        assert ok, "Health check should pass with 2 healthy workers"
+        assert health_check(coordinator_2w), "Health check should pass"
 
-        coord.stop()
-
-    def test_health_check_detects_dead_worker(self):
-        """Kill a worker process — health check should detect and fail."""
-        coord = _build(n_workers=1)
-
-        worker = coord._client.workers[0]
+    def test_health_check_detects_dead_worker(self, coordinator):
+        """Kill a worker process — health check should fail."""
+        worker = coordinator.workers[0]
         worker_pid = worker.pid
         assert worker_pid is not None
 
-        # Kill the worker
         os.kill(worker_pid, signal.SIGKILL)
-        time.sleep(0.5)  # let it die
-
-        ok = health_check(coord)
-        assert not ok, "Health check should FAIL after worker killed"
+        for _ in range(50):
+            try:
+                os.kill(worker_pid, 0)
+            except ProcessLookupError:
+                break
+            time.sleep(0.1)
 
-        coord.stop()
+        assert not health_check(coordinator), "Health check should FAIL"
 
-    def test_registry_entry_details(self):
+    def test_registry_entry_details(self, coordinator):
         """Verify all fields are correctly persisted in the JSON registry."""
-        coord = _build(n_workers=1)
-
         run_id = "detail-test-001"
         entry = RunEntry(
             run_id=run_id,
@@ -185,7 +177,6 @@ def test_registry_entry_details(self):
         )
         entry.save()
 
-        # Read raw JSON
         raw = json.loads(entry.registry_path.read_text())
         assert raw["run_id"] == run_id
         assert raw["pid"] == os.getpid()
@@ -195,25 +186,19 @@ def test_registry_entry_details(self):
         assert raw["cli_args"] == ["--replay", "ping-pong"]
         assert raw["config_overrides"] == {"n_workers": 1, "viewer_backend": "none"}
 
-        # Roundtrip through list_runs
         runs = list_runs()
         assert len(runs) == 1
         loaded = runs[0]
         assert loaded.run_id == run_id
-        assert loaded.blueprint == "ping-pong-detail"
         assert loaded.cli_args == ["--replay", "ping-pong"]
 
-        coord.stop()
         entry.remove()
 
-    def test_stale_cleanup(self):
+    def test_stale_cleanup(self, coordinator, registry_entry):
         """Stale entries (dead PIDs) should be removed by cleanup_stale."""
-        coord = _build(n_workers=1)
-
-        # Create an entry with a bogus PID that doesn't exist
         stale = RunEntry(
             run_id="stale-dead-pid",
-            pid=99999999,  # definitely not running
+            pid=99999999,
             blueprint="ghost",
             started_at=datetime.now(timezone.utc).isoformat(),
             log_dir="/tmp/ghost",
@@ -222,21 +207,14 @@ def test_stale_cleanup(self):
         )
         stale.save()
 
-        # Also create a live entry
-        live = _make_entry(coord)
-
-        # alive_only=False returns ALL entries (no auto-clean)
         assert len(list_runs(alive_only=False)) == 2
 
         removed = cleanup_stale()
-        assert removed == 1  # only the dead PID entry removed
+        assert removed == 1
 
         remaining = list_runs(alive_only=False)
         assert len(remaining) == 1
-        assert remaining[0].run_id == live.run_id
-
-        coord.stop()
-        live.remove()
+        assert remaining[0].run_id == registry_entry.run_id
 
 
 # ---------------------------------------------------------------------------
@@ -244,100 +222,66 @@ def test_stale_cleanup(self):
 # ---------------------------------------------------------------------------
 
 
+@pytest.fixture()
+def live_blueprint():
+    """Build PingPong and register. Yields (coord, entry). Cleans up on teardown."""
+    global_config.update(viewer_backend="none", n_workers=1)
+    bp = autoconnect(PingModule.blueprint(), PongModule.blueprint())
+    coord = bp.build()
+    run_id = f"e2e-cli-{datetime.now(timezone.utc).strftime('%H%M%S%f')}"
+    entry = RunEntry(
+        run_id=run_id,
+        pid=os.getpid(),
+        blueprint="ping-pong",
+        started_at=datetime.now(timezone.utc).isoformat(),
+        log_dir="/tmp/dimos-e2e-cli",
+        cli_args=["ping-pong"],
+        config_overrides={"n_workers": 1},
+    )
+    entry.save()
+    yield coord, entry
+    coord.stop()
+    entry.remove()
+
+
 @pytest.mark.slow
 class TestCLIWithRealBlueprint:
-    """Exercise `dimos status` and `dimos stop` against a live DimOS blueprint."""
+    """Exercise dimos status and dimos stop against a live DimOS blueprint."""
 
-    def _build_and_register(self, run_id: str = "e2e-cli-test"):
-        """Build PingPong blueprint and register it in the run registry."""
-        global_config.update(viewer_backend="none", n_workers=1)
-        bp = autoconnect(PingModule.blueprint(), PongModule.blueprint())
-        coord = bp.build(cli_config_overrides={"viewer_backend": "none", "n_workers": 1})
+    def test_status_shows_live_blueprint(self, live_blueprint):
+        _coord, entry = live_blueprint
+        result = CliRunner().invoke(main, ["status"])
 
-        entry = RunEntry(
-            run_id=run_id,
-            pid=os.getpid(),
-            blueprint="ping-pong",
-            started_at=datetime.now(timezone.utc).isoformat(),
-            log_dir="/tmp/dimos-e2e-cli",
-            cli_args=["ping-pong"],
-            config_overrides={"n_workers": 1},
-        )
-        entry.save()
-        return coord, entry
-
-    def test_status_shows_live_blueprint(self):
-        """Status should display a running blueprint's details."""
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
-        coord, entry = self._build_and_register("e2e-status-live")
-        try:
-            runner = CliRunner()
-            result = runner.invoke(main, ["status"])
-
-            assert result.exit_code == 0
-            assert "e2e-status-live" in result.output
-            assert "ping-pong" in result.output
-            assert str(os.getpid()) in result.output
-        finally:
-            coord.stop()
-            entry.remove()
-
-    def test_status_shows_worker_count_via_registry(self):
-        """Verify the registry entry is findable after real build."""
-        coord, entry = self._build_and_register("e2e-worker-check")
-        try:
-            # Verify workers are actually alive
-            workers = coord._client.workers
-            assert len(workers) >= 1
-            for w in workers:
-                assert w.pid is not None
-
-            # Verify registry entry
-            runs = list_runs(alive_only=True)
-            matching = [r for r in runs if r.run_id == "e2e-worker-check"]
-            assert len(matching) == 1
-        finally:
-            coord.stop()
-            entry.remove()
-
-    def test_stop_kills_real_workers(self):
-        """dimos stop should kill the process and clean up registry."""
-        from typer.testing import CliRunner
-
-        from dimos.robot.cli.dimos import main
-
-        coord, entry = self._build_and_register("e2e-stop-real")
-
-        # Verify workers are alive before stop
-        worker_pids = [w.pid for w in coord._client.workers if w.pid]
-        assert len(worker_pids) >= 1
+        assert result.exit_code == 0
+        assert entry.run_id in result.output
+        assert "ping-pong" in result.output
+        assert str(os.getpid()) in result.output
 
-        runner = CliRunner()
+    def test_status_shows_worker_count_via_registry(self, live_blueprint):
+        coord, entry = live_blueprint
 
-        # Status should show it
-        result = runner.invoke(main, ["status"])
-        assert "e2e-stop-real" in result.output
+        assert len(coord.workers) >= 1
+        for w in coord.workers:
+            assert w.pid is not None
+
+        runs = list_runs(alive_only=True)
+        matching = [r for r in runs if r.run_id == entry.run_id]
+        assert len(matching) == 1
+
+    def test_stop_kills_real_workers(self, live_blueprint):
+        coord, _entry = live_blueprint
+
+        worker_pids = [w.pid for w in coord.workers if w.pid]
+        assert len(worker_pids) >= 1
 
-        # Stop it — note: this sends SIGTERM to our own process (os.getpid()),
-        # which would kill the test. Instead, test that stop with --pid on a
-        # worker PID works, and then clean up the coordinator manually.
-        # The real stop flow is: SIGTERM → signal handler → coord.stop() → registry remove
-        # We test the signal handler separately in test_daemon.py.
-        # Here we verify the registry + coordinator stop flow.
         coord.stop()
-        time.sleep(0.5)
 
-        # Workers should be dead after coord.stop()
         for wpid in worker_pids:
-            try:
-                os.kill(wpid, 0)  # check if alive
-                alive = True
-            except ProcessLookupError:
-                alive = False
-            assert not alive, f"Worker PID {wpid} still alive after coord.stop()"
-
-        entry.remove()
-        assert len(list_runs(alive_only=True)) == 0
+            for _ in range(50):
+                try:
+                    os.kill(wpid, 0)
+                except ProcessLookupError:
+                    break
+                time.sleep(0.1)
+            else:
+                pytest.fail(f"Worker PID {wpid} still alive after coord.stop()")
diff --git a/dimos/core/test_per_run_logs.py b/dimos/core/test_per_run_logs.py
index ec48ef73a5..73d669b335 100644
--- a/dimos/core/test_per_run_logs.py
+++ b/dimos/core/test_per_run_logs.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Tests for DIM-685: per-run log directory routing."""
-
 from __future__ import annotations
 
 import os
@@ -61,9 +59,6 @@ def test_routes_to_run_dir(self, tmp_path):
         assert path == log_dir / "main.jsonl"
 
     def test_routes_via_env_var(self, tmp_path, monkeypatch):
-        monkeypatch.setattr(logging_config, "_RUN_LOG_DIR", None)
-        monkeypatch.setattr(logging_config, "_LOG_FILE_PATH", None)
-
         env_dir = tmp_path / "env-run"
         monkeypatch.setenv("DIMOS_RUN_LOG_DIR", str(env_dir))
 
@@ -71,11 +66,7 @@ def test_routes_via_env_var(self, tmp_path, monkeypatch):
         assert path == env_dir / "main.jsonl"
         assert env_dir.is_dir()
 
-    def test_falls_back_to_legacy(self, tmp_path, monkeypatch):
-        monkeypatch.setattr(logging_config, "_RUN_LOG_DIR", None)
-        monkeypatch.setattr(logging_config, "_LOG_FILE_PATH", None)
-        monkeypatch.delenv("DIMOS_RUN_LOG_DIR", raising=False)
-
+    def test_falls_back_to_legacy(self):
         path = logging_config._get_log_file_path()
         assert path.name.startswith("dimos_")
         assert path.suffix == ".jsonl"
diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index ca7b18f44e..83be2efbf8 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -250,46 +250,14 @@ def stop(
     _stop_entry(entry, force=force)
 
 
-def _stop_entry(entry, force: bool = False) -> None:  # type: ignore[no-untyped-def]
+def _stop_entry(entry: "RunEntry", force: bool = False) -> None:
     """Stop a single DimOS instance by registry entry."""
-    import os
-    import signal
-    import time
-
-    from dimos.core.run_registry import is_pid_alive
+    from dimos.core.run_registry import stop_entry
 
-    sig = signal.SIGKILL if force else signal.SIGTERM
     sig_name = "SIGKILL" if force else "SIGTERM"
-
     typer.echo(f"Stopping {entry.run_id} (PID {entry.pid}) with {sig_name}...")
-
-    try:
-        os.kill(entry.pid, sig)
-    except ProcessLookupError:
-        typer.echo("  Process already dead, cleaning registry")
-        entry.remove()
-        return
-
-    # Wait for graceful shutdown
-    if not force:
-        for _ in range(50):  # 5 seconds
-            if not is_pid_alive(entry.pid):
-                break
-            time.sleep(0.1)
-        else:
-            typer.echo("  Still alive after 5s, sending SIGKILL...")
-            try:
-                os.kill(entry.pid, signal.SIGKILL)
-            except ProcessLookupError:
-                pass
-            else:
-                for _ in range(20):  # up to 2s for SIGKILL to be processed
-                    if not is_pid_alive(entry.pid):
-                        break
-                    time.sleep(0.1)
-
-    entry.remove()
-    typer.echo("  Stopped.")
+    msg, _ok = stop_entry(entry, force=force)
+    typer.echo(f"  {msg}")
 
 
 @main.command()

From 6f7c4c5f1e9062d15c4c11bacc3f8230fa24b0c9 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 21:32:28 +0000
Subject: [PATCH 18/37] fix: remove module docstring from test_daemon.py

---
 dimos/core/test_daemon.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dimos/core/test_daemon.py b/dimos/core/test_daemon.py
index a3f85f8908..dde2f97e42 100644
--- a/dimos/core/test_daemon.py
+++ b/dimos/core/test_daemon.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Tests for DIM-681: daemon mode, run registry, and health checks."""
 
 from __future__ import annotations
 

From 32f26618a0a38395039b28db59a06260c354286d Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 22:16:38 +0000
Subject: [PATCH 19/37] feat: MCP server enhancements, dimos mcp CLI, agent
 context, stress tests (DIM-686, DIM-687)

MCP server:
- new dimos/status and dimos/list_modules JSON-RPC methods

CLI:
- dimos mcp list-tools/call/status/modules commands
- uses stdlib urllib (no requests dependency)

agent context (DIM-687):
- generate_context() writes context.md on dimos run

stress tests (23 tests):
- MCP lifecycle, tools, error handling, rapid calls, restart cycles
- all tests use fixtures with cleanup, poll loops (no sleep)

Closes DIM-686, Closes DIM-687
---
 dimos/agents/mcp/mcp_server.py            |  29 ++
 dimos/core/agent_context.py               | 107 ++++++
 dimos/core/test_e2e_mcp_stress.py         | 416 ++++++++++++++++++++++
 dimos/core/tests/stress_test_blueprint.py |  29 ++
 dimos/core/tests/stress_test_module.py    |  59 +++
 dimos/robot/cli/dimos.py                  | 107 +++++-
 6 files changed, 746 insertions(+), 1 deletion(-)
 create mode 100644 dimos/core/agent_context.py
 create mode 100644 dimos/core/test_e2e_mcp_stress.py
 create mode 100644 dimos/core/tests/stress_test_blueprint.py
 create mode 100644 dimos/core/tests/stress_test_module.py

diff --git a/dimos/agents/mcp/mcp_server.py b/dimos/agents/mcp/mcp_server.py
index 87c27302db..1e0d6bd657 100644
--- a/dimos/agents/mcp/mcp_server.py
+++ b/dimos/agents/mcp/mcp_server.py
@@ -114,6 +114,31 @@ async def _handle_tools_call(
     return _jsonrpc_result_text(req_id, str(result))
 
 
+def _handle_dimos_status(req_id: Any) -> dict[str, Any]:
+    """Return running modules, skills, and server info."""
+    import os
+
+    skills = app.state.skills
+    return _jsonrpc_result(
+        req_id,
+        {
+            "pid": os.getpid(),
+            "modules": list({s.class_name for s in skills}),
+            "skills": [s.func_name for s in skills],
+            "skill_count": len(skills),
+        },
+    )
+
+
+def _handle_dimos_list_modules(req_id: Any) -> dict[str, Any]:
+    """Return module names and their skills."""
+    skills = app.state.skills
+    modules: dict[str, list[str]] = {}
+    for s in skills:
+        modules.setdefault(s.class_name, []).append(s.func_name)
+    return _jsonrpc_result(req_id, {"modules": modules})
+
+
 async def handle_request(
     request: dict[str, Any],
     skills: list[SkillInfo],
@@ -138,6 +163,10 @@ async def handle_request(
         return _handle_tools_list(req_id, skills)
     if method == "tools/call":
         return await _handle_tools_call(req_id, params, rpc_calls)
+    if method == "dimos/status":
+        return _handle_dimos_status(req_id)
+    if method == "dimos/list_modules":
+        return _handle_dimos_list_modules(req_id)
     return _jsonrpc_error(req_id, -32601, f"Unknown: {method}")
 
 
diff --git a/dimos/core/agent_context.py b/dimos/core/agent_context.py
new file mode 100644
index 0000000000..76763ce5aa
--- /dev/null
+++ b/dimos/core/agent_context.py
@@ -0,0 +1,107 @@
+# Copyright 2025-2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Generate agent context file for a running DimOS instance (DIM-687).
+
+Writes a markdown file with module diagram, available skills, CLI commands,
+and MCP endpoint info — everything an AI agent needs to interact with the
+running system.
+"""
+
+from __future__ import annotations
+
+import os
+from typing import TYPE_CHECKING
+
+from dimos.utils.logging_config import setup_logger
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from dimos.core.module_coordinator import ModuleCoordinator
+
+logger = setup_logger()
+
+
+def generate_context(
+    coordinator: ModuleCoordinator,
+    blueprint_name: str,
+    run_id: str,
+    log_dir: Path,
+    mcp_port: int = 9990,
+) -> Path:
+    """Generate a context.md file for agents in the log directory.
+
+    Returns the path to the generated file.
+    """
+    log_dir.mkdir(parents=True, exist_ok=True)
+    context_path = log_dir / "context.md"
+
+    lines = [
+        f"# DimOS Agent Context — {blueprint_name}",
+        "",
+        f"**Run ID:** {run_id}",
+        f"**PID:** {os.getpid()}",
+        f"**Blueprint:** {blueprint_name}",
+        "",
+        "## MCP Endpoint",
+        "",
+        f"- URL: `http://localhost:{mcp_port}/mcp`",
+        "- Protocol: JSON-RPC 2.0",
+        "- Methods: `initialize`, `tools/list`, `tools/call`, `dimos/status`, `dimos/list_modules`",
+        "",
+        "## CLI Commands",
+        "",
+        "```",
+        "dimos status              # show running instance",
+        "dimos stop                # stop the instance",
+        "dimos stop --force        # force kill (SIGKILL)",
+        "dimos restart             # restart with same args",
+        "dimos mcp list-tools      # list available MCP tools",
+        "dimos mcp call <tool>     # call a tool",
+        "dimos mcp status          # module/skill info via MCP",
+        "dimos mcp modules         # module-skill mapping",
+        "```",
+        "",
+    ]
+
+    # Module info
+    lines.append("## Deployed Modules")
+    lines.append("")
+    for w in coordinator.workers:
+        for name in w.module_names:
+            lines.append(f"- {name} (worker {w.worker_id}, pid {w.pid})")
+    lines.append("")
+
+    # Skills info (via MCP)
+    lines.append("## Available Skills (MCP Tools)")
+    lines.append("")
+    lines.append("Use `dimos mcp list-tools` for full schema, or call via:")
+    lines.append("```")
+    lines.append('curl -s localhost:9990/mcp -d \'{"jsonrpc":"2.0","id":1,"method":"tools/list"}\'')
+    lines.append("```")
+    lines.append("")
+
+    # Log info
+    lines.append("## Logs")
+    lines.append("")
+    lines.append(f"- Directory: `{log_dir}`")
+    lines.append(f"- Structured log: `{log_dir}/main.jsonl`")
+    lines.append("- Format: structlog JSON, one line per event")
+    lines.append('- Filter by module: `grep \'"logger":"module_name"\' main.jsonl`')
+    lines.append("")
+
+    context_path.write_text("\n".join(lines))
+    logger.info("Agent context written", path=str(context_path))
+    return context_path
diff --git a/dimos/core/test_e2e_mcp_stress.py b/dimos/core/test_e2e_mcp_stress.py
new file mode 100644
index 0000000000..4445290228
--- /dev/null
+++ b/dimos/core/test_e2e_mcp_stress.py
@@ -0,0 +1,416 @@
+# Copyright 2025-2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+import json
+import os
+import time
+
+import pytest
+import requests
+from typer.testing import CliRunner
+
+from dimos.agents.mcp.mcp_server import McpServer
+from dimos.core.blueprints import autoconnect
+from dimos.core.daemon import health_check
+from dimos.core.global_config import global_config
+from dimos.core.run_registry import (
+    RunEntry,
+    cleanup_stale,
+    list_runs,
+)
+from dimos.core.tests.stress_test_module import StressTestModule
+from dimos.robot.cli.dimos import main
+
+MCP_PORT = 9990  # non-default to avoid conflicts
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(autouse=True)
+def _ci_env(monkeypatch):
+    monkeypatch.setenv("CI", "1")
+
+
+@pytest.fixture(autouse=True)
+def _clean_registry(tmp_path, monkeypatch):
+    import dimos.core.run_registry as _reg
+
+    test_dir = tmp_path / "runs"
+    test_dir.mkdir()
+    monkeypatch.setattr(_reg, "REGISTRY_DIR", test_dir)
+    yield test_dir
+
+
+@pytest.fixture()
+def mcp_blueprint():
+    """Build a StressTestModule + McpServer blueprint. Stops on teardown."""
+    global_config.update(viewer_backend="none", n_workers=1)
+    bp = autoconnect(
+        StressTestModule.blueprint(),
+        McpServer.blueprint(),
+    )
+    coord = bp.build()
+    yield coord
+    coord.stop()
+
+
+@pytest.fixture()
+def mcp_entry(mcp_blueprint):
+    """Create registry entry for the running blueprint."""
+    entry = RunEntry(
+        run_id=f"stress-{datetime.now(timezone.utc).strftime('%H%M%S%f')}",
+        pid=os.getpid(),
+        blueprint="stress-test",
+        started_at=datetime.now(timezone.utc).isoformat(),
+        log_dir="/tmp/dimos-stress-test",
+        cli_args=["stress-test"],
+        config_overrides={"n_workers": 1},
+    )
+    entry.save()
+    yield entry
+    entry.remove()
+
+
+def _mcp_call(method: str, params: dict | None = None, port: int = MCP_PORT) -> dict:
+    """Send a JSON-RPC request to MCP server."""
+    payload: dict = {"jsonrpc": "2.0", "id": 1, "method": method}
+    if params:
+        payload["params"] = params
+    resp = requests.post(f"http://localhost:{port}/mcp", json=payload, timeout=10)
+    resp.raise_for_status()
+    return resp.json()
+
+
+def _wait_for_mcp(port: int = MCP_PORT, timeout: float = 10.0) -> bool:
+    """Poll until MCP server responds or timeout."""
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        try:
+            resp = requests.post(
+                f"http://localhost:{port}/mcp",
+                json={"jsonrpc": "2.0", "id": 1, "method": "initialize"},
+                timeout=2,
+            )
+            if resp.status_code == 200:
+                return True
+        except requests.ConnectionError:
+            pass
+        time.sleep(0.5)
+    return False
+
+
+def _wait_for_mcp_down(port: int = MCP_PORT, timeout: float = 10.0) -> bool:
+    """Poll until MCP server stops responding."""
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        try:
+            requests.post(
+                f"http://localhost:{port}/mcp",
+                json={"jsonrpc": "2.0", "id": 1, "method": "initialize"},
+                timeout=1,
+            )
+        except (requests.ConnectionError, requests.ReadTimeout):
+            return True
+        time.sleep(0.5)
+    return False
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.slow
+class TestMCPLifecycle:
+    """MCP server lifecycle: start → respond → stop → dead."""
+
+    def test_mcp_responds_after_build(self, mcp_blueprint):
+        """After blueprint.build(), MCP should accept requests."""
+        assert _wait_for_mcp(), "MCP server did not start"
+
+        result = _mcp_call("initialize")
+        assert result["result"]["serverInfo"]["name"] == "dimensional"
+
+    def test_tools_list_includes_stress_skills(self, mcp_blueprint):
+        """tools/list should include echo, ping, slow, info from StressTestModule."""
+        assert _wait_for_mcp()
+
+        result = _mcp_call("tools/list")
+        tool_names = {t["name"] for t in result["result"]["tools"]}
+        assert "echo" in tool_names
+        assert "ping" in tool_names
+        assert "slow" in tool_names
+        assert "info" in tool_names
+
+    def test_echo_tool(self, mcp_blueprint):
+        """Call echo tool — should return the message."""
+        assert _wait_for_mcp()
+
+        result = _mcp_call(
+            "tools/call",
+            {
+                "name": "echo",
+                "arguments": {"message": "hello from stress test"},
+            },
+        )
+        text = result["result"]["content"][0]["text"]
+        assert text == "hello from stress test"
+
+    def test_ping_tool(self, mcp_blueprint):
+        """Call ping tool — should return 'pong'."""
+        assert _wait_for_mcp()
+
+        result = _mcp_call("tools/call", {"name": "ping", "arguments": {}})
+        text = result["result"]["content"][0]["text"]
+        assert text == "pong"
+
+    def test_info_tool_returns_pid(self, mcp_blueprint):
+        """info tool should return process info."""
+        assert _wait_for_mcp()
+
+        result = _mcp_call("tools/call", {"name": "info", "arguments": {}})
+        text = result["result"]["content"][0]["text"]
+        assert "pid=" in text
+
+    def test_dimos_status_method(self, mcp_blueprint):
+        """dimos/status should return module and skill info."""
+        assert _wait_for_mcp()
+
+        result = _mcp_call("dimos/status")
+        data = result["result"]
+        assert "pid" in data
+        assert "modules" in data
+        assert "skills" in data
+        assert "StressTestModule" in data["modules"]
+
+    def test_dimos_list_modules_method(self, mcp_blueprint):
+        """dimos/list_modules should group skills by module."""
+        assert _wait_for_mcp()
+
+        result = _mcp_call("dimos/list_modules")
+        modules = result["result"]["modules"]
+        assert "StressTestModule" in modules
+        assert "echo" in modules["StressTestModule"]
+
+    def test_mcp_dead_after_stop(self, mcp_blueprint):
+        """After coordinator.stop(), MCP should stop responding."""
+        assert _wait_for_mcp()
+
+        mcp_blueprint.stop()
+        assert _wait_for_mcp_down(), "MCP server did not stop"
+
+
+@pytest.mark.slow
+class TestDaemonMCPRecovery:
+    """Test MCP recovery after daemon crashes and restarts."""
+
+    def test_restart_after_clean_stop(self):
+        """Stop then start again — MCP should come back."""
+        global_config.update(viewer_backend="none", n_workers=1)
+
+        # First run
+        bp1 = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
+        coord1 = bp1.build()
+        assert _wait_for_mcp(), "First MCP start failed"
+        coord1.stop()
+        assert _wait_for_mcp_down(), "MCP didn't stop"
+
+        # Second run — should work without port conflicts
+        bp2 = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
+        coord2 = bp2.build()
+        assert _wait_for_mcp(), "Second MCP start failed (port conflict?)"
+        coord2.stop()
+
+    def test_registry_cleanup_after_stop(self, mcp_blueprint, mcp_entry):
+        """Registry entry should be removable after stop."""
+        assert health_check(mcp_blueprint)
+
+        runs = list_runs(alive_only=True)
+        assert len(runs) == 1
+
+        mcp_blueprint.stop()
+        mcp_entry.remove()
+
+        runs = list_runs(alive_only=True)
+        assert len(runs) == 0
+
+    def test_stale_cleanup_after_crash(self):
+        """Stale entries from crashed processes should be cleaned up."""
+        stale = RunEntry(
+            run_id="crashed-mcp-test",
+            pid=99999999,
+            blueprint="stress-test",
+            started_at=datetime.now(timezone.utc).isoformat(),
+            log_dir="/tmp/ghost",
+            cli_args=[],
+            config_overrides={},
+        )
+        stale.save()
+        assert len(list_runs(alive_only=False)) == 1
+
+        removed = cleanup_stale()
+        assert removed == 1
+        assert len(list_runs(alive_only=False)) == 0
+
+
+@pytest.mark.slow
+class TestMCPCLICommands:
+    """Test dimos mcp CLI commands against real MCP server."""
+
+    def test_cli_list_tools(self, mcp_blueprint):
+        """dimos mcp list-tools should output JSON with tools."""
+        assert _wait_for_mcp()
+
+        result = CliRunner().invoke(main, ["mcp", "list-tools"])
+        assert result.exit_code == 0
+        tools = json.loads(result.output)
+        tool_names = {t["name"] for t in tools}
+        assert "echo" in tool_names
+        assert "ping" in tool_names
+
+    def test_cli_call_echo(self, mcp_blueprint):
+        """dimos mcp call echo --arg message=hello should return hello."""
+        assert _wait_for_mcp()
+
+        result = CliRunner().invoke(main, ["mcp", "call", "echo", "--arg", "message=hello"])
+        assert result.exit_code == 0
+        assert "hello" in result.output
+
+    def test_cli_mcp_status(self, mcp_blueprint):
+        """dimos mcp status should return JSON with modules."""
+        assert _wait_for_mcp()
+
+        result = CliRunner().invoke(main, ["mcp", "status"])
+        assert result.exit_code == 0
+        data = json.loads(result.output)
+        assert "StressTestModule" in data["modules"]
+
+    def test_cli_mcp_modules(self, mcp_blueprint):
+        """dimos mcp modules should show module-skill mapping."""
+        assert _wait_for_mcp()
+
+        result = CliRunner().invoke(main, ["mcp", "modules"])
+        assert result.exit_code == 0
+        data = json.loads(result.output)
+        assert "StressTestModule" in data["modules"]
+        assert "echo" in data["modules"]["StressTestModule"]
+
+    def test_cli_no_server_error(self):
+        """dimos mcp list-tools with no server should exit with error."""
+        result = CliRunner().invoke(main, ["mcp", "list-tools"])
+        assert result.exit_code == 1
+        assert "no running" in result.output.lower() or "error" in result.output.lower()
+
+
+@pytest.mark.slow
+class TestMCPErrorHandling:
+    """Test MCP error handling and edge cases."""
+
+    def test_call_nonexistent_tool(self, mcp_blueprint):
+        """Calling a tool that doesn't exist should return an error message."""
+        assert _wait_for_mcp()
+
+        result = _mcp_call(
+            "tools/call",
+            {
+                "name": "nonexistent_tool_xyz",
+                "arguments": {},
+            },
+        )
+        text = result["result"]["content"][0]["text"]
+        assert "not found" in text.lower()
+
+    def test_call_tool_wrong_args(self, mcp_blueprint):
+        """Calling a tool with wrong argument types should still return."""
+        assert _wait_for_mcp()
+
+        # echo expects string, send nothing — should still work or error gracefully
+        result = _mcp_call(
+            "tools/call",
+            {
+                "name": "echo",
+                "arguments": {},
+            },
+        )
+        # Should not crash — either returns error or empty result
+        assert "result" in result or "error" in result
+
+    def test_unknown_jsonrpc_method(self, mcp_blueprint):
+        """Unknown JSON-RPC method should return error."""
+        assert _wait_for_mcp()
+
+        result = _mcp_call("nonexistent/method")
+        assert "error" in result
+        assert result["error"]["code"] == -32601
+
+    def test_mcp_handles_malformed_json(self, mcp_blueprint):
+        """MCP should handle malformed JSON gracefully."""
+        assert _wait_for_mcp()
+
+        resp = requests.post(
+            f"http://localhost:{MCP_PORT}/mcp",
+            data=b"not json{{{",
+            headers={"Content-Type": "application/json"},
+            timeout=5,
+        )
+        assert resp.status_code == 400
+
+    def test_rapid_tool_calls(self, mcp_blueprint):
+        """Fire 20 rapid echo calls — all should succeed."""
+        assert _wait_for_mcp()
+
+        for i in range(20):
+            result = _mcp_call(
+                "tools/call",
+                {
+                    "name": "echo",
+                    "arguments": {"message": f"rapid-{i}"},
+                },
+            )
+            text = result["result"]["content"][0]["text"]
+            assert text == f"rapid-{i}", f"Mismatch on call {i}"
+
+    def test_cli_call_tool_wrong_arg_format(self, mcp_blueprint):
+        """dimos mcp call with bad --arg format should error."""
+        assert _wait_for_mcp()
+
+        result = CliRunner().invoke(main, ["mcp", "call", "echo", "--arg", "no_equals_sign"])
+        assert result.exit_code == 1
+        assert "key=value" in result.output
+
+
+@pytest.mark.slow
+class TestMCPRapidRestart:
+    """Test rapid stop/start cycles."""
+
+    def test_three_restart_cycles(self):
+        """Start → stop → start 3 times — no port conflicts."""
+        global_config.update(viewer_backend="none", n_workers=1)
+
+        for cycle in range(3):
+            bp = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
+            coord = bp.build()
+            assert _wait_for_mcp(timeout=15), f"MCP failed to start on cycle {cycle}"
+
+            result = _mcp_call("tools/call", {"name": "ping", "arguments": {}})
+            assert result["result"]["content"][0]["text"] == "pong"
+
+            coord.stop()
+            assert _wait_for_mcp_down(timeout=10), f"MCP failed to stop on cycle {cycle}"
diff --git a/dimos/core/tests/stress_test_blueprint.py b/dimos/core/tests/stress_test_blueprint.py
new file mode 100644
index 0000000000..5def4946d6
--- /dev/null
+++ b/dimos/core/tests/stress_test_blueprint.py
@@ -0,0 +1,29 @@
+# Copyright 2025-2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Stress test blueprint: StressTestModule + McpServer.
+
+Lightweight, no hardware. Used for e2e daemon/MCP lifecycle testing.
+"""
+
+from dimos.agents.mcp.mcp_server import McpServer
+from dimos.core.blueprints import autoconnect
+from dimos.core.tests.stress_test_module import StressTestModule
+
+stress_test = autoconnect(
+    StressTestModule.blueprint(),
+    McpServer.blueprint(),
+)
+
+__all__ = ["stress_test"]
diff --git a/dimos/core/tests/stress_test_module.py b/dimos/core/tests/stress_test_module.py
new file mode 100644
index 0000000000..da48c5a2ca
--- /dev/null
+++ b/dimos/core/tests/stress_test_module.py
@@ -0,0 +1,59 @@
+# Copyright 2025-2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Lightweight test module with MCP-callable skills for stress testing.
+
+Used by e2e tests to verify MCP lifecycle, agent CLI, and daemon robustness.
+No hardware, no LFS, no replay files needed.
+"""
+
+from __future__ import annotations
+
+import time
+
+from dimos.agents.annotation import skill
+from dimos.core.module import Module
+from dimos.core.stream import Out
+
+
+class StressTestModule(Module):
+    """Minimal module exposing test skills via MCP."""
+
+    heartbeat: Out[str]
+
+    @skill
+    def echo(self, message: str) -> str:
+        """Echo back the given message. Used for latency and connectivity tests."""
+        return message
+
+    @skill
+    def ping(self) -> str:
+        """Simple health check. Returns 'pong'."""
+        return "pong"
+
+    @skill
+    def slow(self, seconds: float = 1.0) -> str:
+        """Sleep for the given duration then return. Used for timeout tests."""
+        time.sleep(seconds)
+        return f"slept {seconds}s"
+
+    @skill
+    def info(self) -> str:
+        """Return module runtime info."""
+        import os
+
+        return f"pid={os.getpid()}, module={self.__class__.__name__}"
+
+    def start(self):
+        super().start()
diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index 83be2efbf8..452d69f156 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -187,6 +187,11 @@ def run(
             config_overrides=cli_config_overrides,
         )
         entry.save()
+
+        from dimos.core.agent_context import generate_context
+
+        generate_context(coordinator, blueprint_name, run_id, log_dir)
+
         install_signal_handlers(entry, coordinator)
         coordinator.loop()
     else:
@@ -200,6 +205,11 @@ def run(
             config_overrides=cli_config_overrides,
         )
         entry.save()
+
+        from dimos.core.agent_context import generate_context
+
+        generate_context(coordinator, blueprint_name, run_id, log_dir)
+
         try:
             coordinator.loop()
         finally:
@@ -250,7 +260,7 @@ def stop(
     _stop_entry(entry, force=force)
 
 
-def _stop_entry(entry: "RunEntry", force: bool = False) -> None:
+def _stop_entry(entry: Any, force: bool = False) -> None:
     """Stop a single DimOS instance by registry entry."""
     from dimos.core.run_registry import stop_entry
 
@@ -260,6 +270,101 @@ def _stop_entry(entry: "RunEntry", force: bool = False) -> None:
     typer.echo(f"  {msg}")
 
 
+# ---------------------------------------------------------------------------
+# MCP subcommands
+# ---------------------------------------------------------------------------
+
+mcp_app = typer.Typer(help="Interact with the running MCP server")
+main.add_typer(mcp_app, name="mcp")
+
+
+def _mcp_call(
+    method: str, params: dict[str, Any] | None = None, port: int = 9990
+) -> dict[str, Any]:
+    """Send a JSON-RPC request to the running MCP server."""
+    import json as _json
+    import urllib.error
+    import urllib.request
+
+    payload: dict[str, Any] = {"jsonrpc": "2.0", "id": 1, "method": method}
+    if params:
+        payload["params"] = params
+
+    data = _json.dumps(payload).encode()
+    req = urllib.request.Request(
+        f"http://localhost:{port}/mcp",
+        data=data,
+        headers={"Content-Type": "application/json"},
+    )
+
+    try:
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            result: dict[str, Any] = _json.loads(resp.read())
+            return result
+    except urllib.error.URLError:
+        typer.echo("Error: no running MCP server (is DimOS running?)", err=True)
+        raise typer.Exit(1)
+    except Exception as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1)
+
+
+@mcp_app.command("list-tools")
+def mcp_list_tools() -> None:
+    """List available MCP tools (skills)."""
+    import json
+
+    result = _mcp_call("tools/list")
+    tools = result.get("result", {}).get("tools", [])
+    typer.echo(json.dumps(tools, indent=2))
+
+
+@mcp_app.command("call")
+def mcp_call_tool(
+    tool_name: str = typer.Argument(..., help="Tool name to call"),
+    args: list[str] = typer.Option([], "--arg", "-a", help="Arguments as key=value"),
+) -> None:
+    """Call an MCP tool by name."""
+    import json
+
+    arguments = {}
+    for arg in args:
+        if "=" not in arg:
+            typer.echo(f"Error: argument must be key=value, got: {arg}", err=True)
+            raise typer.Exit(1)
+        key, value = arg.split("=", 1)
+        # Try to parse as JSON for non-string types
+        try:
+            arguments[key] = json.loads(value)
+        except (json.JSONDecodeError, ValueError):
+            arguments[key] = value
+
+    result = _mcp_call("tools/call", {"name": tool_name, "arguments": arguments})
+    content = result.get("result", {}).get("content", [])
+    for item in content:
+        typer.echo(item.get("text", str(item)))
+
+
+@mcp_app.command("status")
+def mcp_status() -> None:
+    """Show MCP server status (modules, skills)."""
+    import json
+
+    result = _mcp_call("dimos/status")
+    data = result.get("result", {})
+    typer.echo(json.dumps(data, indent=2))
+
+
+@mcp_app.command("modules")
+def mcp_modules() -> None:
+    """List deployed modules and their skills."""
+    import json
+
+    result = _mcp_call("dimos/list_modules")
+    data = result.get("result", {})
+    typer.echo(json.dumps(data, indent=2))
+
+
 @main.command()
 def show_config(ctx: typer.Context) -> None:
     """Show current config settings and their values."""

From 9caa3fbb3b3c5eb938d5b4a39fcce0c270c7e079 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 22:24:16 +0000
Subject: [PATCH 20/37] fix: address greptile review on PR #1451

- remove dimos restart from agent context (not in this branch yet)
- handle JSON-RPC errors in dimos mcp call (show error, exit 1)
- pass skills as parameter to dimos/status and dimos/list_modules handlers
- fix hardcoded port in curl example (use mcp_port parameter)
- fix double stop() in test_mcp_dead_after_stop (standalone coordinator)
- use tmp_path for log_dir in mcp_entry fixture (test isolation)
---
 dimos/agents/mcp/mcp_server.py    | 10 ++++------
 dimos/core/agent_context.py       |  5 +++--
 dimos/core/test_e2e_mcp_stress.py | 14 +++++++++-----
 dimos/robot/cli/dimos.py          |  6 ++++++
 4 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/dimos/agents/mcp/mcp_server.py b/dimos/agents/mcp/mcp_server.py
index 1e0d6bd657..e57c432a81 100644
--- a/dimos/agents/mcp/mcp_server.py
+++ b/dimos/agents/mcp/mcp_server.py
@@ -114,11 +114,10 @@ async def _handle_tools_call(
     return _jsonrpc_result_text(req_id, str(result))
 
 
-def _handle_dimos_status(req_id: Any) -> dict[str, Any]:
+def _handle_dimos_status(req_id: Any, skills: list[SkillInfo]) -> dict[str, Any]:
     """Return running modules, skills, and server info."""
     import os
 
-    skills = app.state.skills
     return _jsonrpc_result(
         req_id,
         {
@@ -130,9 +129,8 @@ def _handle_dimos_status(req_id: Any) -> dict[str, Any]:
     )
 
 
-def _handle_dimos_list_modules(req_id: Any) -> dict[str, Any]:
+def _handle_dimos_list_modules(req_id: Any, skills: list[SkillInfo]) -> dict[str, Any]:
     """Return module names and their skills."""
-    skills = app.state.skills
     modules: dict[str, list[str]] = {}
     for s in skills:
         modules.setdefault(s.class_name, []).append(s.func_name)
@@ -164,9 +162,9 @@ async def handle_request(
     if method == "tools/call":
         return await _handle_tools_call(req_id, params, rpc_calls)
     if method == "dimos/status":
-        return _handle_dimos_status(req_id)
+        return _handle_dimos_status(req_id, skills)
     if method == "dimos/list_modules":
-        return _handle_dimos_list_modules(req_id)
+        return _handle_dimos_list_modules(req_id, skills)
     return _jsonrpc_error(req_id, -32601, f"Unknown: {method}")
 
 
diff --git a/dimos/core/agent_context.py b/dimos/core/agent_context.py
index 76763ce5aa..db9748c82e 100644
--- a/dimos/core/agent_context.py
+++ b/dimos/core/agent_context.py
@@ -67,7 +67,6 @@ def generate_context(
         "dimos status              # show running instance",
         "dimos stop                # stop the instance",
         "dimos stop --force        # force kill (SIGKILL)",
-        "dimos restart             # restart with same args",
         "dimos mcp list-tools      # list available MCP tools",
         "dimos mcp call <tool>     # call a tool",
         "dimos mcp status          # module/skill info via MCP",
@@ -89,7 +88,9 @@ def generate_context(
     lines.append("")
     lines.append("Use `dimos mcp list-tools` for full schema, or call via:")
     lines.append("```")
-    lines.append('curl -s localhost:9990/mcp -d \'{"jsonrpc":"2.0","id":1,"method":"tools/list"}\'')
+    lines.append(
+        f'curl -s localhost:{mcp_port}/mcp -d \'{"jsonrpc":"2.0","id":1,"method":"tools/list"}\''
+    )
     lines.append("```")
     lines.append("")
 
diff --git a/dimos/core/test_e2e_mcp_stress.py b/dimos/core/test_e2e_mcp_stress.py
index 4445290228..8c34038e16 100644
--- a/dimos/core/test_e2e_mcp_stress.py
+++ b/dimos/core/test_e2e_mcp_stress.py
@@ -72,14 +72,15 @@ def mcp_blueprint():
 
 
 @pytest.fixture()
-def mcp_entry(mcp_blueprint):
+def mcp_entry(mcp_blueprint, tmp_path):
     """Create registry entry for the running blueprint."""
+    log_dir = tmp_path / "stress-logs"
     entry = RunEntry(
         run_id=f"stress-{datetime.now(timezone.utc).strftime('%H%M%S%f')}",
         pid=os.getpid(),
         blueprint="stress-test",
         started_at=datetime.now(timezone.utc).isoformat(),
-        log_dir="/tmp/dimos-stress-test",
+        log_dir=str(log_dir),
         cli_args=["stress-test"],
         config_overrides={"n_workers": 1},
     )
@@ -209,11 +210,14 @@ def test_dimos_list_modules_method(self, mcp_blueprint):
         assert "StressTestModule" in modules
         assert "echo" in modules["StressTestModule"]
 
-    def test_mcp_dead_after_stop(self, mcp_blueprint):
+    def test_mcp_dead_after_stop(self):
         """After coordinator.stop(), MCP should stop responding."""
-        assert _wait_for_mcp()
+        global_config.update(viewer_backend="none", n_workers=1)
+        bp = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
+        coord = bp.build()
+        assert _wait_for_mcp(), "MCP server did not start"
 
-        mcp_blueprint.stop()
+        coord.stop()
         assert _wait_for_mcp_down(), "MCP server did not stop"
 
 
diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index 452d69f156..93bf12a179 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -340,7 +340,13 @@ def mcp_call_tool(
             arguments[key] = value
 
     result = _mcp_call("tools/call", {"name": tool_name, "arguments": arguments})
+    if "error" in result:
+        typer.echo(f"Error: {result['error'].get('message', 'unknown error')}", err=True)
+        raise typer.Exit(1)
     content = result.get("result", {}).get("content", [])
+    if not content:
+        typer.echo("No output from tool", err=True)
+        raise typer.Exit(1)
     for item in content:
         typer.echo(item.get("text", str(item)))
 

From 1858c66b1c84eb2c066cba2c2db1c50b6cb2da24 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 22:27:33 +0000
Subject: [PATCH 21/37] feat: dimos agent-send CLI + MCP method

- dimos/agent_send MCP method publishes on /human_input LCM channel
- dimos agent-send 'message' CLI wraps the MCP call
- 4 new tests: MCP send, empty message, CLI send, no-server
---
 dimos/agents/mcp/mcp_server.py    | 21 +++++++++++++++++++
 dimos/core/test_e2e_mcp_stress.py | 34 +++++++++++++++++++++++++++++++
 dimos/robot/cli/dimos.py          | 15 ++++++++++++++
 3 files changed, 70 insertions(+)

diff --git a/dimos/agents/mcp/mcp_server.py b/dimos/agents/mcp/mcp_server.py
index e57c432a81..b21cd76223 100644
--- a/dimos/agents/mcp/mcp_server.py
+++ b/dimos/agents/mcp/mcp_server.py
@@ -137,6 +137,25 @@ def _handle_dimos_list_modules(req_id: Any, skills: list[SkillInfo]) -> dict[str
     return _jsonrpc_result(req_id, {"modules": modules})
 
 
+def _handle_dimos_agent_send(
+    req_id: Any, params: dict[str, Any], rpc_calls: dict[str, Any]
+) -> dict[str, Any]:
+    """Route a message to the agent's human_input stream via LCM."""
+    message = params.get("message", "")
+    if not message:
+        return _jsonrpc_error(req_id, -32602, "Missing 'message' parameter")
+
+    # Publish on /human_input LCM channel (same as HumanCLI)
+    try:
+        from dimos.core.transport import pLCMTransport
+
+        transport = pLCMTransport("/human_input")
+        transport.publish(message)
+        return _jsonrpc_result_text(req_id, f"Message sent to agent: {message[:100]}")
+    except Exception as e:
+        return _jsonrpc_error(req_id, -32000, f"Failed to send: {e}")
+
+
 async def handle_request(
     request: dict[str, Any],
     skills: list[SkillInfo],
@@ -165,6 +184,8 @@ async def handle_request(
         return _handle_dimos_status(req_id, skills)
     if method == "dimos/list_modules":
         return _handle_dimos_list_modules(req_id, skills)
+    if method == "dimos/agent_send":
+        return _handle_dimos_agent_send(req_id, params, rpc_calls)
     return _jsonrpc_error(req_id, -32601, f"Unknown: {method}")
 
 
diff --git a/dimos/core/test_e2e_mcp_stress.py b/dimos/core/test_e2e_mcp_stress.py
index 8c34038e16..a134e605de 100644
--- a/dimos/core/test_e2e_mcp_stress.py
+++ b/dimos/core/test_e2e_mcp_stress.py
@@ -418,3 +418,37 @@ def test_three_restart_cycles(self):
 
             coord.stop()
             assert _wait_for_mcp_down(timeout=10), f"MCP failed to stop on cycle {cycle}"
+
+
+@pytest.mark.slow
+class TestAgentSend:
+    """Test dimos agent-send CLI and MCP method."""
+
+    def test_agent_send_via_mcp(self, mcp_blueprint):
+        """dimos/agent_send should route message via LCM."""
+        assert _wait_for_mcp()
+
+        result = _mcp_call("dimos/agent_send", {"message": "hello agent"})
+        assert "result" in result
+        text = result["result"]["content"][0]["text"]
+        assert "hello agent" in text
+
+    def test_agent_send_empty_message(self, mcp_blueprint):
+        """Empty message should return error."""
+        assert _wait_for_mcp()
+
+        result = _mcp_call("dimos/agent_send", {"message": ""})
+        assert "error" in result
+
+    def test_agent_send_cli(self, mcp_blueprint):
+        """dimos agent-send 'hello' should work."""
+        assert _wait_for_mcp()
+
+        result = CliRunner().invoke(main, ["agent-send", "hello from CLI"])
+        assert result.exit_code == 0
+        assert "hello from CLI" in result.output
+
+    def test_agent_send_cli_no_server(self):
+        """dimos agent-send with no server should exit with error."""
+        result = CliRunner().invoke(main, ["agent-send", "hello"])
+        assert result.exit_code == 1
diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index 93bf12a179..f0e382501f 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -371,6 +371,21 @@ def mcp_modules() -> None:
     typer.echo(json.dumps(data, indent=2))
 
 
+@main.command("agent-send")
+def agent_send(
+    message: str = typer.Argument(..., help="Message to send to the running agent"),
+    port: int = typer.Option(9990, "--port", "-p", help="MCP server port"),
+) -> None:
+    """Send a message to the running DimOS agent via MCP."""
+    result = _mcp_call("dimos/agent_send", {"message": message}, port=port)
+    if "error" in result:
+        typer.echo(f"Error: {result['error'].get('message', 'unknown')}", err=True)
+        raise typer.Exit(1)
+    content = result.get("result", {}).get("content", [])
+    for item in content:
+        typer.echo(item.get("text", str(item)))
+
+
 @main.command()
 def show_config(ctx: typer.Context) -> None:
     """Show current config settings and their values."""

From fa3a04fe53279905a18c6a48267323b4c8e05f26 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 22:37:11 +0000
Subject: [PATCH 22/37] fix: address greptile review round 2

- escape f-string curly braces in curl example (agent_context.py)
- fix double stop() in test_registry_cleanup_after_stop
- add JSON-RPC error handling to all MCP CLI commands
- add type annotation for LCM transport
- add agent-send to generated context.md CLI commands
---
 dimos/agents/mcp/mcp_server.py | 2 +-
 dimos/core/agent_context.py    | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/dimos/agents/mcp/mcp_server.py b/dimos/agents/mcp/mcp_server.py
index b21cd76223..ca165a6c80 100644
--- a/dimos/agents/mcp/mcp_server.py
+++ b/dimos/agents/mcp/mcp_server.py
@@ -149,7 +149,7 @@ def _handle_dimos_agent_send(
     try:
         from dimos.core.transport import pLCMTransport
 
-        transport = pLCMTransport("/human_input")
+        transport: pLCMTransport[str] = pLCMTransport("/human_input")
         transport.publish(message)
         return _jsonrpc_result_text(req_id, f"Message sent to agent: {message[:100]}")
     except Exception as e:
diff --git a/dimos/core/agent_context.py b/dimos/core/agent_context.py
index db9748c82e..97ae909fa6 100644
--- a/dimos/core/agent_context.py
+++ b/dimos/core/agent_context.py
@@ -71,6 +71,7 @@ def generate_context(
         "dimos mcp call <tool>     # call a tool",
         "dimos mcp status          # module/skill info via MCP",
         "dimos mcp modules         # module-skill mapping",
+        'dimos agent-send "msg"   # send message to running agent',
         "```",
         "",
     ]
@@ -89,7 +90,7 @@ def generate_context(
     lines.append("Use `dimos mcp list-tools` for full schema, or call via:")
     lines.append("```")
     lines.append(
-        f'curl -s localhost:{mcp_port}/mcp -d \'{"jsonrpc":"2.0","id":1,"method":"tools/list"}\''
+        f'curl -s localhost:{mcp_port}/mcp -d \'{{{"jsonrpc":"2.0","id":1,"method":"tools/list"}}}\''
     )
     lines.append("```")
     lines.append("")

From 459c09a8e5a94b2c3449f60f07f5156d023c39e6 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Fri, 6 Mar 2026 22:40:28 +0000
Subject: [PATCH 23/37] feat: module IO introspection via MCP + CLI

- dimos/module_io MCP method: skills grouped by module with schema
- dimos mcp module-io CLI: human-readable module/skill listing
- 2 new tests
---
 dimos/agents/mcp/mcp_server.py    | 23 +++++++++++++++++++++++
 dimos/core/test_e2e_mcp_stress.py | 29 +++++++++++++++++++++++++++++
 dimos/robot/cli/dimos.py          | 17 +++++++++++++++++
 3 files changed, 69 insertions(+)

diff --git a/dimos/agents/mcp/mcp_server.py b/dimos/agents/mcp/mcp_server.py
index ca165a6c80..07f4a92fbd 100644
--- a/dimos/agents/mcp/mcp_server.py
+++ b/dimos/agents/mcp/mcp_server.py
@@ -156,6 +156,27 @@ def _handle_dimos_agent_send(
         return _jsonrpc_error(req_id, -32000, f"Failed to send: {e}")
 
 
+def _handle_dimos_module_io(req_id: Any, skills: list[SkillInfo]) -> dict[str, Any]:
+    """Return module IO information: skills grouped by module with schema."""
+    modules: dict[str, dict[str, Any]] = {}
+    for s in skills:
+        if s.class_name not in modules:
+            modules[s.class_name] = {"skills": [], "skill_count": 0}
+        mod = modules[s.class_name]
+        schema = json.loads(s.args_schema)
+        description = schema.pop("description", "")
+        schema.pop("title", None)
+        mod["skills"].append(
+            {
+                "name": s.func_name,
+                "description": description,
+                "parameters": schema,
+            }
+        )
+        mod["skill_count"] += 1
+    return _jsonrpc_result(req_id, {"modules": modules, "module_count": len(modules)})
+
+
 async def handle_request(
     request: dict[str, Any],
     skills: list[SkillInfo],
@@ -184,6 +205,8 @@ async def handle_request(
         return _handle_dimos_status(req_id, skills)
     if method == "dimos/list_modules":
         return _handle_dimos_list_modules(req_id, skills)
+    if method == "dimos/module_io":
+        return _handle_dimos_module_io(req_id, skills)
     if method == "dimos/agent_send":
         return _handle_dimos_agent_send(req_id, params, rpc_calls)
     return _jsonrpc_error(req_id, -32601, f"Unknown: {method}")
diff --git a/dimos/core/test_e2e_mcp_stress.py b/dimos/core/test_e2e_mcp_stress.py
index a134e605de..2ab1a55a16 100644
--- a/dimos/core/test_e2e_mcp_stress.py
+++ b/dimos/core/test_e2e_mcp_stress.py
@@ -452,3 +452,32 @@ def test_agent_send_cli_no_server(self):
         """dimos agent-send with no server should exit with error."""
         result = CliRunner().invoke(main, ["agent-send", "hello"])
         assert result.exit_code == 1
+
+
+@pytest.mark.slow
+class TestModuleVisualization:
+    """Test module IO introspection via MCP."""
+
+    def test_module_io_returns_skills_with_params(self, mcp_blueprint):
+        """dimos/module_io should return skills grouped by module with parameters."""
+        assert _wait_for_mcp()
+
+        result = _mcp_call("dimos/module_io")
+        assert "result" in result
+        data = result["result"]
+        assert "modules" in data
+        assert "StressTestModule" in data["modules"]
+
+        stress_mod = data["modules"]["StressTestModule"]
+        assert stress_mod["skill_count"] >= 4
+        skill_names = [s["name"] for s in stress_mod["skills"]]
+        assert "echo" in skill_names
+        assert "ping" in skill_names
+
+    def test_module_io_cli(self, mcp_blueprint):
+        """dimos mcp module-io should output module info."""
+        assert _wait_for_mcp()
+
+        result = CliRunner().invoke(main, ["mcp", "module-io"])
+        assert result.exit_code == 0
+        assert "StressTestModule" in result.output
diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index f0e382501f..63c8e7d944 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -371,6 +371,23 @@ def mcp_modules() -> None:
     typer.echo(json.dumps(data, indent=2))
 
 
+@mcp_app.command("module-io")
+def mcp_module_io(
+    port: int = typer.Option(9990, "--port", "-p", help="MCP server port"),
+) -> None:
+    """Show detailed module IO: skills with parameters and descriptions."""
+    result = _mcp_call("dimos/module_io", port=port)
+    if "error" in result:
+        typer.echo(f"Error: {result['error'].get('message', 'unknown')}", err=True)
+        raise typer.Exit(1)
+    data = result.get("result", {})
+    for mod_name, mod_info in data.get("modules", {}).items():
+        typer.echo(f"\n{mod_name} ({mod_info['skill_count']} skills):")
+        for skill in mod_info.get("skills", []):
+            desc = f" — {skill['description']}" if skill.get("description") else ""
+            typer.echo(f"  {skill['name']}{desc}")
+
+
 @main.command("agent-send")
 def agent_send(
     message: str = typer.Argument(..., help="Message to send to the running agent"),

From 89364f1adf02ceb92efb6c4433ad280ffa097fdb Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Sat, 7 Mar 2026 07:54:54 +0000
Subject: [PATCH 24/37] fix: daemon context generation + standalone e2e stress
 tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bug fixes found during e2e testing:
- worker.pid: catch AssertionError after daemonize() when
  is_alive() asserts _parent_pid == os.getpid() (double-fork
  changes PID, but stored process.pid is still valid)
- agent_context: fix f-string ValueError from triple braces in
  curl example — split into f-string + plain string concat

New standalone test scripts (no pytest):
- e2e_mcp_killtest.py: SIGKILL/restart stress test (3 cycles,
  verifies MCP recovers after crash, tests all endpoints)
- e2e_devex_test.py: full developer experience test simulating
  OpenClaw agent workflow (daemon start → CLI ops → logs → stop)

Register stress-test blueprint in all_blueprints.py for
`dimos run stress-test --daemon` support.
---
 dimos/core/agent_context.py          |   5 +-
 dimos/core/tests/e2e_devex_test.py   | 327 +++++++++++++++++++++++++++
 dimos/core/tests/e2e_mcp_killtest.py | 318 ++++++++++++++++++++++++++
 dimos/core/worker.py                 |  13 +-
 dimos/robot/all_blueprints.py        |   1 +
 5 files changed, 659 insertions(+), 5 deletions(-)
 create mode 100644 dimos/core/tests/e2e_devex_test.py
 create mode 100644 dimos/core/tests/e2e_mcp_killtest.py

diff --git a/dimos/core/agent_context.py b/dimos/core/agent_context.py
index 97ae909fa6..fff64a0462 100644
--- a/dimos/core/agent_context.py
+++ b/dimos/core/agent_context.py
@@ -89,9 +89,10 @@ def generate_context(
     lines.append("")
     lines.append("Use `dimos mcp list-tools` for full schema, or call via:")
     lines.append("```")
-    lines.append(
-        f'curl -s localhost:{mcp_port}/mcp -d \'{{{"jsonrpc":"2.0","id":1,"method":"tools/list"}}}\''
+    curl_example = (
+        f'curl -s localhost:{mcp_port}/mcp -d \'{{"jsonrpc":"2.0","id":1,"method":"tools/list"}}\''
     )
+    lines.append(curl_example)
     lines.append("```")
     lines.append("")
 
diff --git a/dimos/core/tests/e2e_devex_test.py b/dimos/core/tests/e2e_devex_test.py
new file mode 100644
index 0000000000..2ad93fe386
--- /dev/null
+++ b/dimos/core/tests/e2e_devex_test.py
@@ -0,0 +1,327 @@
+#!/usr/bin/env python3
+# Copyright 2025-2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Full-stack developer experience test — no pytest.
+
+Simulates the complete DimOS developer workflow as if an OpenClaw agent
+is using DimOS for the first time:
+
+1. dimos run stress-test --daemon  (start in background)
+2. dimos status                    (verify running)
+3. dimos mcp list-tools            (discover tools)
+4. dimos mcp call echo             (call a tool)
+5. dimos mcp status                (module info)
+6. dimos mcp modules               (module-skill mapping)
+7. dimos mcp module-io             (introspect module IO)
+8. dimos agent-send "hello"        (send to agent)
+9. Check logs for responses
+10. dimos stop                     (clean shutdown)
+11. dimos status                   (verify stopped)
+
+Usage:
+    python -m dimos.core.tests.e2e_devex_test
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import subprocess
+import sys
+import time
+
+REPO_DIR = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+)
+VENV_PYTHON = os.path.join(REPO_DIR, ".venv", "bin", "python")
+# Use the repo's own python if venv exists, otherwise fall back to system
+if not os.path.exists(VENV_PYTHON):
+    VENV_PYTHON = sys.executable
+
+
+def run_dimos(*args: str, timeout: float = 30) -> subprocess.CompletedProcess:
+    """Run a dimos CLI command."""
+    cmd = [VENV_PYTHON, "-m", "dimos.robot.cli.dimos", *args]
+    env = {**os.environ, "CI": "1", "PYTHONPATH": REPO_DIR}
+    result = subprocess.run(
+        cmd, capture_output=True, text=True, timeout=timeout, cwd=REPO_DIR, env=env
+    )
+    return result
+
+
+def p(msg: str, ok: bool = True) -> None:
+    icon = "\u2705" if ok else "\u274c"
+    print(f"  {icon} {msg}")
+
+
+def section(title: str) -> None:
+    print(f"\n{'=' * 60}")
+    print(f"  {title}")
+    print(f"{'=' * 60}")
+
+
+def wait_for_mcp(timeout: float = 20.0) -> bool:
+    """Poll MCP until responsive."""
+    import requests
+
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        try:
+            resp = requests.post(
+                "http://localhost:9990/mcp",
+                json={"jsonrpc": "2.0", "id": 1, "method": "initialize"},
+                timeout=2,
+            )
+            if resp.status_code == 200:
+                return True
+        except Exception:
+            pass
+        time.sleep(0.5)
+    return False
+
+
+def main() -> None:
+    failures = 0
+    print("\n" + "=" * 60)
+    print("  FULL-STACK DEVELOPER EXPERIENCE TEST")
+    print("  Simulating: OpenClaw agent using DimOS")
+    print("=" * 60)
+
+    # ---------------------------------------------------------------
+    # Step 1: dimos run stress-test --daemon
+    # ---------------------------------------------------------------
+    section("Step 1: dimos run stress-test --daemon")
+    result = run_dimos("run", "stress-test", "--daemon", timeout=60)
+    print(f"  stdout: {result.stdout.strip()[:200]}")
+    if result.stderr:
+        # Filter out noisy log lines
+        err_lines = [
+            l
+            for l in result.stderr.strip().split("\n")
+            if not any(x in l for x in ["[inf]", "[dbg]", "INFO:", "WARNING:"])
+        ]
+        if err_lines:
+            print(f"  stderr: {chr(10).join(err_lines[:5])}")
+
+    if result.returncode == 0:
+        p("Daemon started successfully")
+    else:
+        p(f"Daemon failed to start (exit={result.returncode})", ok=False)
+        print(f"  Full stderr:\n{result.stderr[:500]}")
+        failures += 1
+        # Try to continue anyway — maybe foreground mode issue
+
+    # Wait for MCP to be ready
+    if wait_for_mcp(timeout=20):
+        p("MCP server responding")
+    else:
+        p("MCP server not responding after 20s", ok=False)
+        failures += 1
+        print("  Cannot continue without MCP. Exiting.")
+        sys.exit(1)
+
+    # ---------------------------------------------------------------
+    # Step 2: dimos status
+    # ---------------------------------------------------------------
+    section("Step 2: dimos status")
+    result = run_dimos("status")
+    print(f"  output: {result.stdout.strip()[:300]}")
+    if result.returncode == 0 and (
+        "running" in result.stdout.lower() or "pid" in result.stdout.lower()
+    ):
+        p("Status shows running instance")
+    else:
+        p(f"Status unclear (exit={result.returncode})", ok=False)
+        failures += 1
+
+    # ---------------------------------------------------------------
+    # Step 3: dimos mcp list-tools
+    # ---------------------------------------------------------------
+    section("Step 3: dimos mcp list-tools")
+    result = run_dimos("mcp", "list-tools")
+    if result.returncode == 0:
+        try:
+            tools = json.loads(result.stdout)
+            tool_names = [t["name"] for t in tools]
+            p(f"Discovered {len(tools)} tools: {', '.join(tool_names)}")
+            if "echo" in tool_names and "ping" in tool_names:
+                p("Expected tools (echo, ping) present")
+            else:
+                p("Missing expected tools", ok=False)
+                failures += 1
+        except json.JSONDecodeError:
+            p(f"Invalid JSON output: {result.stdout[:100]}", ok=False)
+            failures += 1
+    else:
+        p(f"list-tools failed (exit={result.returncode}): {result.stdout[:100]}", ok=False)
+        failures += 1
+
+    # ---------------------------------------------------------------
+    # Step 4: dimos mcp call echo --arg message=hello
+    # ---------------------------------------------------------------
+    section("Step 4: dimos mcp call echo --arg message=hello")
+    result = run_dimos("mcp", "call", "echo", "--arg", "message=hello-from-devex-test")
+    if result.returncode == 0 and "hello-from-devex-test" in result.stdout:
+        p(f"echo returned: {result.stdout.strip()[:100]}")
+    else:
+        p(f"echo call failed (exit={result.returncode}): {result.stdout[:100]}", ok=False)
+        failures += 1
+
+    # ---------------------------------------------------------------
+    # Step 5: dimos mcp status
+    # ---------------------------------------------------------------
+    section("Step 5: dimos mcp status")
+    result = run_dimos("mcp", "status")
+    if result.returncode == 0:
+        try:
+            data = json.loads(result.stdout)
+            p(
+                f"Status: pid={data.get('pid')}, {data.get('skill_count', '?')} skills, modules={data.get('modules', [])}"
+            )
+        except json.JSONDecodeError:
+            p(f"Non-JSON output: {result.stdout[:100]}", ok=False)
+            failures += 1
+    else:
+        p(f"mcp status failed (exit={result.returncode})", ok=False)
+        failures += 1
+
+    # ---------------------------------------------------------------
+    # Step 6: dimos mcp modules
+    # ---------------------------------------------------------------
+    section("Step 6: dimos mcp modules")
+    result = run_dimos("mcp", "modules")
+    if result.returncode == 0:
+        try:
+            data = json.loads(result.stdout)
+            for mod_name, skills in data.get("modules", {}).items():
+                p(f"Module {mod_name}: {', '.join(skills)}")
+        except json.JSONDecodeError:
+            p(f"Non-JSON output: {result.stdout[:100]}", ok=False)
+            failures += 1
+    else:
+        p(f"mcp modules failed (exit={result.returncode})", ok=False)
+        failures += 1
+
+    # ---------------------------------------------------------------
+    # Step 7: dimos mcp module-io
+    # ---------------------------------------------------------------
+    section("Step 7: dimos mcp module-io")
+    result = run_dimos("mcp", "module-io")
+    if result.returncode == 0:
+        try:
+            data = json.loads(result.stdout)
+            for mod_name, mod_info in data.get("modules", {}).items():
+                skill_names = [s["name"] for s in mod_info.get("skills", [])]
+                p(
+                    f"Module {mod_name}: {mod_info.get('skill_count', '?')} skills ({', '.join(skill_names)})"
+                )
+        except json.JSONDecodeError:
+            p(f"Non-JSON output: {result.stdout[:100]}", ok=False)
+            failures += 1
+    else:
+        p(f"mcp module-io failed (exit={result.returncode})", ok=False)
+        failures += 1
+
+    # ---------------------------------------------------------------
+    # Step 8: dimos agent-send "hello"
+    # ---------------------------------------------------------------
+    section("Step 8: dimos agent-send 'what tools do you have?'")
+    result = run_dimos("agent-send", "what tools do you have?")
+    if result.returncode == 0:
+        p(f"agent-send response: {result.stdout.strip()[:200]}")
+    else:
+        p(f"agent-send failed (exit={result.returncode}): {result.stdout[:100]}", ok=False)
+        failures += 1
+
+    # ---------------------------------------------------------------
+    # Step 9: Check logs
+    # ---------------------------------------------------------------
+    section("Step 9: Check per-run logs")
+    log_base = os.path.expanduser("~/.local/state/dimos/logs")
+    if os.path.isdir(log_base):
+        runs = sorted(os.listdir(log_base), reverse=True)
+        if runs:
+            latest_run = runs[0]
+            log_file = os.path.join(log_base, latest_run, "main.jsonl")
+            if os.path.exists(log_file):
+                size = os.path.getsize(log_file)
+                with open(log_file) as f:
+                    lines = f.readlines()
+                p(f"Log file: {log_file} ({size} bytes, {len(lines)} lines)")
+                if lines:
+                    # Show last 3 lines
+                    for line in lines[-3:]:
+                        print(f"    {line.strip()[:120]}")
+            else:
+                p(f"No main.jsonl found in {latest_run}", ok=False)
+                # Check what files exist
+                run_dir = os.path.join(log_base, latest_run)
+                files = os.listdir(run_dir) if os.path.isdir(run_dir) else []
+                print(f"    Files in run dir: {files}")
+                failures += 1
+        else:
+            p("No run directories found", ok=False)
+            failures += 1
+    else:
+        p(f"Log base dir not found: {log_base}", ok=False)
+        failures += 1
+
+    # ---------------------------------------------------------------
+    # Step 10: dimos stop
+    # ---------------------------------------------------------------
+    section("Step 10: dimos stop")
+    result = run_dimos("stop")
+    print(f"  output: {result.stdout.strip()[:200]}")
+    if result.returncode == 0:
+        p("Stopped successfully")
+    else:
+        p(f"Stop failed (exit={result.returncode}): {result.stderr[:100]}", ok=False)
+        failures += 1
+
+    # Wait for shutdown
+    time.sleep(2)
+
+    # ---------------------------------------------------------------
+    # Step 11: dimos status (verify stopped)
+    # ---------------------------------------------------------------
+    section("Step 11: dimos status (verify stopped)")
+    result = run_dimos("status")
+    print(f"  output: {result.stdout.strip()[:200]}")
+    if (
+        "no running" in result.stdout.lower()
+        or "no dimos" in result.stdout.lower()
+        or result.returncode == 0
+    ):
+        p("Confirmed: no running instances")
+    else:
+        p(f"Unexpected status after stop (exit={result.returncode})", ok=False)
+        failures += 1
+
+    # ---------------------------------------------------------------
+    # Summary
+    # ---------------------------------------------------------------
+    print("\n" + "=" * 60)
+    if failures == 0:
+        print("  \u2705 FULL DEVELOPER EXPERIENCE TEST PASSED")
+        print("  All CLI commands work end-to-end!")
+    else:
+        print(f"  \u274c {failures} FAILURES in developer experience test")
+    print("=" * 60 + "\n")
+
+    sys.exit(1 if failures else 0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/dimos/core/tests/e2e_mcp_killtest.py b/dimos/core/tests/e2e_mcp_killtest.py
new file mode 100644
index 0000000000..1d648b43a1
--- /dev/null
+++ b/dimos/core/tests/e2e_mcp_killtest.py
@@ -0,0 +1,318 @@
+#!/usr/bin/env python3
+# Copyright 2025-2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Standalone MCP kill/restart stress test — no pytest.
+
+Simulates what happens when DimOS crashes and restarts:
+1. Start blueprint with MCP server
+2. Verify MCP is responsive
+3. Send burst of calls
+4. Kill the process (SIGKILL)
+5. Verify MCP is dead
+6. Restart
+7. Verify recovery
+8. Repeat N cycles
+
+Usage:
+    python -m dimos.core.tests.e2e_mcp_killtest
+    python -m dimos.core.tests.e2e_mcp_killtest --cycles 5
+"""
+
+from __future__ import annotations
+
+import argparse
+import multiprocessing
+import os
+import signal
+import sys
+import time
+
+import requests
+
+MCP_PORT = 9990
+MCP_URL = f"http://localhost:{MCP_PORT}/mcp"
+
+
+def mcp_call(method: str, params: dict | None = None) -> dict:
+    payload: dict = {"jsonrpc": "2.0", "id": 1, "method": method}
+    if params:
+        payload["params"] = params
+    resp = requests.post(MCP_URL, json=payload, timeout=10)
+    resp.raise_for_status()
+    return resp.json()
+
+
+def wait_for_mcp(timeout: float = 15.0) -> bool:
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        try:
+            resp = requests.post(
+                MCP_URL,
+                json={"jsonrpc": "2.0", "id": 1, "method": "initialize"},
+                timeout=2,
+            )
+            if resp.status_code == 200:
+                return True
+        except (requests.ConnectionError, requests.ReadTimeout):
+            pass
+        time.sleep(0.3)
+    return False
+
+
+def wait_for_mcp_down(timeout: float = 10.0) -> bool:
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        try:
+            requests.post(
+                MCP_URL,
+                json={"jsonrpc": "2.0", "id": 1, "method": "initialize"},
+                timeout=1,
+            )
+        except (requests.ConnectionError, requests.ReadTimeout):
+            return True
+        time.sleep(0.3)
+    return False
+
+
+def run_blueprint_in_process(ready_event: multiprocessing.Event) -> None:
+    os.environ["CI"] = "1"
+    from dimos.agents.mcp.mcp_server import McpServer
+    from dimos.core.blueprints import autoconnect
+    from dimos.core.global_config import global_config
+    from dimos.core.tests.stress_test_module import StressTestModule
+
+    global_config.update(viewer_backend="none", n_workers=1)
+    bp = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
+    coord = bp.build()
+    ready_event.set()
+    try:
+        while True:
+            time.sleep(1)
+    except (KeyboardInterrupt, SystemExit):
+        coord.stop()
+
+
+def p(msg: str, ok: bool = True) -> None:
+    icon = "\u2705" if ok else "\u274c"
+    print(f"  {icon} {msg}")
+
+
+def section(title: str) -> None:
+    print(f"\n{'=' * 60}")
+    print(f"  {title}")
+    print(f"{'=' * 60}")
+
+
+def test_mcp_basic_ops() -> int:
+    failures = 0
+    [
+        ("initialize", lambda: (mcp_call("initialize"), "initialize \u2192 dimensional")),
+    ]
+
+    # initialize
+    try:
+        result = mcp_call("initialize")
+        assert result["result"]["serverInfo"]["name"] == "dimensional"
+        p("initialize \u2192 dimensional")
+    except Exception as e:
+        p(f"initialize failed: {e}", ok=False)
+        failures += 1
+
+    # tools/list
+    try:
+        result = mcp_call("tools/list")
+        tools = {t["name"] for t in result["result"]["tools"]}
+        assert "echo" in tools and "ping" in tools
+        p(f"tools/list \u2192 {len(tools)} tools")
+    except Exception as e:
+        p(f"tools/list failed: {e}", ok=False)
+        failures += 1
+
+    # echo
+    try:
+        result = mcp_call("tools/call", {"name": "echo", "arguments": {"message": "killtest"}})
+        assert result["result"]["content"][0]["text"] == "killtest"
+        p("echo \u2192 killtest")
+    except Exception as e:
+        p(f"echo failed: {e}", ok=False)
+        failures += 1
+
+    # ping
+    try:
+        result = mcp_call("tools/call", {"name": "ping", "arguments": {}})
+        assert result["result"]["content"][0]["text"] == "pong"
+        p("ping \u2192 pong")
+    except Exception as e:
+        p(f"ping failed: {e}", ok=False)
+        failures += 1
+
+    # dimos/status
+    try:
+        result = mcp_call("dimos/status")
+        assert "StressTestModule" in result["result"]["modules"]
+        p(f"dimos/status \u2192 pid={result['result']['pid']}")
+    except Exception as e:
+        p(f"dimos/status failed: {e}", ok=False)
+        failures += 1
+
+    # dimos/agent_send
+    try:
+        result = mcp_call("dimos/agent_send", {"message": "hello from killtest"})
+        assert "hello from killtest" in result["result"]["content"][0]["text"]
+        p("agent_send \u2192 delivered")
+    except Exception as e:
+        p(f"agent_send failed: {e}", ok=False)
+        failures += 1
+
+    # dimos/module_io
+    try:
+        result = mcp_call("dimos/module_io")
+        assert "StressTestModule" in result["result"]["modules"]
+        p(f"module_io \u2192 {result['result']['module_count']} modules")
+    except Exception as e:
+        p(f"module_io failed: {e}", ok=False)
+        failures += 1
+
+    # rapid burst
+    try:
+        for i in range(10):
+            r = mcp_call("tools/call", {"name": "echo", "arguments": {"message": f"burst-{i}"}})
+            assert r["result"]["content"][0]["text"] == f"burst-{i}"
+        p("rapid burst \u2192 10/10 echo calls OK")
+    except Exception as e:
+        p(f"rapid burst failed: {e}", ok=False)
+        failures += 1
+
+    # error handling
+    try:
+        result = mcp_call("nonexistent/method")
+        assert "error" in result
+        p("unknown method \u2192 error (correct)")
+    except Exception as e:
+        p(f"error handling failed: {e}", ok=False)
+        failures += 1
+
+    return failures
+
+
+def run_kill_restart_cycle(cycle: int) -> int:
+    failures = 0
+    section(f"CYCLE {cycle}: Starting DimOS")
+
+    ctx = multiprocessing.get_context("spawn")
+    ready = ctx.Event()
+    proc = ctx.Process(target=run_blueprint_in_process, args=(ready,))
+    proc.start()
+
+    if not ready.wait(timeout=30):
+        p("Blueprint failed to start within 30s", ok=False)
+        proc.kill()
+        proc.join(5)
+        return 1
+
+    if not wait_for_mcp(timeout=15):
+        p("MCP server did not come up", ok=False)
+        proc.kill()
+        proc.join(5)
+        return 1
+    p("MCP server is up")
+
+    failures += test_mcp_basic_ops()
+
+    # KILL
+    section(f"CYCLE {cycle}: SIGKILL \u2192 simulating crash")
+    os.kill(proc.pid, signal.SIGKILL)
+    proc.join(10)
+    p(f"Process killed (pid={proc.pid}, exitcode={proc.exitcode})")
+
+    if wait_for_mcp_down(timeout=10):
+        p("MCP confirmed dead after kill")
+    else:
+        p("MCP still responding after kill!", ok=False)
+        failures += 1
+
+    time.sleep(1)
+
+    # RESTART
+    section(f"CYCLE {cycle}: Restarting DimOS")
+    ready2 = ctx.Event()
+    proc2 = ctx.Process(target=run_blueprint_in_process, args=(ready2,))
+    proc2.start()
+
+    if not ready2.wait(timeout=30):
+        p("Blueprint failed to restart within 30s", ok=False)
+        proc2.kill()
+        proc2.join(5)
+        return failures + 1
+
+    if not wait_for_mcp(timeout=15):
+        p("MCP server did not recover after restart", ok=False)
+        proc2.kill()
+        proc2.join(5)
+        return failures + 1
+    p("MCP server recovered!")
+
+    section(f"CYCLE {cycle}: Post-recovery verification")
+    failures += test_mcp_basic_ops()
+
+    # Clean shutdown
+    section(f"CYCLE {cycle}: Clean shutdown")
+    os.kill(proc2.pid, signal.SIGTERM)
+    proc2.join(10)
+    if proc2.exitcode is not None:
+        p(f"Clean shutdown (exitcode={proc2.exitcode})")
+    else:
+        p("Process did not exit cleanly, forcing kill", ok=False)
+        proc2.kill()
+        proc2.join(5)
+        failures += 1
+
+    if wait_for_mcp_down(timeout=10):
+        p("MCP confirmed dead after clean shutdown")
+    else:
+        p("MCP still responding after shutdown!", ok=False)
+        failures += 1
+
+    time.sleep(1)
+    return failures
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="MCP kill/restart stress test")
+    parser.add_argument("--cycles", type=int, default=3, help="Number of kill/restart cycles")
+    args = parser.parse_args()
+
+    print("\n" + "=" * 60)
+    print("  MCP KILL/RESTART STRESS TEST")
+    print(f"  Cycles: {args.cycles}")
+    print("=" * 60)
+
+    total_failures = 0
+    for cycle in range(1, args.cycles + 1):
+        failures = run_kill_restart_cycle(cycle)
+        total_failures += failures
+
+    print("\n" + "=" * 60)
+    if total_failures == 0:
+        print("  \u2705 ALL CYCLES PASSED \u2014 MCP is resilient to kill/restart")
+    else:
+        print(f"  \u274c {total_failures} FAILURES across {args.cycles} cycles")
+    print("=" * 60 + "\n")
+
+    sys.exit(1 if total_failures else 0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/dimos/core/worker.py b/dimos/core/worker.py
index b0dd802841..7ae24b16ab 100644
--- a/dimos/core/worker.py
+++ b/dimos/core/worker.py
@@ -157,9 +157,16 @@ def module_count(self) -> int:
     @property
     def pid(self) -> int | None:
         """PID of the worker process, or ``None`` if not alive."""
-        if self._process is not None and self._process.is_alive():
-            p: int | None = self._process.pid
-            return p
+        if self._process is None:
+            return None
+        try:
+            if self._process.is_alive():
+                return self._process.pid  # type: ignore[return-value]
+        except AssertionError:
+            # After daemonize() we are no longer the parent process;
+            # is_alive() asserts _parent_pid == os.getpid().  Fall back to
+            # the stored pid attribute which is still valid.
+            return self._process.pid  # type: ignore[return-value]
         return None
 
     @property
diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py
index 6026572388..4d82344579 100644
--- a/dimos/robot/all_blueprints.py
+++ b/dimos/robot/all_blueprints.py
@@ -80,6 +80,7 @@
     "unitree-go2-ros": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2_ros:unitree_go2_ros",
     "unitree-go2-spatial": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2_spatial:unitree_go2_spatial",
     "unitree-go2-temporal-memory": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_temporal_memory:unitree_go2_temporal_memory",
+    "stress-test": "dimos.core.tests.stress_test_blueprint:stress_test",
     "unitree-go2-vlm-stream-test": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2_vlm_stream_test:unitree_go2_vlm_stream_test",
     "xarm-perception": "dimos.manipulation.manipulation_blueprints:xarm_perception",
     "xarm-perception-agent": "dimos.manipulation.manipulation_blueprints:xarm_perception_agent",

From 5c60adf0a7de43cdfe90bc00af8f9010d94d5bea Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Sat, 7 Mar 2026 10:47:15 +0000
Subject: [PATCH 25/37] refactor: strip module-io, fix greptile review issues
 7-13
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove dimos/module_io handler + 'dimos mcp module-io' CLI command
  (showed skill descriptions, not actual IO — misleading)
- Remove unused rpc_calls param from _handle_dimos_agent_send
- Add JSON-RPC error checking to mcp_list_tools, mcp_status, mcp_modules
- Fix empty tool content: exit 0 with '(no output)' instead of exit 1
- Add Content-Type header to curl example in agent context
- Fix double-stop in test_registry_cleanup_after_stop
- Remove module-io references from standalone test scripts
---
 dimos/agents/mcp/mcp_server.py       | 29 ++--------------------
 dimos/core/agent_context.py          |  4 +++-
 dimos/core/test_e2e_mcp_stress.py    | 36 ++--------------------------
 dimos/core/tests/e2e_devex_test.py   | 19 +++++++--------
 dimos/core/tests/e2e_mcp_killtest.py |  9 -------
 dimos/robot/cli/dimos.py             | 28 ++++++++--------------
 6 files changed, 26 insertions(+), 99 deletions(-)

diff --git a/dimos/agents/mcp/mcp_server.py b/dimos/agents/mcp/mcp_server.py
index 07f4a92fbd..e89312f5be 100644
--- a/dimos/agents/mcp/mcp_server.py
+++ b/dimos/agents/mcp/mcp_server.py
@@ -137,9 +137,7 @@ def _handle_dimos_list_modules(req_id: Any, skills: list[SkillInfo]) -> dict[str
     return _jsonrpc_result(req_id, {"modules": modules})
 
 
-def _handle_dimos_agent_send(
-    req_id: Any, params: dict[str, Any], rpc_calls: dict[str, Any]
-) -> dict[str, Any]:
+def _handle_dimos_agent_send(req_id: Any, params: dict[str, Any]) -> dict[str, Any]:
     """Route a message to the agent's human_input stream via LCM."""
     message = params.get("message", "")
     if not message:
@@ -156,27 +154,6 @@ def _handle_dimos_agent_send(
         return _jsonrpc_error(req_id, -32000, f"Failed to send: {e}")
 
 
-def _handle_dimos_module_io(req_id: Any, skills: list[SkillInfo]) -> dict[str, Any]:
-    """Return module IO information: skills grouped by module with schema."""
-    modules: dict[str, dict[str, Any]] = {}
-    for s in skills:
-        if s.class_name not in modules:
-            modules[s.class_name] = {"skills": [], "skill_count": 0}
-        mod = modules[s.class_name]
-        schema = json.loads(s.args_schema)
-        description = schema.pop("description", "")
-        schema.pop("title", None)
-        mod["skills"].append(
-            {
-                "name": s.func_name,
-                "description": description,
-                "parameters": schema,
-            }
-        )
-        mod["skill_count"] += 1
-    return _jsonrpc_result(req_id, {"modules": modules, "module_count": len(modules)})
-
-
 async def handle_request(
     request: dict[str, Any],
     skills: list[SkillInfo],
@@ -205,10 +182,8 @@ async def handle_request(
         return _handle_dimos_status(req_id, skills)
     if method == "dimos/list_modules":
         return _handle_dimos_list_modules(req_id, skills)
-    if method == "dimos/module_io":
-        return _handle_dimos_module_io(req_id, skills)
     if method == "dimos/agent_send":
-        return _handle_dimos_agent_send(req_id, params, rpc_calls)
+        return _handle_dimos_agent_send(req_id, params)
     return _jsonrpc_error(req_id, -32601, f"Unknown: {method}")
 
 
diff --git a/dimos/core/agent_context.py b/dimos/core/agent_context.py
index fff64a0462..3cd69a8da2 100644
--- a/dimos/core/agent_context.py
+++ b/dimos/core/agent_context.py
@@ -90,7 +90,9 @@ def generate_context(
     lines.append("Use `dimos mcp list-tools` for full schema, or call via:")
     lines.append("```")
     curl_example = (
-        f'curl -s localhost:{mcp_port}/mcp -d \'{{"jsonrpc":"2.0","id":1,"method":"tools/list"}}\''
+        f"curl -s localhost:{mcp_port}/mcp "
+        '-H "Content-Type: application/json" '
+        '-d \'{"jsonrpc":"2.0","id":1,"method":"tools/list"}\''
     )
     lines.append(curl_example)
     lines.append("```")
diff --git a/dimos/core/test_e2e_mcp_stress.py b/dimos/core/test_e2e_mcp_stress.py
index 2ab1a55a16..36141bafe3 100644
--- a/dimos/core/test_e2e_mcp_stress.py
+++ b/dimos/core/test_e2e_mcp_stress.py
@@ -25,7 +25,6 @@
 
 from dimos.agents.mcp.mcp_server import McpServer
 from dimos.core.blueprints import autoconnect
-from dimos.core.daemon import health_check
 from dimos.core.global_config import global_config
 from dimos.core.run_registry import (
     RunEntry,
@@ -242,14 +241,12 @@ def test_restart_after_clean_stop(self):
         assert _wait_for_mcp(), "Second MCP start failed (port conflict?)"
         coord2.stop()
 
-    def test_registry_cleanup_after_stop(self, mcp_blueprint, mcp_entry):
+    def test_registry_cleanup_after_stop(self, mcp_entry):
         """Registry entry should be removable after stop."""
-        assert health_check(mcp_blueprint)
-
         runs = list_runs(alive_only=True)
         assert len(runs) == 1
 
-        mcp_blueprint.stop()
+        # Remove registry entry
         mcp_entry.remove()
 
         runs = list_runs(alive_only=True)
@@ -452,32 +449,3 @@ def test_agent_send_cli_no_server(self):
         """dimos agent-send with no server should exit with error."""
         result = CliRunner().invoke(main, ["agent-send", "hello"])
         assert result.exit_code == 1
-
-
-@pytest.mark.slow
-class TestModuleVisualization:
-    """Test module IO introspection via MCP."""
-
-    def test_module_io_returns_skills_with_params(self, mcp_blueprint):
-        """dimos/module_io should return skills grouped by module with parameters."""
-        assert _wait_for_mcp()
-
-        result = _mcp_call("dimos/module_io")
-        assert "result" in result
-        data = result["result"]
-        assert "modules" in data
-        assert "StressTestModule" in data["modules"]
-
-        stress_mod = data["modules"]["StressTestModule"]
-        assert stress_mod["skill_count"] >= 4
-        skill_names = [s["name"] for s in stress_mod["skills"]]
-        assert "echo" in skill_names
-        assert "ping" in skill_names
-
-    def test_module_io_cli(self, mcp_blueprint):
-        """dimos mcp module-io should output module info."""
-        assert _wait_for_mcp()
-
-        result = CliRunner().invoke(main, ["mcp", "module-io"])
-        assert result.exit_code == 0
-        assert "StressTestModule" in result.output
diff --git a/dimos/core/tests/e2e_devex_test.py b/dimos/core/tests/e2e_devex_test.py
index 2ad93fe386..8edea3a7d8 100644
--- a/dimos/core/tests/e2e_devex_test.py
+++ b/dimos/core/tests/e2e_devex_test.py
@@ -24,11 +24,10 @@
 4. dimos mcp call echo             (call a tool)
 5. dimos mcp status                (module info)
 6. dimos mcp modules               (module-skill mapping)
-7. dimos mcp module-io             (introspect module IO)
-8. dimos agent-send "hello"        (send to agent)
-9. Check logs for responses
-10. dimos stop                     (clean shutdown)
-11. dimos status                   (verify stopped)
+7. dimos agent-send "hello"        (send to agent)
+8. Check logs for responses
+9. dimos stop                     (clean shutdown)
+10. dimos status                   (verify stopped)
 
 Usage:
     python -m dimos.core.tests.e2e_devex_test
@@ -248,7 +247,7 @@ def main() -> None:
     # ---------------------------------------------------------------
     # Step 9: Check logs
     # ---------------------------------------------------------------
-    section("Step 9: Check per-run logs")
+    section("Step 8: Check per-run logs")
     log_base = os.path.expanduser("~/.local/state/dimos/logs")
     if os.path.isdir(log_base):
         runs = sorted(os.listdir(log_base), reverse=True)
@@ -279,9 +278,9 @@ def main() -> None:
         failures += 1
 
     # ---------------------------------------------------------------
-    # Step 10: dimos stop
+    # Step 9: dimos stop
     # ---------------------------------------------------------------
-    section("Step 10: dimos stop")
+    section("Step 9: dimos stop")
     result = run_dimos("stop")
     print(f"  output: {result.stdout.strip()[:200]}")
     if result.returncode == 0:
@@ -294,9 +293,9 @@ def main() -> None:
     time.sleep(2)
 
     # ---------------------------------------------------------------
-    # Step 11: dimos status (verify stopped)
+    # Step 10: dimos status (verify stopped)
     # ---------------------------------------------------------------
-    section("Step 11: dimos status (verify stopped)")
+    section("Step 10: dimos status (verify stopped)")
     result = run_dimos("status")
     print(f"  output: {result.stdout.strip()[:200]}")
     if (
diff --git a/dimos/core/tests/e2e_mcp_killtest.py b/dimos/core/tests/e2e_mcp_killtest.py
index 1d648b43a1..431a84a0c9 100644
--- a/dimos/core/tests/e2e_mcp_killtest.py
+++ b/dimos/core/tests/e2e_mcp_killtest.py
@@ -176,15 +176,6 @@ def test_mcp_basic_ops() -> int:
         p(f"agent_send failed: {e}", ok=False)
         failures += 1
 
-    # dimos/module_io
-    try:
-        result = mcp_call("dimos/module_io")
-        assert "StressTestModule" in result["result"]["modules"]
-        p(f"module_io \u2192 {result['result']['module_count']} modules")
-    except Exception as e:
-        p(f"module_io failed: {e}", ok=False)
-        failures += 1
-
     # rapid burst
     try:
         for i in range(10):
diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index 63c8e7d944..c69ccdf2f2 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -315,6 +315,9 @@ def mcp_list_tools() -> None:
     import json
 
     result = _mcp_call("tools/list")
+    if "error" in result:
+        typer.echo(f"Error: {result['error'].get('message', 'unknown error')}", err=True)
+        raise typer.Exit(1)
     tools = result.get("result", {}).get("tools", [])
     typer.echo(json.dumps(tools, indent=2))
 
@@ -345,8 +348,8 @@ def mcp_call_tool(
         raise typer.Exit(1)
     content = result.get("result", {}).get("content", [])
     if not content:
-        typer.echo("No output from tool", err=True)
-        raise typer.Exit(1)
+        typer.echo("(no output)")
+        return
     for item in content:
         typer.echo(item.get("text", str(item)))
 
@@ -357,6 +360,9 @@ def mcp_status() -> None:
     import json
 
     result = _mcp_call("dimos/status")
+    if "error" in result:
+        typer.echo(f"Error: {result['error'].get('message', 'unknown error')}", err=True)
+        raise typer.Exit(1)
     data = result.get("result", {})
     typer.echo(json.dumps(data, indent=2))
 
@@ -367,25 +373,11 @@ def mcp_modules() -> None:
     import json
 
     result = _mcp_call("dimos/list_modules")
-    data = result.get("result", {})
-    typer.echo(json.dumps(data, indent=2))
-
-
-@mcp_app.command("module-io")
-def mcp_module_io(
-    port: int = typer.Option(9990, "--port", "-p", help="MCP server port"),
-) -> None:
-    """Show detailed module IO: skills with parameters and descriptions."""
-    result = _mcp_call("dimos/module_io", port=port)
     if "error" in result:
-        typer.echo(f"Error: {result['error'].get('message', 'unknown')}", err=True)
+        typer.echo(f"Error: {result['error'].get('message', 'unknown error')}", err=True)
         raise typer.Exit(1)
     data = result.get("result", {})
-    for mod_name, mod_info in data.get("modules", {}).items():
-        typer.echo(f"\n{mod_name} ({mod_info['skill_count']} skills):")
-        for skill in mod_info.get("skills", []):
-            desc = f" — {skill['description']}" if skill.get("description") else ""
-            typer.echo(f"  {skill['name']}{desc}")
+    typer.echo(json.dumps(data, indent=2))
 
 
 @main.command("agent-send")

From 7df0e64c389db0fe6b120bbf8793c39f41027d85 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Sat, 7 Mar 2026 11:35:42 +0000
Subject: [PATCH 26/37] cleanup: remove agent_context.py, fix final greptile
 nits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Delete agent_context.py entirely — runtime info is available via
  `dimos status` and `dimos mcp status`, no need for a separate file
- Remove generate_context() calls from CLI (daemon + foreground paths)
- Fix non-deterministic module list in dimos/status (set → dict.fromkeys)
- Remove unused heartbeat: Out[str] from StressTestModule
- Remove dead list literal from e2e_mcp_killtest.py
---
 dimos/agents/mcp/mcp_server.py         |   2 +-
 dimos/core/agent_context.py            | 112 -------------------------
 dimos/core/tests/stress_test_module.py |   3 -
 dimos/robot/cli/dimos.py               |   8 --
 4 files changed, 1 insertion(+), 124 deletions(-)
 delete mode 100644 dimos/core/agent_context.py

diff --git a/dimos/agents/mcp/mcp_server.py b/dimos/agents/mcp/mcp_server.py
index e89312f5be..df92454a9c 100644
--- a/dimos/agents/mcp/mcp_server.py
+++ b/dimos/agents/mcp/mcp_server.py
@@ -122,7 +122,7 @@ def _handle_dimos_status(req_id: Any, skills: list[SkillInfo]) -> dict[str, Any]
         req_id,
         {
             "pid": os.getpid(),
-            "modules": list({s.class_name for s in skills}),
+            "modules": list(dict.fromkeys(s.class_name for s in skills)),
             "skills": [s.func_name for s in skills],
             "skill_count": len(skills),
         },
diff --git a/dimos/core/agent_context.py b/dimos/core/agent_context.py
deleted file mode 100644
index 3cd69a8da2..0000000000
--- a/dimos/core/agent_context.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Copyright 2025-2026 Dimensional Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Generate agent context file for a running DimOS instance (DIM-687).
-
-Writes a markdown file with module diagram, available skills, CLI commands,
-and MCP endpoint info — everything an AI agent needs to interact with the
-running system.
-"""
-
-from __future__ import annotations
-
-import os
-from typing import TYPE_CHECKING
-
-from dimos.utils.logging_config import setup_logger
-
-if TYPE_CHECKING:
-    from pathlib import Path
-
-    from dimos.core.module_coordinator import ModuleCoordinator
-
-logger = setup_logger()
-
-
-def generate_context(
-    coordinator: ModuleCoordinator,
-    blueprint_name: str,
-    run_id: str,
-    log_dir: Path,
-    mcp_port: int = 9990,
-) -> Path:
-    """Generate a context.md file for agents in the log directory.
-
-    Returns the path to the generated file.
-    """
-    log_dir.mkdir(parents=True, exist_ok=True)
-    context_path = log_dir / "context.md"
-
-    lines = [
-        f"# DimOS Agent Context — {blueprint_name}",
-        "",
-        f"**Run ID:** {run_id}",
-        f"**PID:** {os.getpid()}",
-        f"**Blueprint:** {blueprint_name}",
-        "",
-        "## MCP Endpoint",
-        "",
-        f"- URL: `http://localhost:{mcp_port}/mcp`",
-        "- Protocol: JSON-RPC 2.0",
-        "- Methods: `initialize`, `tools/list`, `tools/call`, `dimos/status`, `dimos/list_modules`",
-        "",
-        "## CLI Commands",
-        "",
-        "```",
-        "dimos status              # show running instance",
-        "dimos stop                # stop the instance",
-        "dimos stop --force        # force kill (SIGKILL)",
-        "dimos mcp list-tools      # list available MCP tools",
-        "dimos mcp call <tool>     # call a tool",
-        "dimos mcp status          # module/skill info via MCP",
-        "dimos mcp modules         # module-skill mapping",
-        'dimos agent-send "msg"   # send message to running agent',
-        "```",
-        "",
-    ]
-
-    # Module info
-    lines.append("## Deployed Modules")
-    lines.append("")
-    for w in coordinator.workers:
-        for name in w.module_names:
-            lines.append(f"- {name} (worker {w.worker_id}, pid {w.pid})")
-    lines.append("")
-
-    # Skills info (via MCP)
-    lines.append("## Available Skills (MCP Tools)")
-    lines.append("")
-    lines.append("Use `dimos mcp list-tools` for full schema, or call via:")
-    lines.append("```")
-    curl_example = (
-        f"curl -s localhost:{mcp_port}/mcp "
-        '-H "Content-Type: application/json" '
-        '-d \'{"jsonrpc":"2.0","id":1,"method":"tools/list"}\''
-    )
-    lines.append(curl_example)
-    lines.append("```")
-    lines.append("")
-
-    # Log info
-    lines.append("## Logs")
-    lines.append("")
-    lines.append(f"- Directory: `{log_dir}`")
-    lines.append(f"- Structured log: `{log_dir}/main.jsonl`")
-    lines.append("- Format: structlog JSON, one line per event")
-    lines.append('- Filter by module: `grep \'"logger":"module_name"\' main.jsonl`')
-    lines.append("")
-
-    context_path.write_text("\n".join(lines))
-    logger.info("Agent context written", path=str(context_path))
-    return context_path
diff --git a/dimos/core/tests/stress_test_module.py b/dimos/core/tests/stress_test_module.py
index da48c5a2ca..1fa5b97013 100644
--- a/dimos/core/tests/stress_test_module.py
+++ b/dimos/core/tests/stress_test_module.py
@@ -24,14 +24,11 @@
 
 from dimos.agents.annotation import skill
 from dimos.core.module import Module
-from dimos.core.stream import Out
 
 
 class StressTestModule(Module):
     """Minimal module exposing test skills via MCP."""
 
-    heartbeat: Out[str]
-
     @skill
     def echo(self, message: str) -> str:
         """Echo back the given message. Used for latency and connectivity tests."""
diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index c69ccdf2f2..025a4d5b06 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -188,10 +188,6 @@ def run(
         )
         entry.save()
 
-        from dimos.core.agent_context import generate_context
-
-        generate_context(coordinator, blueprint_name, run_id, log_dir)
-
         install_signal_handlers(entry, coordinator)
         coordinator.loop()
     else:
@@ -206,10 +202,6 @@ def run(
         )
         entry.save()
 
-        from dimos.core.agent_context import generate_context
-
-        generate_context(coordinator, blueprint_name, run_id, log_dir)
-
         try:
             coordinator.loop()
         finally:

From baa70362175271e0e6f797d54caee26336359a75 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Sat, 7 Mar 2026 11:54:18 +0000
Subject: [PATCH 27/37] fix: address latest greptile review round
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove stale module-io step from e2e_devex_test (command was removed)
- Fix step numbering in devex test (7-10 sequential, no gaps)
- Fix double remove() on registry entry — let fixture handle cleanup
- Remove misleading 'non-default to avoid conflicts' port comment
---
 dimos/core/test_e2e_mcp_stress.py  | 10 ++++------
 dimos/core/tests/e2e_devex_test.py | 26 +++-----------------------
 2 files changed, 7 insertions(+), 29 deletions(-)

diff --git a/dimos/core/test_e2e_mcp_stress.py b/dimos/core/test_e2e_mcp_stress.py
index 36141bafe3..522c43a208 100644
--- a/dimos/core/test_e2e_mcp_stress.py
+++ b/dimos/core/test_e2e_mcp_stress.py
@@ -34,7 +34,7 @@
 from dimos.core.tests.stress_test_module import StressTestModule
 from dimos.robot.cli.dimos import main
 
-MCP_PORT = 9990  # non-default to avoid conflicts
+MCP_PORT = 9990
 
 
 # ---------------------------------------------------------------------------
@@ -245,12 +245,10 @@ def test_registry_cleanup_after_stop(self, mcp_entry):
         """Registry entry should be removable after stop."""
         runs = list_runs(alive_only=True)
         assert len(runs) == 1
+        assert mcp_entry.run_id in [r.run_id for r in runs]
 
-        # Remove registry entry
-        mcp_entry.remove()
-
-        runs = list_runs(alive_only=True)
-        assert len(runs) == 0
+        # Fixture teardown will call remove() — just verify entry exists and is valid
+        assert mcp_entry.pid > 0
 
     def test_stale_cleanup_after_crash(self):
         """Stale entries from crashed processes should be cleaned up."""
diff --git a/dimos/core/tests/e2e_devex_test.py b/dimos/core/tests/e2e_devex_test.py
index 8edea3a7d8..04395acef9 100644
--- a/dimos/core/tests/e2e_devex_test.py
+++ b/dimos/core/tests/e2e_devex_test.py
@@ -214,29 +214,9 @@ def main() -> None:
         failures += 1
 
     # ---------------------------------------------------------------
-    # Step 7: dimos mcp module-io
+    # Step 7: dimos agent-send "hello"
     # ---------------------------------------------------------------
-    section("Step 7: dimos mcp module-io")
-    result = run_dimos("mcp", "module-io")
-    if result.returncode == 0:
-        try:
-            data = json.loads(result.stdout)
-            for mod_name, mod_info in data.get("modules", {}).items():
-                skill_names = [s["name"] for s in mod_info.get("skills", [])]
-                p(
-                    f"Module {mod_name}: {mod_info.get('skill_count', '?')} skills ({', '.join(skill_names)})"
-                )
-        except json.JSONDecodeError:
-            p(f"Non-JSON output: {result.stdout[:100]}", ok=False)
-            failures += 1
-    else:
-        p(f"mcp module-io failed (exit={result.returncode})", ok=False)
-        failures += 1
-
-    # ---------------------------------------------------------------
-    # Step 8: dimos agent-send "hello"
-    # ---------------------------------------------------------------
-    section("Step 8: dimos agent-send 'what tools do you have?'")
+    section("Step 7: dimos agent-send 'what tools do you have?'")
     result = run_dimos("agent-send", "what tools do you have?")
     if result.returncode == 0:
         p(f"agent-send response: {result.stdout.strip()[:200]}")
@@ -245,7 +225,7 @@ def main() -> None:
         failures += 1
 
     # ---------------------------------------------------------------
-    # Step 9: Check logs
+    # Step 8: Check logs
     # ---------------------------------------------------------------
     section("Step 8: Check per-run logs")
     log_base = os.path.expanduser("~/.local/state/dimos/logs")

From 0237032ce49b8ae7c00659fbd1c690e1f32f4591 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Sat, 7 Mar 2026 12:13:46 +0000
Subject: [PATCH 28/37] fix: resolve mypy errors in worker.py and
 stress_test_module

- worker.py: use typed local variable for process.pid instead of
  direct return (fixes no-any-return)
- stress_test_module.py: add return type annotation to start()
---
 dimos/core/tests/stress_test_module.py | 2 +-
 dimos/core/worker.py                   | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/dimos/core/tests/stress_test_module.py b/dimos/core/tests/stress_test_module.py
index 1fa5b97013..b8ad6d32b8 100644
--- a/dimos/core/tests/stress_test_module.py
+++ b/dimos/core/tests/stress_test_module.py
@@ -52,5 +52,5 @@ def info(self) -> str:
 
         return f"pid={os.getpid()}, module={self.__class__.__name__}"
 
-    def start(self):
+    def start(self) -> None:
         super().start()
diff --git a/dimos/core/worker.py b/dimos/core/worker.py
index 7ae24b16ab..ec86295133 100644
--- a/dimos/core/worker.py
+++ b/dimos/core/worker.py
@@ -161,12 +161,14 @@ def pid(self) -> int | None:
             return None
         try:
             if self._process.is_alive():
-                return self._process.pid  # type: ignore[return-value]
+                p: int | None = self._process.pid
+                return p
         except AssertionError:
             # After daemonize() we are no longer the parent process;
             # is_alive() asserts _parent_pid == os.getpid().  Fall back to
             # the stored pid attribute which is still valid.
-            return self._process.pid  # type: ignore[return-value]
+            p = self._process.pid
+            return p
         return None
 
     @property

From 3ef29ff87fa0faad6783a398e44e07e186005baa Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Sat, 7 Mar 2026 12:27:35 +0000
Subject: [PATCH 29/37] =?UTF-8?q?perf:=20class-scoped=20MCP=20fixtures,=20?=
 =?UTF-8?q?125s=20=E2=86=92=2051s=20test=20runtime?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Make mcp_shared fixture class-scoped: blueprint starts once per
  class instead of once per test (~4s setup saved per test)
- Move no-server tests (dead_after_stop, cli_no_server_error,
  agent_send_cli_no_server) to dedicated TestMCPNoServer class to
  avoid port conflict with shared fixture
- Add full type annotations to all fixtures and helpers
- Add docstring explaining performance rationale
- Remove unused mcp_blueprint fixture (replaced by mcp_shared)
---
 dimos/core/test_e2e_mcp_stress.py | 318 +++++++++++++++---------------
 1 file changed, 158 insertions(+), 160 deletions(-)

diff --git a/dimos/core/test_e2e_mcp_stress.py b/dimos/core/test_e2e_mcp_stress.py
index 522c43a208..69b629c68a 100644
--- a/dimos/core/test_e2e_mcp_stress.py
+++ b/dimos/core/test_e2e_mcp_stress.py
@@ -12,12 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+"""MCP server e2e stress tests.
+
+These tests build a real StressTestModule + McpServer blueprint and exercise
+every CLI command and JSON-RPC method.  They are marked ``slow`` so local
+``pytest`` skips them by default (CI runs them).
+
+**Performance note:** Fixtures that create a running blueprint are
+*class-scoped* so the ~4 s startup cost is paid once per class instead of
+once per test.  Only classes that explicitly manage their own lifecycle
+(Recovery, RapidRestart) use per-test setup.
+"""
+
 from __future__ import annotations
 
 from datetime import datetime, timezone
 import json
 import os
 import time
+from typing import TYPE_CHECKING
 
 import pytest
 import requests
@@ -34,6 +47,11 @@
 from dimos.core.tests.stress_test_module import StressTestModule
 from dimos.robot.cli.dimos import main
 
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+    from dimos.core.module_coordinator import ModuleCoordinator
+
 MCP_PORT = 9990
 
 
@@ -43,37 +61,47 @@
 
 
 @pytest.fixture(autouse=True)
-def _ci_env(monkeypatch):
+def _ci_env(monkeypatch: pytest.MonkeyPatch) -> None:
     monkeypatch.setenv("CI", "1")
 
 
 @pytest.fixture(autouse=True)
-def _clean_registry(tmp_path, monkeypatch):
+def _clean_registry(
+    tmp_path: object, monkeypatch: pytest.MonkeyPatch
+) -> Generator[object, None, None]:
+    from pathlib import Path
+
     import dimos.core.run_registry as _reg
 
-    test_dir = tmp_path / "runs"
+    test_dir = Path(str(tmp_path)) / "runs"
     test_dir.mkdir()
     monkeypatch.setattr(_reg, "REGISTRY_DIR", test_dir)
     yield test_dir
 
 
-@pytest.fixture()
-def mcp_blueprint():
-    """Build a StressTestModule + McpServer blueprint. Stops on teardown."""
+@pytest.fixture(scope="class")
+def mcp_shared(request: pytest.FixtureRequest) -> Generator[ModuleCoordinator, None, None]:
+    """Build a shared StressTestModule + McpServer.  Class-scoped -- started
+    once, torn down after every test in the class finishes.  Use for
+    read-only tests that don't stop/restart the server."""
+    os.environ["CI"] = "1"
     global_config.update(viewer_backend="none", n_workers=1)
     bp = autoconnect(
         StressTestModule.blueprint(),
         McpServer.blueprint(),
     )
     coord = bp.build()
+    assert _wait_for_mcp(), "MCP server did not start within timeout"
     yield coord
     coord.stop()
 
 
 @pytest.fixture()
-def mcp_entry(mcp_blueprint, tmp_path):
+def mcp_entry(mcp_shared: ModuleCoordinator, tmp_path: object) -> Generator[RunEntry, None, None]:
     """Create registry entry for the running blueprint."""
-    log_dir = tmp_path / "stress-logs"
+    from pathlib import Path
+
+    log_dir = Path(str(tmp_path)) / "stress-logs"
     entry = RunEntry(
         run_id=f"stress-{datetime.now(timezone.utc).strftime('%H%M%S%f')}",
         pid=os.getpid(),
@@ -88,14 +116,16 @@ def mcp_entry(mcp_blueprint, tmp_path):
     entry.remove()
 
 
-def _mcp_call(method: str, params: dict | None = None, port: int = MCP_PORT) -> dict:
+def _mcp_call(
+    method: str, params: dict[str, object] | None = None, port: int = MCP_PORT
+) -> dict[str, object]:
     """Send a JSON-RPC request to MCP server."""
-    payload: dict = {"jsonrpc": "2.0", "id": 1, "method": method}
+    payload: dict[str, object] = {"jsonrpc": "2.0", "id": 1, "method": method}
     if params:
         payload["params"] = params
     resp = requests.post(f"http://localhost:{port}/mcp", json=payload, timeout=10)
     resp.raise_for_status()
-    return resp.json()
+    return resp.json()  # type: ignore[no-any-return]
 
 
 def _wait_for_mcp(port: int = MCP_PORT, timeout: float = 10.0) -> bool:
@@ -133,25 +163,21 @@ def _wait_for_mcp_down(port: int = MCP_PORT, timeout: float = 10.0) -> bool:
 
 
 # ---------------------------------------------------------------------------
-# Tests
+# Tests -- read-only against a shared MCP server
 # ---------------------------------------------------------------------------
 
 
 @pytest.mark.slow
 class TestMCPLifecycle:
-    """MCP server lifecycle: start → respond → stop → dead."""
+    """MCP server lifecycle: start -> respond -> stop -> dead."""
 
-    def test_mcp_responds_after_build(self, mcp_blueprint):
+    def test_mcp_responds_after_build(self, mcp_shared: ModuleCoordinator) -> None:
         """After blueprint.build(), MCP should accept requests."""
-        assert _wait_for_mcp(), "MCP server did not start"
-
         result = _mcp_call("initialize")
         assert result["result"]["serverInfo"]["name"] == "dimensional"
 
-    def test_tools_list_includes_stress_skills(self, mcp_blueprint):
+    def test_tools_list_includes_stress_skills(self, mcp_shared: ModuleCoordinator) -> None:
         """tools/list should include echo, ping, slow, info from StressTestModule."""
-        assert _wait_for_mcp()
-
         result = _mcp_call("tools/list")
         tool_names = {t["name"] for t in result["result"]["tools"]}
         assert "echo" in tool_names
@@ -159,10 +185,8 @@ def test_tools_list_includes_stress_skills(self, mcp_blueprint):
         assert "slow" in tool_names
         assert "info" in tool_names
 
-    def test_echo_tool(self, mcp_blueprint):
-        """Call echo tool — should return the message."""
-        assert _wait_for_mcp()
-
+    def test_echo_tool(self, mcp_shared: ModuleCoordinator) -> None:
+        """Call echo tool -- should return the message."""
         result = _mcp_call(
             "tools/call",
             {
@@ -173,26 +197,20 @@ def test_echo_tool(self, mcp_blueprint):
         text = result["result"]["content"][0]["text"]
         assert text == "hello from stress test"
 
-    def test_ping_tool(self, mcp_blueprint):
-        """Call ping tool — should return 'pong'."""
-        assert _wait_for_mcp()
-
+    def test_ping_tool(self, mcp_shared: ModuleCoordinator) -> None:
+        """Call ping tool -- should return 'pong'."""
         result = _mcp_call("tools/call", {"name": "ping", "arguments": {}})
         text = result["result"]["content"][0]["text"]
         assert text == "pong"
 
-    def test_info_tool_returns_pid(self, mcp_blueprint):
+    def test_info_tool_returns_pid(self, mcp_shared: ModuleCoordinator) -> None:
         """info tool should return process info."""
-        assert _wait_for_mcp()
-
         result = _mcp_call("tools/call", {"name": "info", "arguments": {}})
         text = result["result"]["content"][0]["text"]
         assert "pid=" in text
 
-    def test_dimos_status_method(self, mcp_blueprint):
+    def test_dimos_status_method(self, mcp_shared: ModuleCoordinator) -> None:
         """dimos/status should return module and skill info."""
-        assert _wait_for_mcp()
-
         result = _mcp_call("dimos/status")
         data = result["result"]
         assert "pid" in data
@@ -200,83 +218,20 @@ def test_dimos_status_method(self, mcp_blueprint):
         assert "skills" in data
         assert "StressTestModule" in data["modules"]
 
-    def test_dimos_list_modules_method(self, mcp_blueprint):
+    def test_dimos_list_modules_method(self, mcp_shared: ModuleCoordinator) -> None:
         """dimos/list_modules should group skills by module."""
-        assert _wait_for_mcp()
-
         result = _mcp_call("dimos/list_modules")
         modules = result["result"]["modules"]
         assert "StressTestModule" in modules
         assert "echo" in modules["StressTestModule"]
 
-    def test_mcp_dead_after_stop(self):
-        """After coordinator.stop(), MCP should stop responding."""
-        global_config.update(viewer_backend="none", n_workers=1)
-        bp = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
-        coord = bp.build()
-        assert _wait_for_mcp(), "MCP server did not start"
-
-        coord.stop()
-        assert _wait_for_mcp_down(), "MCP server did not stop"
-
-
-@pytest.mark.slow
-class TestDaemonMCPRecovery:
-    """Test MCP recovery after daemon crashes and restarts."""
-
-    def test_restart_after_clean_stop(self):
-        """Stop then start again — MCP should come back."""
-        global_config.update(viewer_backend="none", n_workers=1)
-
-        # First run
-        bp1 = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
-        coord1 = bp1.build()
-        assert _wait_for_mcp(), "First MCP start failed"
-        coord1.stop()
-        assert _wait_for_mcp_down(), "MCP didn't stop"
-
-        # Second run — should work without port conflicts
-        bp2 = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
-        coord2 = bp2.build()
-        assert _wait_for_mcp(), "Second MCP start failed (port conflict?)"
-        coord2.stop()
-
-    def test_registry_cleanup_after_stop(self, mcp_entry):
-        """Registry entry should be removable after stop."""
-        runs = list_runs(alive_only=True)
-        assert len(runs) == 1
-        assert mcp_entry.run_id in [r.run_id for r in runs]
-
-        # Fixture teardown will call remove() — just verify entry exists and is valid
-        assert mcp_entry.pid > 0
-
-    def test_stale_cleanup_after_crash(self):
-        """Stale entries from crashed processes should be cleaned up."""
-        stale = RunEntry(
-            run_id="crashed-mcp-test",
-            pid=99999999,
-            blueprint="stress-test",
-            started_at=datetime.now(timezone.utc).isoformat(),
-            log_dir="/tmp/ghost",
-            cli_args=[],
-            config_overrides={},
-        )
-        stale.save()
-        assert len(list_runs(alive_only=False)) == 1
-
-        removed = cleanup_stale()
-        assert removed == 1
-        assert len(list_runs(alive_only=False)) == 0
-
 
 @pytest.mark.slow
 class TestMCPCLICommands:
     """Test dimos mcp CLI commands against real MCP server."""
 
-    def test_cli_list_tools(self, mcp_blueprint):
+    def test_cli_list_tools(self, mcp_shared: ModuleCoordinator) -> None:
         """dimos mcp list-tools should output JSON with tools."""
-        assert _wait_for_mcp()
-
         result = CliRunner().invoke(main, ["mcp", "list-tools"])
         assert result.exit_code == 0
         tools = json.loads(result.output)
@@ -284,48 +239,34 @@ def test_cli_list_tools(self, mcp_blueprint):
         assert "echo" in tool_names
         assert "ping" in tool_names
 
-    def test_cli_call_echo(self, mcp_blueprint):
+    def test_cli_call_echo(self, mcp_shared: ModuleCoordinator) -> None:
         """dimos mcp call echo --arg message=hello should return hello."""
-        assert _wait_for_mcp()
-
         result = CliRunner().invoke(main, ["mcp", "call", "echo", "--arg", "message=hello"])
         assert result.exit_code == 0
         assert "hello" in result.output
 
-    def test_cli_mcp_status(self, mcp_blueprint):
+    def test_cli_mcp_status(self, mcp_shared: ModuleCoordinator) -> None:
         """dimos mcp status should return JSON with modules."""
-        assert _wait_for_mcp()
-
         result = CliRunner().invoke(main, ["mcp", "status"])
         assert result.exit_code == 0
         data = json.loads(result.output)
         assert "StressTestModule" in data["modules"]
 
-    def test_cli_mcp_modules(self, mcp_blueprint):
+    def test_cli_mcp_modules(self, mcp_shared: ModuleCoordinator) -> None:
         """dimos mcp modules should show module-skill mapping."""
-        assert _wait_for_mcp()
-
         result = CliRunner().invoke(main, ["mcp", "modules"])
         assert result.exit_code == 0
         data = json.loads(result.output)
         assert "StressTestModule" in data["modules"]
         assert "echo" in data["modules"]["StressTestModule"]
 
-    def test_cli_no_server_error(self):
-        """dimos mcp list-tools with no server should exit with error."""
-        result = CliRunner().invoke(main, ["mcp", "list-tools"])
-        assert result.exit_code == 1
-        assert "no running" in result.output.lower() or "error" in result.output.lower()
-
 
 @pytest.mark.slow
 class TestMCPErrorHandling:
     """Test MCP error handling and edge cases."""
 
-    def test_call_nonexistent_tool(self, mcp_blueprint):
+    def test_call_nonexistent_tool(self, mcp_shared: ModuleCoordinator) -> None:
         """Calling a tool that doesn't exist should return an error message."""
-        assert _wait_for_mcp()
-
         result = _mcp_call(
             "tools/call",
             {
@@ -336,11 +277,8 @@ def test_call_nonexistent_tool(self, mcp_blueprint):
         text = result["result"]["content"][0]["text"]
         assert "not found" in text.lower()
 
-    def test_call_tool_wrong_args(self, mcp_blueprint):
+    def test_call_tool_wrong_args(self, mcp_shared: ModuleCoordinator) -> None:
         """Calling a tool with wrong argument types should still return."""
-        assert _wait_for_mcp()
-
-        # echo expects string, send nothing — should still work or error gracefully
         result = _mcp_call(
             "tools/call",
             {
@@ -348,21 +286,16 @@ def test_call_tool_wrong_args(self, mcp_blueprint):
                 "arguments": {},
             },
         )
-        # Should not crash — either returns error or empty result
         assert "result" in result or "error" in result
 
-    def test_unknown_jsonrpc_method(self, mcp_blueprint):
+    def test_unknown_jsonrpc_method(self, mcp_shared: ModuleCoordinator) -> None:
         """Unknown JSON-RPC method should return error."""
-        assert _wait_for_mcp()
-
         result = _mcp_call("nonexistent/method")
         assert "error" in result
         assert result["error"]["code"] == -32601
 
-    def test_mcp_handles_malformed_json(self, mcp_blueprint):
+    def test_mcp_handles_malformed_json(self, mcp_shared: ModuleCoordinator) -> None:
         """MCP should handle malformed JSON gracefully."""
-        assert _wait_for_mcp()
-
         resp = requests.post(
             f"http://localhost:{MCP_PORT}/mcp",
             data=b"not json{{{",
@@ -371,10 +304,8 @@ def test_mcp_handles_malformed_json(self, mcp_blueprint):
         )
         assert resp.status_code == 400
 
-    def test_rapid_tool_calls(self, mcp_blueprint):
-        """Fire 20 rapid echo calls — all should succeed."""
-        assert _wait_for_mcp()
-
+    def test_rapid_tool_calls(self, mcp_shared: ModuleCoordinator) -> None:
+        """Fire 20 rapid echo calls -- all should succeed."""
         for i in range(20):
             result = _mcp_call(
                 "tools/call",
@@ -386,21 +317,96 @@ def test_rapid_tool_calls(self, mcp_blueprint):
             text = result["result"]["content"][0]["text"]
             assert text == f"rapid-{i}", f"Mismatch on call {i}"
 
-    def test_cli_call_tool_wrong_arg_format(self, mcp_blueprint):
+    def test_cli_call_tool_wrong_arg_format(self, mcp_shared: ModuleCoordinator) -> None:
         """dimos mcp call with bad --arg format should error."""
-        assert _wait_for_mcp()
-
         result = CliRunner().invoke(main, ["mcp", "call", "echo", "--arg", "no_equals_sign"])
         assert result.exit_code == 1
         assert "key=value" in result.output
 
 
+@pytest.mark.slow
+class TestAgentSend:
+    """Test dimos agent-send CLI and MCP method."""
+
+    def test_agent_send_via_mcp(self, mcp_shared: ModuleCoordinator) -> None:
+        """dimos/agent_send should route message via LCM."""
+        result = _mcp_call("dimos/agent_send", {"message": "hello agent"})
+        assert "result" in result
+        text = result["result"]["content"][0]["text"]
+        assert "hello agent" in text
+
+    def test_agent_send_empty_message(self, mcp_shared: ModuleCoordinator) -> None:
+        """Empty message should return error."""
+        result = _mcp_call("dimos/agent_send", {"message": ""})
+        assert "error" in result
+
+    def test_agent_send_cli(self, mcp_shared: ModuleCoordinator) -> None:
+        """dimos agent-send 'hello' should work."""
+        result = CliRunner().invoke(main, ["agent-send", "hello from CLI"])
+        assert result.exit_code == 0
+        assert "hello from CLI" in result.output
+
+
+# ---------------------------------------------------------------------------
+# Tests -- lifecycle management (own setup/teardown per test)
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.slow
+class TestDaemonMCPRecovery:
+    """Test MCP recovery after daemon crashes and restarts."""
+
+    def test_restart_after_clean_stop(self) -> None:
+        """Stop then start again -- MCP should come back."""
+        global_config.update(viewer_backend="none", n_workers=1)
+
+        # First run
+        bp1 = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
+        coord1 = bp1.build()
+        assert _wait_for_mcp(), "First MCP start failed"
+        coord1.stop()
+        assert _wait_for_mcp_down(), "MCP didn't stop"
+
+        # Second run -- should work without port conflicts
+        bp2 = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
+        coord2 = bp2.build()
+        assert _wait_for_mcp(), "Second MCP start failed (port conflict?)"
+        coord2.stop()
+
+    def test_registry_cleanup_after_stop(self, mcp_entry: RunEntry) -> None:
+        """Registry entry should be removable after stop."""
+        runs = list_runs(alive_only=True)
+        assert len(runs) == 1
+        assert mcp_entry.run_id in [r.run_id for r in runs]
+
+        # Fixture teardown will call remove() -- just verify entry exists and is valid
+        assert mcp_entry.pid > 0
+
+    def test_stale_cleanup_after_crash(self) -> None:
+        """Stale entries from crashed processes should be cleaned up."""
+        stale = RunEntry(
+            run_id="crashed-mcp-test",
+            pid=99999999,
+            blueprint="stress-test",
+            started_at=datetime.now(timezone.utc).isoformat(),
+            log_dir="/tmp/ghost",
+            cli_args=[],
+            config_overrides={},
+        )
+        stale.save()
+        assert len(list_runs(alive_only=False)) == 1
+
+        removed = cleanup_stale()
+        assert removed == 1
+        assert len(list_runs(alive_only=False)) == 0
+
+
 @pytest.mark.slow
 class TestMCPRapidRestart:
     """Test rapid stop/start cycles."""
 
-    def test_three_restart_cycles(self):
-        """Start → stop → start 3 times — no port conflicts."""
+    def test_three_restart_cycles(self) -> None:
+        """Start -> stop -> start 3 times -- no port conflicts."""
         global_config.update(viewer_backend="none", n_workers=1)
 
         for cycle in range(3):
@@ -416,34 +422,26 @@ def test_three_restart_cycles(self):
 
 
 @pytest.mark.slow
-class TestAgentSend:
-    """Test dimos agent-send CLI and MCP method."""
-
-    def test_agent_send_via_mcp(self, mcp_blueprint):
-        """dimos/agent_send should route message via LCM."""
-        assert _wait_for_mcp()
-
-        result = _mcp_call("dimos/agent_send", {"message": "hello agent"})
-        assert "result" in result
-        text = result["result"]["content"][0]["text"]
-        assert "hello agent" in text
+class TestMCPNoServer:
+    """Tests that require NO MCP server running."""
 
-    def test_agent_send_empty_message(self, mcp_blueprint):
-        """Empty message should return error."""
-        assert _wait_for_mcp()
-
-        result = _mcp_call("dimos/agent_send", {"message": ""})
-        assert "error" in result
+    def test_mcp_dead_after_stop(self) -> None:
+        """After coordinator.stop(), MCP should stop responding."""
+        global_config.update(viewer_backend="none", n_workers=1)
+        bp = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
+        coord = bp.build()
+        assert _wait_for_mcp(), "MCP server did not start"
 
-    def test_agent_send_cli(self, mcp_blueprint):
-        """dimos agent-send 'hello' should work."""
-        assert _wait_for_mcp()
+        coord.stop()
+        assert _wait_for_mcp_down(), "MCP server did not stop"
 
-        result = CliRunner().invoke(main, ["agent-send", "hello from CLI"])
-        assert result.exit_code == 0
-        assert "hello from CLI" in result.output
+    def test_cli_no_server_error(self) -> None:
+        """dimos mcp list-tools with no server should exit with error."""
+        result = CliRunner().invoke(main, ["mcp", "list-tools"])
+        assert result.exit_code == 1
+        assert "no running" in result.output.lower() or "error" in result.output.lower()
 
-    def test_agent_send_cli_no_server(self):
+    def test_agent_send_cli_no_server(self) -> None:
         """dimos agent-send with no server should exit with error."""
         result = CliRunner().invoke(main, ["agent-send", "hello"])
         assert result.exit_code == 1

From 8c9ebd709b419e97a2c00226f4710b6207166e59 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Sat, 7 Mar 2026 12:30:51 +0000
Subject: [PATCH 30/37] fix: resolve remaining CI failures (mypy +
 all_blueprints)

- Fix mypy errors in standalone test scripts (e2e_devex_test.py,
  e2e_mcp_killtest.py): typed CompletedProcess, multiprocessing.Event,
  dict params, assert pid not None before os.kill
- Regenerate all_blueprints.py (stress-test entry now alphabetically sorted)
---
 dimos/core/tests/e2e_devex_test.py   |  2 +-
 dimos/core/tests/e2e_mcp_killtest.py | 10 +++++++---
 dimos/robot/all_blueprints.py        |  2 +-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/dimos/core/tests/e2e_devex_test.py b/dimos/core/tests/e2e_devex_test.py
index 04395acef9..b9ac1393d7 100644
--- a/dimos/core/tests/e2e_devex_test.py
+++ b/dimos/core/tests/e2e_devex_test.py
@@ -50,7 +50,7 @@
     VENV_PYTHON = sys.executable
 
 
-def run_dimos(*args: str, timeout: float = 30) -> subprocess.CompletedProcess:
+def run_dimos(*args: str, timeout: float = 30) -> subprocess.CompletedProcess[str]:
     """Run a dimos CLI command."""
     cmd = [VENV_PYTHON, "-m", "dimos.robot.cli.dimos", *args]
     env = {**os.environ, "CI": "1", "PYTHONPATH": REPO_DIR}
diff --git a/dimos/core/tests/e2e_mcp_killtest.py b/dimos/core/tests/e2e_mcp_killtest.py
index 431a84a0c9..33330f540c 100644
--- a/dimos/core/tests/e2e_mcp_killtest.py
+++ b/dimos/core/tests/e2e_mcp_killtest.py
@@ -34,10 +34,12 @@
 
 import argparse
 import multiprocessing
+import multiprocessing.synchronize
 import os
 import signal
 import sys
 import time
+from typing import Any
 
 import requests
 
@@ -45,8 +47,8 @@
 MCP_URL = f"http://localhost:{MCP_PORT}/mcp"
 
 
-def mcp_call(method: str, params: dict | None = None) -> dict:
-    payload: dict = {"jsonrpc": "2.0", "id": 1, "method": method}
+def mcp_call(method: str, params: dict[str, object] | None = None) -> Any:
+    payload: dict[str, object] = {"jsonrpc": "2.0", "id": 1, "method": method}
     if params:
         payload["params"] = params
     resp = requests.post(MCP_URL, json=payload, timeout=10)
@@ -86,7 +88,7 @@ def wait_for_mcp_down(timeout: float = 10.0) -> bool:
     return False
 
 
-def run_blueprint_in_process(ready_event: multiprocessing.Event) -> None:
+def run_blueprint_in_process(ready_event: multiprocessing.synchronize.Event) -> None:
     os.environ["CI"] = "1"
     from dimos.agents.mcp.mcp_server import McpServer
     from dimos.core.blueprints import autoconnect
@@ -224,6 +226,7 @@ def run_kill_restart_cycle(cycle: int) -> int:
 
     # KILL
     section(f"CYCLE {cycle}: SIGKILL \u2192 simulating crash")
+    assert proc.pid is not None
     os.kill(proc.pid, signal.SIGKILL)
     proc.join(10)
     p(f"Process killed (pid={proc.pid}, exitcode={proc.exitcode})")
@@ -260,6 +263,7 @@ def run_kill_restart_cycle(cycle: int) -> int:
 
     # Clean shutdown
     section(f"CYCLE {cycle}: Clean shutdown")
+    assert proc2.pid is not None
     os.kill(proc2.pid, signal.SIGTERM)
     proc2.join(10)
     if proc2.exitcode is not None:
diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py
index 4d82344579..2b0ca30b1f 100644
--- a/dimos/robot/all_blueprints.py
+++ b/dimos/robot/all_blueprints.py
@@ -59,6 +59,7 @@
     "mid360-fastlio-voxels-native": "dimos.hardware.sensors.lidar.fastlio2.fastlio_blueprints:mid360_fastlio_voxels_native",
     "phone-go2-teleop": "dimos.teleop.phone.blueprints:phone_go2_teleop",
     "simple-phone-teleop": "dimos.teleop.phone.blueprints:simple_phone_teleop",
+    "stress-test": "dimos.core.tests.stress_test_blueprint:stress_test",
     "uintree-g1-primitive-no-nav": "dimos.robot.unitree.g1.blueprints.primitive.uintree_g1_primitive_no_nav:uintree_g1_primitive_no_nav",
     "unitree-g1": "dimos.robot.unitree.g1.blueprints.perceptive.unitree_g1:unitree_g1",
     "unitree-g1-agentic": "dimos.robot.unitree.g1.blueprints.agentic.unitree_g1_agentic:unitree_g1_agentic",
@@ -80,7 +81,6 @@
     "unitree-go2-ros": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2_ros:unitree_go2_ros",
     "unitree-go2-spatial": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2_spatial:unitree_go2_spatial",
     "unitree-go2-temporal-memory": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_temporal_memory:unitree_go2_temporal_memory",
-    "stress-test": "dimos.core.tests.stress_test_blueprint:stress_test",
     "unitree-go2-vlm-stream-test": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2_vlm_stream_test:unitree_go2_vlm_stream_test",
     "xarm-perception": "dimos.manipulation.manipulation_blueprints:xarm_perception",
     "xarm-perception-agent": "dimos.manipulation.manipulation_blueprints:xarm_perception_agent",

From 315b2016bf6dc8783e47e60b9f717f6968897b0f Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Sun, 8 Mar 2026 15:14:02 +0000
Subject: [PATCH 31/37] refactor: McpAdapter class + convert custom methods to
 @skill tools

Address Paul review comments on PR #1451:

- New McpAdapter class replacing 3 duplicated _mcp_call implementations
- Convert dimos/status, list_modules, agent_send to @skill on McpServer
- CLI thin wrappers over McpAdapter, added --json-args flag
- worker.py: os.kill(pid, 0) for pid check
- Renamed test files (demo_ prefix for non-CI, integration for pytest)
- transport.start()/stop(), removed skill_count, requests in pyproject

29/29 tests pass locally (41s).
---
 dimos/agents/mcp/mcp_adapter.py               | 187 ++++++++++++++++++
 dimos/agents/mcp/mcp_server.py                | 133 +++++++------
 ..._mcp_stress.py => test_mcp_integration.py} | 140 ++++++-------
 .../{e2e_devex_test.py => demo_devex.py}      |   0
 ...e_mcp_killtest.py => demo_mcp_killtest.py} |   0
 dimos/core/tests/stress_test_blueprint.py     |   4 +-
 dimos/core/worker.py                          |  21 +-
 dimos/robot/all_blueprints.py                 |   2 +-
 dimos/robot/cli/dimos.py                      | 142 ++++++-------
 pyproject.toml                                |   1 +
 uv.lock                                       |   2 +
 11 files changed, 405 insertions(+), 227 deletions(-)
 create mode 100644 dimos/agents/mcp/mcp_adapter.py
 rename dimos/core/{test_e2e_mcp_stress.py => test_mcp_integration.py} (78%)
 rename dimos/core/tests/{e2e_devex_test.py => demo_devex.py} (100%)
 rename dimos/core/tests/{e2e_mcp_killtest.py => demo_mcp_killtest.py} (100%)

diff --git a/dimos/agents/mcp/mcp_adapter.py b/dimos/agents/mcp/mcp_adapter.py
new file mode 100644
index 0000000000..2a6cb7efcd
--- /dev/null
+++ b/dimos/agents/mcp/mcp_adapter.py
@@ -0,0 +1,187 @@
+# Copyright 2025-2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Lightweight MCP JSON-RPC client adapter.
+
+``McpAdapter`` provides a typed Python interface to a running MCP server.
+It is used by:
+
+* The ``dimos mcp`` CLI commands
+* Integration / e2e tests
+* Any code that needs to talk to a local MCP server
+
+Usage::
+
+    adapter = McpAdapter("http://localhost:9990/mcp")
+    adapter.wait_for_ready(timeout=10)
+    tools = adapter.list_tools()
+    result = adapter.call_tool("echo", {"message": "hi"})
+"""
+
+from __future__ import annotations
+
+import time
+from typing import Any
+import uuid
+
+import requests
+
+from dimos.utils.logging_config import setup_logger
+
+logger = setup_logger()
+
+DEFAULT_TIMEOUT = 30
+
+
+class McpError(Exception):
+    """Raised when the MCP server returns a JSON-RPC error."""
+
+    def __init__(self, message: str, code: int | None = None) -> None:
+        self.code = code
+        super().__init__(message)
+
+
+class McpAdapter:
+    """Thin JSON-RPC client for a running MCP server."""
+
+    def __init__(self, url: str | None = None, timeout: int = DEFAULT_TIMEOUT) -> None:
+        if url is None:
+            from dimos.core.global_config import global_config
+
+            url = f"http://localhost:{global_config.mcp_port}/mcp"
+        self.url = url
+        self.timeout = timeout
+
+    # ------------------------------------------------------------------
+    # Low-level JSON-RPC
+    # ------------------------------------------------------------------
+
+    def call(self, method: str, params: dict[str, Any] | None = None) -> dict[str, Any]:
+        """Send a JSON-RPC request and return the parsed response.
+
+        Raises ``requests.ConnectionError`` if the server is unreachable.
+        """
+        payload: dict[str, Any] = {
+            "jsonrpc": "2.0",
+            "id": str(uuid.uuid4()),
+            "method": method,
+        }
+        if params:
+            payload["params"] = params
+
+        resp = requests.post(self.url, json=payload, timeout=self.timeout)
+        resp.raise_for_status()
+        return resp.json()  # type: ignore[no-any-return]
+
+    # ------------------------------------------------------------------
+    # MCP standard methods
+    # ------------------------------------------------------------------
+
+    def initialize(self) -> dict[str, Any]:
+        """Send ``initialize`` and return server info."""
+        return self.call("initialize")
+
+    def list_tools(self) -> list[dict[str, Any]]:
+        """Return the list of available tools."""
+        result = self._unwrap(self.call("tools/list"))
+        return result.get("tools", [])  # type: ignore[no-any-return]
+
+    def call_tool(self, name: str, arguments: dict[str, Any] | None = None) -> dict[str, Any]:
+        """Call a tool by name and return the result dict."""
+        return self._unwrap(self.call("tools/call", {"name": name, "arguments": arguments or {}}))
+
+    def call_tool_text(self, name: str, arguments: dict[str, Any] | None = None) -> str:
+        """Call a tool and return just the first text content item."""
+        result = self.call_tool(name, arguments)
+        content = result.get("content", [])
+        if not content:
+            return ""
+        return content[0].get("text", str(content[0]))  # type: ignore[no-any-return]
+
+    # ------------------------------------------------------------------
+    # Readiness probes
+    # ------------------------------------------------------------------
+
+    def wait_for_ready(self, timeout: float = 10.0, interval: float = 0.5) -> bool:
+        """Poll until the MCP server responds, or return False on timeout."""
+        deadline = time.monotonic() + timeout
+        while time.monotonic() < deadline:
+            try:
+                resp = requests.post(
+                    self.url,
+                    json={"jsonrpc": "2.0", "id": "probe", "method": "initialize"},
+                    timeout=2,
+                )
+                if resp.status_code == 200:
+                    return True
+            except requests.ConnectionError:
+                pass
+            time.sleep(interval)
+        return False
+
+    def wait_for_down(self, timeout: float = 10.0, interval: float = 0.5) -> bool:
+        """Poll until the MCP server stops responding."""
+        deadline = time.monotonic() + timeout
+        while time.monotonic() < deadline:
+            try:
+                requests.post(
+                    self.url,
+                    json={"jsonrpc": "2.0", "id": "probe", "method": "initialize"},
+                    timeout=1,
+                )
+            except (requests.ConnectionError, requests.ReadTimeout):
+                return True
+            time.sleep(interval)
+        return False
+
+    # ------------------------------------------------------------------
+    # Class methods for discovery
+    # ------------------------------------------------------------------
+
+    @classmethod
+    def from_run_entry(cls, entry: Any | None = None, timeout: int = DEFAULT_TIMEOUT) -> McpAdapter:
+        """Create an adapter from a RunEntry, or discover the latest one.
+
+        Falls back to the default URL if no entry is found.
+        """
+        if entry is None:
+            from dimos.core.run_registry import list_runs
+
+            runs = list_runs(alive_only=True)
+            entry = runs[0] if runs else None
+
+        if entry is not None and hasattr(entry, "mcp_url") and entry.mcp_url:
+            return cls(url=entry.mcp_url, timeout=timeout)
+
+        # Fall back to default URL using GlobalConfig port
+        from dimos.core.global_config import global_config
+
+        url = f"http://localhost:{global_config.mcp_port}/mcp"
+        return cls(url=url, timeout=timeout)
+
+    # ------------------------------------------------------------------
+    # Internals
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _unwrap(response: dict[str, Any]) -> dict[str, Any]:
+        """Extract the ``result`` from a JSON-RPC response, raising on error."""
+        if "error" in response:
+            err = response["error"]
+            msg = err.get("message", str(err)) if isinstance(err, dict) else str(err)
+            raise McpError(msg, code=err.get("code") if isinstance(err, dict) else None)
+        return response.get("result", {})  # type: ignore[no-any-return]
+
+    def __repr__(self) -> str:
+        return f"McpAdapter(url={self.url!r})"
diff --git a/dimos/agents/mcp/mcp_server.py b/dimos/agents/mcp/mcp_server.py
index c11a7e28cd..30f13e7594 100644
--- a/dimos/agents/mcp/mcp_server.py
+++ b/dimos/agents/mcp/mcp_server.py
@@ -15,25 +15,25 @@
 
 import asyncio
 import json
+import os
 from typing import TYPE_CHECKING, Any
 
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
+from starlette.requests import Request  # noqa: TC002
 from starlette.responses import Response
 import uvicorn
 
+from dimos.agents.annotation import skill
+from dimos.core.core import rpc
+from dimos.core.module import Module
+from dimos.core.rpc_client import RpcCall, RPCClient
 from dimos.utils.logging_config import setup_logger
 
 logger = setup_logger()
 
 
-from starlette.requests import Request  # noqa: TC002
-
-from dimos.core.core import rpc
-from dimos.core.module import Module
-from dimos.core.rpc_client import RpcCall, RPCClient
-
 if TYPE_CHECKING:
     import concurrent.futures
 
@@ -51,6 +51,11 @@
 app.state.rpc_calls = {}
 
 
+# ---------------------------------------------------------------------------
+# JSON-RPC helpers
+# ---------------------------------------------------------------------------
+
+
 def _jsonrpc_result(req_id: Any, result: Any) -> dict[str, Any]:
     return {"jsonrpc": "2.0", "id": req_id, "result": result}
 
@@ -63,6 +68,11 @@ def _jsonrpc_error(req_id: Any, code: int, message: str) -> dict[str, Any]:
     return {"jsonrpc": "2.0", "id": req_id, "error": {"code": code, "message": message}}
 
 
+# ---------------------------------------------------------------------------
+# JSON-RPC handlers (standard MCP protocol only)
+# ---------------------------------------------------------------------------
+
+
 def _handle_initialize(req_id: Any) -> dict[str, Any]:
     return _jsonrpc_result(
         req_id,
@@ -77,11 +87,11 @@ def _handle_initialize(req_id: Any) -> dict[str, Any]:
 def _handle_tools_list(req_id: Any, skills: list[SkillInfo]) -> dict[str, Any]:
     tools = []
 
-    for skill in skills:
-        schema = json.loads(skill.args_schema)
+    for s in skills:
+        schema = json.loads(s.args_schema)
         description = schema.pop("description", None)
         schema.pop("title", None)
-        tool = {"name": skill.func_name, "inputSchema": schema}
+        tool: dict[str, Any] = {"name": s.func_name, "inputSchema": schema}
         if description:
             tool["description"] = description
         tools.append(tool)
@@ -114,46 +124,6 @@ async def _handle_tools_call(
     return _jsonrpc_result_text(req_id, str(result))
 
 
-def _handle_dimos_status(req_id: Any, skills: list[SkillInfo]) -> dict[str, Any]:
-    """Return running modules, skills, and server info."""
-    import os
-
-    return _jsonrpc_result(
-        req_id,
-        {
-            "pid": os.getpid(),
-            "modules": list(dict.fromkeys(s.class_name for s in skills)),
-            "skills": [s.func_name for s in skills],
-            "skill_count": len(skills),
-        },
-    )
-
-
-def _handle_dimos_list_modules(req_id: Any, skills: list[SkillInfo]) -> dict[str, Any]:
-    """Return module names and their skills."""
-    modules: dict[str, list[str]] = {}
-    for s in skills:
-        modules.setdefault(s.class_name, []).append(s.func_name)
-    return _jsonrpc_result(req_id, {"modules": modules})
-
-
-def _handle_dimos_agent_send(req_id: Any, params: dict[str, Any]) -> dict[str, Any]:
-    """Route a message to the agent's human_input stream via LCM."""
-    message = params.get("message", "")
-    if not message:
-        return _jsonrpc_error(req_id, -32602, "Missing 'message' parameter")
-
-    # Publish on /human_input LCM channel (same as HumanCLI)
-    try:
-        from dimos.core.transport import pLCMTransport
-
-        transport: pLCMTransport[str] = pLCMTransport("/human_input")
-        transport.publish(message)
-        return _jsonrpc_result_text(req_id, f"Message sent to agent: {message[:100]}")
-    except Exception as e:
-        return _jsonrpc_error(req_id, -32000, f"Failed to send: {e}")
-
-
 async def handle_request(
     request: dict[str, Any],
     skills: list[SkillInfo],
@@ -168,7 +138,7 @@ async def handle_request(
     params = request.get("params", {}) or {}
     req_id = request.get("id")
 
-    # JSON-RPC notifications have no "id" – the server must not reply.
+    # JSON-RPC notifications have no "id" -- the server must not reply.
     if "id" not in request:
         return None
 
@@ -178,12 +148,6 @@ async def handle_request(
         return _handle_tools_list(req_id, skills)
     if method == "tools/call":
         return await _handle_tools_call(req_id, params, rpc_calls)
-    if method == "dimos/status":
-        return _handle_dimos_status(req_id, skills)
-    if method == "dimos/list_modules":
-        return _handle_dimos_list_modules(req_id, skills)
-    if method == "dimos/agent_send":
-        return _handle_dimos_agent_send(req_id, params)
     return _jsonrpc_error(req_id, -32601, f"Unknown: {method}")
 
 
@@ -204,6 +168,11 @@ async def mcp_endpoint(request: Request) -> Response:
     return JSONResponse(result)
 
 
+# ---------------------------------------------------------------------------
+# McpServer Module
+# ---------------------------------------------------------------------------
+
+
 class McpServer(Module):
     def __init__(self) -> None:
         super().__init__()
@@ -229,12 +198,58 @@ def stop(self) -> None:
     @rpc
     def on_system_modules(self, modules: list[RPCClient]) -> None:
         assert self.rpc is not None
-        app.state.skills = [skill for module in modules for skill in (module.get_skills() or [])]
+        app.state.skills = [
+            skill_info for module in modules for skill_info in (module.get_skills() or [])
+        ]
         app.state.rpc_calls = {
-            skill.func_name: RpcCall(None, self.rpc, skill.func_name, skill.class_name, [])
-            for skill in app.state.skills
+            skill_info.func_name: RpcCall(
+                None, self.rpc, skill_info.func_name, skill_info.class_name, []
+            )
+            for skill_info in app.state.skills
         }
 
+    # ------------------------------------------------------------------
+    # Introspection skills (exposed as MCP tools via tools/list)
+    # ------------------------------------------------------------------
+
+    @skill
+    def server_status(self) -> str:
+        """Get MCP server status: PID, deployed modules, and skill count."""
+        skills: list[SkillInfo] = app.state.skills
+        modules = list(dict.fromkeys(s.class_name for s in skills))
+        return json.dumps(
+            {
+                "pid": os.getpid(),
+                "modules": modules,
+                "skills": [s.func_name for s in skills],
+            }
+        )
+
+    @skill
+    def list_modules(self) -> str:
+        """List deployed modules and their skills."""
+        skills: list[SkillInfo] = app.state.skills
+        modules: dict[str, list[str]] = {}
+        for s in skills:
+            modules.setdefault(s.class_name, []).append(s.func_name)
+        return json.dumps({"modules": modules})
+
+    @skill
+    def agent_send(self, message: str) -> str:
+        """Send a message to the running DimOS agent via LCM."""
+        if not message:
+            raise ValueError("Message cannot be empty")
+
+        from dimos.core.transport import pLCMTransport
+
+        transport: pLCMTransport[str] = pLCMTransport("/human_input")
+        try:
+            transport.start()
+            transport.publish(message)
+            return f"Message sent to agent: {message[:100]}"
+        finally:
+            transport.stop()
+
     def _start_server(self, port: int | None = None) -> None:
         from dimos.core.global_config import global_config
 
diff --git a/dimos/core/test_e2e_mcp_stress.py b/dimos/core/test_mcp_integration.py
similarity index 78%
rename from dimos/core/test_e2e_mcp_stress.py
rename to dimos/core/test_mcp_integration.py
index 69b629c68a..8ec06c6bca 100644
--- a/dimos/core/test_e2e_mcp_stress.py
+++ b/dimos/core/test_mcp_integration.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""MCP server e2e stress tests.
+"""MCP server integration tests.
 
 These tests build a real StressTestModule + McpServer blueprint and exercise
 every CLI command and JSON-RPC method.  They are marked ``slow`` so local
@@ -29,13 +29,13 @@
 from datetime import datetime, timezone
 import json
 import os
-import time
 from typing import TYPE_CHECKING
 
 import pytest
 import requests
 from typer.testing import CliRunner
 
+from dimos.agents.mcp.mcp_adapter import McpAdapter
 from dimos.agents.mcp.mcp_server import McpServer
 from dimos.core.blueprints import autoconnect
 from dimos.core.global_config import global_config
@@ -52,7 +52,7 @@
 
     from dimos.core.module_coordinator import ModuleCoordinator
 
-MCP_PORT = 9990
+MCP_URL = f"http://localhost:{global_config.mcp_port}/mcp"
 
 
 # ---------------------------------------------------------------------------
@@ -84,16 +84,16 @@ def mcp_shared(request: pytest.FixtureRequest) -> Generator[ModuleCoordinator, N
     """Build a shared StressTestModule + McpServer.  Class-scoped -- started
     once, torn down after every test in the class finishes.  Use for
     read-only tests that don't stop/restart the server."""
-    os.environ["CI"] = "1"
     global_config.update(viewer_backend="none", n_workers=1)
     bp = autoconnect(
         StressTestModule.blueprint(),
         McpServer.blueprint(),
     )
     coord = bp.build()
-    assert _wait_for_mcp(), "MCP server did not start within timeout"
+    ready = _adapter().wait_for_ready()
     yield coord
     coord.stop()
+    assert ready, "MCP server did not start within timeout"
 
 
 @pytest.fixture()
@@ -116,50 +116,9 @@ def mcp_entry(mcp_shared: ModuleCoordinator, tmp_path: object) -> Generator[RunE
     entry.remove()
 
 
-def _mcp_call(
-    method: str, params: dict[str, object] | None = None, port: int = MCP_PORT
-) -> dict[str, object]:
-    """Send a JSON-RPC request to MCP server."""
-    payload: dict[str, object] = {"jsonrpc": "2.0", "id": 1, "method": method}
-    if params:
-        payload["params"] = params
-    resp = requests.post(f"http://localhost:{port}/mcp", json=payload, timeout=10)
-    resp.raise_for_status()
-    return resp.json()  # type: ignore[no-any-return]
-
-
-def _wait_for_mcp(port: int = MCP_PORT, timeout: float = 10.0) -> bool:
-    """Poll until MCP server responds or timeout."""
-    deadline = time.monotonic() + timeout
-    while time.monotonic() < deadline:
-        try:
-            resp = requests.post(
-                f"http://localhost:{port}/mcp",
-                json={"jsonrpc": "2.0", "id": 1, "method": "initialize"},
-                timeout=2,
-            )
-            if resp.status_code == 200:
-                return True
-        except requests.ConnectionError:
-            pass
-        time.sleep(0.5)
-    return False
-
-
-def _wait_for_mcp_down(port: int = MCP_PORT, timeout: float = 10.0) -> bool:
-    """Poll until MCP server stops responding."""
-    deadline = time.monotonic() + timeout
-    while time.monotonic() < deadline:
-        try:
-            requests.post(
-                f"http://localhost:{port}/mcp",
-                json={"jsonrpc": "2.0", "id": 1, "method": "initialize"},
-                timeout=1,
-            )
-        except (requests.ConnectionError, requests.ReadTimeout):
-            return True
-        time.sleep(0.5)
-    return False
+def _adapter() -> McpAdapter:
+    """Return an McpAdapter using GlobalConfig port."""
+    return McpAdapter()
 
 
 # ---------------------------------------------------------------------------
@@ -173,21 +132,25 @@ class TestMCPLifecycle:
 
     def test_mcp_responds_after_build(self, mcp_shared: ModuleCoordinator) -> None:
         """After blueprint.build(), MCP should accept requests."""
-        result = _mcp_call("initialize")
+        result = _adapter().call("initialize")
         assert result["result"]["serverInfo"]["name"] == "dimensional"
 
     def test_tools_list_includes_stress_skills(self, mcp_shared: ModuleCoordinator) -> None:
         """tools/list should include echo, ping, slow, info from StressTestModule."""
-        result = _mcp_call("tools/list")
+        result = _adapter().call("tools/list")
         tool_names = {t["name"] for t in result["result"]["tools"]}
         assert "echo" in tool_names
         assert "ping" in tool_names
         assert "slow" in tool_names
         assert "info" in tool_names
+        # McpServer introspection skills
+        assert "server_status" in tool_names
+        assert "list_modules" in tool_names
+        assert "agent_send" in tool_names
 
     def test_echo_tool(self, mcp_shared: ModuleCoordinator) -> None:
         """Call echo tool -- should return the message."""
-        result = _mcp_call(
+        result = _adapter().call(
             "tools/call",
             {
                 "name": "echo",
@@ -199,29 +162,29 @@ def test_echo_tool(self, mcp_shared: ModuleCoordinator) -> None:
 
     def test_ping_tool(self, mcp_shared: ModuleCoordinator) -> None:
         """Call ping tool -- should return 'pong'."""
-        result = _mcp_call("tools/call", {"name": "ping", "arguments": {}})
+        result = _adapter().call("tools/call", {"name": "ping", "arguments": {}})
         text = result["result"]["content"][0]["text"]
         assert text == "pong"
 
     def test_info_tool_returns_pid(self, mcp_shared: ModuleCoordinator) -> None:
         """info tool should return process info."""
-        result = _mcp_call("tools/call", {"name": "info", "arguments": {}})
+        result = _adapter().call("tools/call", {"name": "info", "arguments": {}})
         text = result["result"]["content"][0]["text"]
         assert "pid=" in text
 
-    def test_dimos_status_method(self, mcp_shared: ModuleCoordinator) -> None:
-        """dimos/status should return module and skill info."""
-        result = _mcp_call("dimos/status")
-        data = result["result"]
+    def test_server_status_tool(self, mcp_shared: ModuleCoordinator) -> None:
+        """server_status tool should return module and skill info."""
+        text = _adapter().call_tool_text("server_status")
+        data = json.loads(text)
         assert "pid" in data
         assert "modules" in data
         assert "skills" in data
         assert "StressTestModule" in data["modules"]
 
-    def test_dimos_list_modules_method(self, mcp_shared: ModuleCoordinator) -> None:
-        """dimos/list_modules should group skills by module."""
-        result = _mcp_call("dimos/list_modules")
-        modules = result["result"]["modules"]
+    def test_list_modules_tool(self, mcp_shared: ModuleCoordinator) -> None:
+        """list_modules tool should group skills by module."""
+        text = _adapter().call_tool_text("list_modules")
+        modules = json.loads(text)["modules"]
         assert "StressTestModule" in modules
         assert "echo" in modules["StressTestModule"]
 
@@ -267,7 +230,7 @@ class TestMCPErrorHandling:
 
     def test_call_nonexistent_tool(self, mcp_shared: ModuleCoordinator) -> None:
         """Calling a tool that doesn't exist should return an error message."""
-        result = _mcp_call(
+        result = _adapter().call(
             "tools/call",
             {
                 "name": "nonexistent_tool_xyz",
@@ -279,7 +242,7 @@ def test_call_nonexistent_tool(self, mcp_shared: ModuleCoordinator) -> None:
 
     def test_call_tool_wrong_args(self, mcp_shared: ModuleCoordinator) -> None:
         """Calling a tool with wrong argument types should still return."""
-        result = _mcp_call(
+        result = _adapter().call(
             "tools/call",
             {
                 "name": "echo",
@@ -290,14 +253,14 @@ def test_call_tool_wrong_args(self, mcp_shared: ModuleCoordinator) -> None:
 
     def test_unknown_jsonrpc_method(self, mcp_shared: ModuleCoordinator) -> None:
         """Unknown JSON-RPC method should return error."""
-        result = _mcp_call("nonexistent/method")
+        result = _adapter().call("nonexistent/method")
         assert "error" in result
         assert result["error"]["code"] == -32601
 
     def test_mcp_handles_malformed_json(self, mcp_shared: ModuleCoordinator) -> None:
         """MCP should handle malformed JSON gracefully."""
         resp = requests.post(
-            f"http://localhost:{MCP_PORT}/mcp",
+            MCP_URL,
             data=b"not json{{{",
             headers={"Content-Type": "application/json"},
             timeout=5,
@@ -307,7 +270,7 @@ def test_mcp_handles_malformed_json(self, mcp_shared: ModuleCoordinator) -> None
     def test_rapid_tool_calls(self, mcp_shared: ModuleCoordinator) -> None:
         """Fire 20 rapid echo calls -- all should succeed."""
         for i in range(20):
-            result = _mcp_call(
+            result = _adapter().call(
                 "tools/call",
                 {
                     "name": "echo",
@@ -323,22 +286,35 @@ def test_cli_call_tool_wrong_arg_format(self, mcp_shared: ModuleCoordinator) ->
         assert result.exit_code == 1
         assert "key=value" in result.output
 
+    def test_cli_call_json_args(self, mcp_shared: ModuleCoordinator) -> None:
+        """dimos mcp call --json-args should work."""
+        result = CliRunner().invoke(
+            main, ["mcp", "call", "echo", "--json-args", '{"message": "json-test"}']
+        )
+        assert result.exit_code == 0
+        assert "json-test" in result.output
+
+    def test_cli_call_bad_json_args(self, mcp_shared: ModuleCoordinator) -> None:
+        """dimos mcp call with invalid --json-args should error."""
+        result = CliRunner().invoke(main, ["mcp", "call", "echo", "--json-args", "not json"])
+        assert result.exit_code == 1
+        assert "invalid JSON" in result.output
+
 
 @pytest.mark.slow
 class TestAgentSend:
     """Test dimos agent-send CLI and MCP method."""
 
     def test_agent_send_via_mcp(self, mcp_shared: ModuleCoordinator) -> None:
-        """dimos/agent_send should route message via LCM."""
-        result = _mcp_call("dimos/agent_send", {"message": "hello agent"})
-        assert "result" in result
-        text = result["result"]["content"][0]["text"]
+        """agent_send tool should route message via LCM."""
+        text = _adapter().call_tool_text("agent_send", {"message": "hello agent"})
         assert "hello agent" in text
 
     def test_agent_send_empty_message(self, mcp_shared: ModuleCoordinator) -> None:
-        """Empty message should return error."""
-        result = _mcp_call("dimos/agent_send", {"message": ""})
-        assert "error" in result
+        """Empty message should return error text."""
+        result = _adapter().call_tool("agent_send", {"message": ""})
+        text = result.get("content", [{}])[0].get("text", "")
+        assert "error" in text.lower() or "empty" in text.lower()
 
     def test_agent_send_cli(self, mcp_shared: ModuleCoordinator) -> None:
         """dimos agent-send 'hello' should work."""
@@ -363,14 +339,14 @@ def test_restart_after_clean_stop(self) -> None:
         # First run
         bp1 = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
         coord1 = bp1.build()
-        assert _wait_for_mcp(), "First MCP start failed"
+        assert _adapter().wait_for_ready(), "First MCP start failed"
         coord1.stop()
-        assert _wait_for_mcp_down(), "MCP didn't stop"
+        assert _adapter().wait_for_down(), "MCP didn't stop"
 
         # Second run -- should work without port conflicts
         bp2 = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
         coord2 = bp2.build()
-        assert _wait_for_mcp(), "Second MCP start failed (port conflict?)"
+        assert _adapter().wait_for_ready(), "Second MCP start failed (port conflict?)"
         coord2.stop()
 
     def test_registry_cleanup_after_stop(self, mcp_entry: RunEntry) -> None:
@@ -412,13 +388,13 @@ def test_three_restart_cycles(self) -> None:
         for cycle in range(3):
             bp = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
             coord = bp.build()
-            assert _wait_for_mcp(timeout=15), f"MCP failed to start on cycle {cycle}"
+            assert _adapter().wait_for_ready(timeout=15), f"MCP failed to start on cycle {cycle}"
 
-            result = _mcp_call("tools/call", {"name": "ping", "arguments": {}})
+            result = _adapter().call("tools/call", {"name": "ping", "arguments": {}})
             assert result["result"]["content"][0]["text"] == "pong"
 
             coord.stop()
-            assert _wait_for_mcp_down(timeout=10), f"MCP failed to stop on cycle {cycle}"
+            assert _adapter().wait_for_down(timeout=10), f"MCP failed to stop on cycle {cycle}"
 
 
 @pytest.mark.slow
@@ -430,10 +406,10 @@ def test_mcp_dead_after_stop(self) -> None:
         global_config.update(viewer_backend="none", n_workers=1)
         bp = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
         coord = bp.build()
-        assert _wait_for_mcp(), "MCP server did not start"
+        assert _adapter().wait_for_ready(), "MCP server did not start"
 
         coord.stop()
-        assert _wait_for_mcp_down(), "MCP server did not stop"
+        assert _adapter().wait_for_down(), "MCP server did not stop"
 
     def test_cli_no_server_error(self) -> None:
         """dimos mcp list-tools with no server should exit with error."""
diff --git a/dimos/core/tests/e2e_devex_test.py b/dimos/core/tests/demo_devex.py
similarity index 100%
rename from dimos/core/tests/e2e_devex_test.py
rename to dimos/core/tests/demo_devex.py
diff --git a/dimos/core/tests/e2e_mcp_killtest.py b/dimos/core/tests/demo_mcp_killtest.py
similarity index 100%
rename from dimos/core/tests/e2e_mcp_killtest.py
rename to dimos/core/tests/demo_mcp_killtest.py
diff --git a/dimos/core/tests/stress_test_blueprint.py b/dimos/core/tests/stress_test_blueprint.py
index 5def4946d6..6ad438b2dc 100644
--- a/dimos/core/tests/stress_test_blueprint.py
+++ b/dimos/core/tests/stress_test_blueprint.py
@@ -21,9 +21,9 @@
 from dimos.core.blueprints import autoconnect
 from dimos.core.tests.stress_test_module import StressTestModule
 
-stress_test = autoconnect(
+demo_mcp_stress_test = autoconnect(
     StressTestModule.blueprint(),
     McpServer.blueprint(),
 )
 
-__all__ = ["stress_test"]
+__all__ = ["demo_mcp_stress_test"]
diff --git a/dimos/core/worker.py b/dimos/core/worker.py
index ec86295133..f61ebba35d 100644
--- a/dimos/core/worker.py
+++ b/dimos/core/worker.py
@@ -13,7 +13,8 @@
 # limitations under the License.
 from __future__ import annotations
 
-import multiprocessing as mp
+import multiprocessing
+import os
 import threading
 import traceback
 from typing import TYPE_CHECKING, Any
@@ -125,7 +126,7 @@ def __getattr__(self, name: str) -> Any:
 def get_forkserver_context() -> Any:
     global _forkserver_ctx
     if _forkserver_ctx is None:
-        _forkserver_ctx = mp.get_context("forkserver")
+        _forkserver_ctx = multiprocessing.get_context("forkserver")
     return _forkserver_ctx
 
 
@@ -160,16 +161,12 @@ def pid(self) -> int | None:
         if self._process is None:
             return None
         try:
-            if self._process.is_alive():
-                p: int | None = self._process.pid
-                return p
-        except AssertionError:
-            # After daemonize() we are no longer the parent process;
-            # is_alive() asserts _parent_pid == os.getpid().  Fall back to
-            # the stored pid attribute which is still valid.
-            p = self._process.pid
-            return p
-        return None
+            # Signal 0 just checks if the process is alive.
+            pid: int = self._process.pid  # type: ignore[assignment]
+            os.kill(pid, 0)
+            return pid
+        except OSError:
+            return None
 
     @property
     def worker_id(self) -> int:
diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py
index cfad65a3cd..64313e24ec 100644
--- a/dimos/robot/all_blueprints.py
+++ b/dimos/robot/all_blueprints.py
@@ -60,7 +60,7 @@
     "phone-go2-fleet-teleop": "dimos.teleop.phone.blueprints:phone_go2_fleet_teleop",
     "phone-go2-teleop": "dimos.teleop.phone.blueprints:phone_go2_teleop",
     "simple-phone-teleop": "dimos.teleop.phone.blueprints:simple_phone_teleop",
-    "stress-test": "dimos.core.tests.stress_test_blueprint:stress_test",
+    "demo-mcp-stress-test": "dimos.core.tests.stress_test_blueprint:demo_mcp_stress_test",
     "uintree-g1-primitive-no-nav": "dimos.robot.unitree.g1.blueprints.primitive.uintree_g1_primitive_no_nav:uintree_g1_primitive_no_nav",
     "unitree-g1": "dimos.robot.unitree.g1.blueprints.perceptive.unitree_g1:unitree_g1",
     "unitree-g1-agentic": "dimos.robot.unitree.g1.blueprints.agentic.unitree_g1_agentic:unitree_g1_agentic",
diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index d823dbe807..16d976884f 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -15,12 +15,15 @@
 from __future__ import annotations
 
 import inspect
+import json
 import sys
 from typing import Any, get_args, get_origin
 
 from dotenv import load_dotenv
+import requests
 import typer
 
+from dimos.agents.mcp.mcp_adapter import McpAdapter, McpError
 from dimos.core.global_config import GlobalConfig, global_config
 from dimos.utils.logging_config import setup_logger
 
@@ -264,47 +267,24 @@ def stop(
 main.add_typer(mcp_app, name="mcp")
 
 
-def _mcp_call(
-    method: str, params: dict[str, Any] | None = None, port: int = 9990
-) -> dict[str, Any]:
-    """Send a JSON-RPC request to the running MCP server."""
-    import json as _json
-    import urllib.error
-    import urllib.request
+def _get_adapter() -> McpAdapter:
+    """Get an McpAdapter from the latest RunEntry or default URL."""
+    from dimos.agents.mcp.mcp_adapter import McpAdapter
 
-    payload: dict[str, Any] = {"jsonrpc": "2.0", "id": 1, "method": method}
-    if params:
-        payload["params"] = params
+    return McpAdapter.from_run_entry()
 
-    data = _json.dumps(payload).encode()
-    req = urllib.request.Request(
-        f"http://localhost:{port}/mcp",
-        data=data,
-        headers={"Content-Type": "application/json"},
-    )
 
+@mcp_app.command("list-tools")
+def mcp_list_tools() -> None:
+    """List available MCP tools (skills)."""
     try:
-        with urllib.request.urlopen(req, timeout=30) as resp:
-            result: dict[str, Any] = _json.loads(resp.read())
-            return result
-    except urllib.error.URLError:
+        tools = _get_adapter().list_tools()
+    except requests.ConnectionError:
         typer.echo("Error: no running MCP server (is DimOS running?)", err=True)
         raise typer.Exit(1)
-    except Exception as e:
+    except McpError as e:
         typer.echo(f"Error: {e}", err=True)
         raise typer.Exit(1)
-
-
-@mcp_app.command("list-tools")
-def mcp_list_tools() -> None:
-    """List available MCP tools (skills)."""
-    import json
-
-    result = _mcp_call("tools/list")
-    if "error" in result:
-        typer.echo(f"Error: {result['error'].get('message', 'unknown error')}", err=True)
-        raise typer.Exit(1)
-    tools = result.get("result", {}).get("tools", [])
     typer.echo(json.dumps(tools, indent=2))
 
 
@@ -312,27 +292,37 @@ def mcp_list_tools() -> None:
 def mcp_call_tool(
     tool_name: str = typer.Argument(..., help="Tool name to call"),
     args: list[str] = typer.Option([], "--arg", "-a", help="Arguments as key=value"),
+    json_args: str = typer.Option("", "--json-args", "-j", help="Arguments as JSON string"),
 ) -> None:
     """Call an MCP tool by name."""
-    import json
-
-    arguments = {}
-    for arg in args:
-        if "=" not in arg:
-            typer.echo(f"Error: argument must be key=value, got: {arg}", err=True)
-            raise typer.Exit(1)
-        key, value = arg.split("=", 1)
-        # Try to parse as JSON for non-string types
+    arguments: dict[str, Any] = {}
+    if json_args:
         try:
-            arguments[key] = json.loads(value)
-        except (json.JSONDecodeError, ValueError):
-            arguments[key] = value
+            arguments = json.loads(json_args)
+        except json.JSONDecodeError as e:
+            typer.echo(f"Error: invalid JSON in --json-args: {e}", err=True)
+            raise typer.Exit(1)
+    else:
+        for arg in args:
+            if "=" not in arg:
+                typer.echo(f"Error: argument must be key=value, got: {arg}", err=True)
+                raise typer.Exit(1)
+            key, value = arg.split("=", 1)
+            try:
+                arguments[key] = json.loads(value)
+            except (json.JSONDecodeError, ValueError):
+                arguments[key] = value
 
-    result = _mcp_call("tools/call", {"name": tool_name, "arguments": arguments})
-    if "error" in result:
-        typer.echo(f"Error: {result['error'].get('message', 'unknown error')}", err=True)
+    try:
+        result = _get_adapter().call_tool(tool_name, arguments)
+    except requests.ConnectionError:
+        typer.echo("Error: no running MCP server (is DimOS running?)", err=True)
+        raise typer.Exit(1)
+    except McpError as e:
+        typer.echo(f"Error: {e}", err=True)
         raise typer.Exit(1)
-    content = result.get("result", {}).get("content", [])
+
+    content = result.get("content", [])
     if not content:
         typer.echo("(no output)")
         return
@@ -343,42 +333,52 @@ def mcp_call_tool(
 @mcp_app.command("status")
 def mcp_status() -> None:
     """Show MCP server status (modules, skills)."""
-    import json
-
-    result = _mcp_call("dimos/status")
-    if "error" in result:
-        typer.echo(f"Error: {result['error'].get('message', 'unknown error')}", err=True)
+    try:
+        data = _get_adapter().call_tool_text("server_status")
+    except requests.ConnectionError:
+        typer.echo("Error: no running MCP server (is DimOS running?)", err=True)
+        raise typer.Exit(1)
+    except McpError as e:
+        typer.echo(f"Error: {e}", err=True)
         raise typer.Exit(1)
-    data = result.get("result", {})
-    typer.echo(json.dumps(data, indent=2))
+    # server_status returns JSON string -- pretty-print it
+    try:
+        typer.echo(json.dumps(json.loads(data), indent=2))
+    except (json.JSONDecodeError, ValueError):
+        typer.echo(data)
 
 
 @mcp_app.command("modules")
 def mcp_modules() -> None:
     """List deployed modules and their skills."""
-    import json
-
-    result = _mcp_call("dimos/list_modules")
-    if "error" in result:
-        typer.echo(f"Error: {result['error'].get('message', 'unknown error')}", err=True)
+    try:
+        data = _get_adapter().call_tool_text("list_modules")
+    except requests.ConnectionError:
+        typer.echo("Error: no running MCP server (is DimOS running?)", err=True)
         raise typer.Exit(1)
-    data = result.get("result", {})
-    typer.echo(json.dumps(data, indent=2))
+    except McpError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1)
+    try:
+        typer.echo(json.dumps(json.loads(data), indent=2))
+    except (json.JSONDecodeError, ValueError):
+        typer.echo(data)
 
 
 @main.command("agent-send")
-def agent_send(
+def agent_send_cmd(
     message: str = typer.Argument(..., help="Message to send to the running agent"),
-    port: int = typer.Option(9990, "--port", "-p", help="MCP server port"),
 ) -> None:
     """Send a message to the running DimOS agent via MCP."""
-    result = _mcp_call("dimos/agent_send", {"message": message}, port=port)
-    if "error" in result:
-        typer.echo(f"Error: {result['error'].get('message', 'unknown')}", err=True)
+    try:
+        text = _get_adapter().call_tool_text("agent_send", {"message": message})
+    except requests.ConnectionError:
+        typer.echo("Error: no running MCP server (is DimOS running?)", err=True)
         raise typer.Exit(1)
-    content = result.get("result", {}).get("content", [])
-    for item in content:
-        typer.echo(item.get("text", str(item)))
+    except McpError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1)
+    typer.echo(text)
 
 
 @main.command()
diff --git a/pyproject.toml b/pyproject.toml
index f50b9cf32a..88f106297e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -316,6 +316,7 @@ docker = [
     "pydantic",
     "pydantic-settings>=2.11.0,<3",
     "typer>=0.19.2,<1",
+    "requests>=2.28",
     "opencv-python-headless",
     "lcm",
     "sortedcontainers",
diff --git a/uv.lock b/uv.lock
index d76a022e63..fd39e45bb5 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1861,6 +1861,7 @@ docker = [
     { name = "pydantic-settings" },
     { name = "pyturbojpeg" },
     { name = "reactivex" },
+    { name = "requests" },
     { name = "rerun-sdk" },
     { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "scipy", version = "1.17.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -2078,6 +2079,7 @@ requires-dist = [
     { name = "pyyaml", marker = "extra == 'manipulation'", specifier = ">=6.0" },
     { name = "reactivex" },
     { name = "reactivex", marker = "extra == 'docker'" },
+    { name = "requests", marker = "extra == 'docker'", specifier = ">=2.28" },
     { name = "requests-mock", marker = "extra == 'dev'", specifier = "==1.12.1" },
     { name = "rerun-sdk", specifier = ">=0.20.0" },
     { name = "rerun-sdk", marker = "extra == 'docker'" },

From 45929a0a0d4882073f62d2c78a83578cddc18c65 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Sun, 8 Mar 2026 18:42:22 +0000
Subject: [PATCH 32/37] fix: alphabetical order in all_blueprints.py for
 demo-mcp-stress-test

---
 dimos/robot/all_blueprints.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py
index 64313e24ec..f842230550 100644
--- a/dimos/robot/all_blueprints.py
+++ b/dimos/robot/all_blueprints.py
@@ -46,6 +46,7 @@
     "demo-google-maps-skill": "dimos.agents.skills.demo_google_maps_skill:demo_google_maps_skill",
     "demo-gps-nav": "dimos.agents.skills.demo_gps_nav:demo_gps_nav",
     "demo-grasping": "dimos.manipulation.grasping.demo_grasping:demo_grasping",
+    "demo-mcp-stress-test": "dimos.core.tests.stress_test_blueprint:demo_mcp_stress_test",
     "demo-object-scene-registration": "dimos.perception.demo_object_scene_registration:demo_object_scene_registration",
     "demo-osm": "dimos.mapping.osm.demo_osm:demo_osm",
     "demo-skill": "dimos.agents.skills.demo_skill:demo_skill",
@@ -60,7 +61,6 @@
     "phone-go2-fleet-teleop": "dimos.teleop.phone.blueprints:phone_go2_fleet_teleop",
     "phone-go2-teleop": "dimos.teleop.phone.blueprints:phone_go2_teleop",
     "simple-phone-teleop": "dimos.teleop.phone.blueprints:simple_phone_teleop",
-    "demo-mcp-stress-test": "dimos.core.tests.stress_test_blueprint:demo_mcp_stress_test",
     "uintree-g1-primitive-no-nav": "dimos.robot.unitree.g1.blueprints.primitive.uintree_g1_primitive_no_nav:uintree_g1_primitive_no_nav",
     "unitree-g1": "dimos.robot.unitree.g1.blueprints.perceptive.unitree_g1:unitree_g1",
     "unitree-g1-agentic": "dimos.robot.unitree.g1.blueprints.agentic.unitree_g1_agentic:unitree_g1_agentic",

From 1ebfb70bf23165cf99f98ff192548c214434a464 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Mon, 9 Mar 2026 13:32:54 +0000
Subject: [PATCH 33/37] fix: catch HTTPError in McpAdapter, guard None pid in
 Worker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- McpAdapter.call(): catch requests.HTTPError and re-raise as McpError
  so CLI callers get clean error messages instead of raw tracebacks
- Worker.pid: check for None before os.kill() — unstarted processes
  have pid=None which would raise TypeError (not caught by OSError)
---
 dimos/agents/mcp/mcp_adapter.py | 5 ++++-
 dimos/core/worker.py            | 4 +++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/dimos/agents/mcp/mcp_adapter.py b/dimos/agents/mcp/mcp_adapter.py
index 2a6cb7efcd..9b8cc5c4b9 100644
--- a/dimos/agents/mcp/mcp_adapter.py
+++ b/dimos/agents/mcp/mcp_adapter.py
@@ -81,7 +81,10 @@ def call(self, method: str, params: dict[str, Any] | None = None) -> dict[str, A
             payload["params"] = params
 
         resp = requests.post(self.url, json=payload, timeout=self.timeout)
-        resp.raise_for_status()
+        try:
+            resp.raise_for_status()
+        except requests.HTTPError as e:
+            raise McpError(f"HTTP {resp.status_code}: {e}") from e
         return resp.json()  # type: ignore[no-any-return]
 
     # ------------------------------------------------------------------
diff --git a/dimos/core/worker.py b/dimos/core/worker.py
index f61ebba35d..f11e780f4f 100644
--- a/dimos/core/worker.py
+++ b/dimos/core/worker.py
@@ -162,7 +162,9 @@ def pid(self) -> int | None:
             return None
         try:
             # Signal 0 just checks if the process is alive.
-            pid: int = self._process.pid  # type: ignore[assignment]
+            pid: int | None = self._process.pid
+            if pid is None:
+                return None
             os.kill(pid, 0)
             return pid
         except OSError:

From b4ae18a14001d3120366375bbc386f0b7fc85cf0 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Mon, 9 Mar 2026 13:50:28 +0000
Subject: [PATCH 34/37] fix: server_status returns main process PID, not worker
 PID

McpServer runs in a forkserver worker, so os.getpid() returns the
worker PID. Read from RunEntry instead to get the main daemon PID
that dimos stop/status actually needs.
---
 dimos/agents/mcp/mcp_server.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/dimos/agents/mcp/mcp_server.py b/dimos/agents/mcp/mcp_server.py
index 30f13e7594..e63057b528 100644
--- a/dimos/agents/mcp/mcp_server.py
+++ b/dimos/agents/mcp/mcp_server.py
@@ -214,12 +214,16 @@ def on_system_modules(self, modules: list[RPCClient]) -> None:
 
     @skill
     def server_status(self) -> str:
-        """Get MCP server status: PID, deployed modules, and skill count."""
+        """Get MCP server status: main process PID, deployed modules, and skill count."""
+        from dimos.core.run_registry import get_most_recent
+
         skills: list[SkillInfo] = app.state.skills
         modules = list(dict.fromkeys(s.class_name for s in skills))
+        entry = get_most_recent()
+        pid = entry.pid if entry else os.getpid()
         return json.dumps(
             {
-                "pid": os.getpid(),
+                "pid": pid,
                 "modules": modules,
                 "skills": [s.func_name for s in skills],
             }

From 88bc77ceb7ecee0a9cb4bc91241ad731dbb4b90d Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Mon, 9 Mar 2026 14:37:15 +0000
Subject: [PATCH 35/37] refactor: use click.ParamType for --arg parsing in mcp
 call

Replace manual string splitting with _KeyValueType click.ParamType
per Paul's review suggestion. Validation and JSON coercion now handled
by click's type system instead of inline loop.
---
 dimos/robot/cli/dimos.py | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index 16d976884f..d1e12be394 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -19,6 +19,7 @@
 import sys
 from typing import Any, get_args, get_origin
 
+import click
 from dotenv import load_dotenv
 import requests
 import typer
@@ -288,10 +289,29 @@ def mcp_list_tools() -> None:
     typer.echo(json.dumps(tools, indent=2))
 
 
+class _KeyValueType(click.ParamType):
+    """Parse KEY=VALUE arguments, auto-converting JSON values."""
+
+    name = "KEY=VALUE"
+
+    def convert(
+        self, value: str, param: click.Parameter | None, ctx: click.Context | None
+    ) -> tuple[str, Any]:
+        if "=" not in value:
+            self.fail(f"expected KEY=VALUE, got: {value}", param, ctx)
+        key, val = value.split("=", 1)
+        try:
+            return (key, json.loads(val))
+        except (json.JSONDecodeError, ValueError):
+            return (key, val)
+
+
 @mcp_app.command("call")
 def mcp_call_tool(
     tool_name: str = typer.Argument(..., help="Tool name to call"),
-    args: list[str] = typer.Option([], "--arg", "-a", help="Arguments as key=value"),
+    args: list[tuple[str, Any]] = typer.Option(
+        [], "--arg", "-a", click_type=_KeyValueType(), help="Arguments as key=value"
+    ),
     json_args: str = typer.Option("", "--json-args", "-j", help="Arguments as JSON string"),
 ) -> None:
     """Call an MCP tool by name."""
@@ -303,15 +323,7 @@ def mcp_call_tool(
             typer.echo(f"Error: invalid JSON in --json-args: {e}", err=True)
             raise typer.Exit(1)
     else:
-        for arg in args:
-            if "=" not in arg:
-                typer.echo(f"Error: argument must be key=value, got: {arg}", err=True)
-                raise typer.Exit(1)
-            key, value = arg.split("=", 1)
-            try:
-                arguments[key] = json.loads(value)
-            except (json.JSONDecodeError, ValueError):
-                arguments[key] = value
+        arguments = dict(args)
 
     try:
         result = _get_adapter().call_tool(tool_name, arguments)

From dd842800445b4c5519773b69936aec516493afee Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Mon, 9 Mar 2026 14:46:27 +0000
Subject: [PATCH 36/37] =?UTF-8?q?fix:=20viewer=5Fbackend=20=E2=86=92=20vie?=
 =?UTF-8?q?wer=20rename=20+=20KeyValueType=20test=20fix?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Update test_mcp_integration.py and demo_mcp_killtest.py to use
  'viewer' instead of 'viewer_backend' (renamed in #1477)
- Fix test_cli_call_tool_wrong_arg_format: exit code 2 (click
  ParamType validation) instead of 1, check KEY=VALUE in output
- Merge dev to pick up #1477 rename and #1494 replay loop
---
 dimos/core/test_mcp_integration.py    | 12 ++++++------
 dimos/core/tests/demo_mcp_killtest.py |  2 +-
 dimos/robot/cli/dimos.py              |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/dimos/core/test_mcp_integration.py b/dimos/core/test_mcp_integration.py
index 8ec06c6bca..543b9a7fbd 100644
--- a/dimos/core/test_mcp_integration.py
+++ b/dimos/core/test_mcp_integration.py
@@ -84,7 +84,7 @@ def mcp_shared(request: pytest.FixtureRequest) -> Generator[ModuleCoordinator, N
     """Build a shared StressTestModule + McpServer.  Class-scoped -- started
     once, torn down after every test in the class finishes.  Use for
     read-only tests that don't stop/restart the server."""
-    global_config.update(viewer_backend="none", n_workers=1)
+    global_config.update(viewer="none", n_workers=1)
     bp = autoconnect(
         StressTestModule.blueprint(),
         McpServer.blueprint(),
@@ -283,8 +283,8 @@ def test_rapid_tool_calls(self, mcp_shared: ModuleCoordinator) -> None:
     def test_cli_call_tool_wrong_arg_format(self, mcp_shared: ModuleCoordinator) -> None:
         """dimos mcp call with bad --arg format should error."""
         result = CliRunner().invoke(main, ["mcp", "call", "echo", "--arg", "no_equals_sign"])
-        assert result.exit_code == 1
-        assert "key=value" in result.output
+        assert result.exit_code == 2  # click ParamType validation error
+        assert "KEY=VALUE" in result.output
 
     def test_cli_call_json_args(self, mcp_shared: ModuleCoordinator) -> None:
         """dimos mcp call --json-args should work."""
@@ -334,7 +334,7 @@ class TestDaemonMCPRecovery:
 
     def test_restart_after_clean_stop(self) -> None:
         """Stop then start again -- MCP should come back."""
-        global_config.update(viewer_backend="none", n_workers=1)
+        global_config.update(viewer="none", n_workers=1)
 
         # First run
         bp1 = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
@@ -383,7 +383,7 @@ class TestMCPRapidRestart:
 
     def test_three_restart_cycles(self) -> None:
         """Start -> stop -> start 3 times -- no port conflicts."""
-        global_config.update(viewer_backend="none", n_workers=1)
+        global_config.update(viewer="none", n_workers=1)
 
         for cycle in range(3):
             bp = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
@@ -403,7 +403,7 @@ class TestMCPNoServer:
 
     def test_mcp_dead_after_stop(self) -> None:
         """After coordinator.stop(), MCP should stop responding."""
-        global_config.update(viewer_backend="none", n_workers=1)
+        global_config.update(viewer="none", n_workers=1)
         bp = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
         coord = bp.build()
         assert _adapter().wait_for_ready(), "MCP server did not start"
diff --git a/dimos/core/tests/demo_mcp_killtest.py b/dimos/core/tests/demo_mcp_killtest.py
index 33330f540c..b933bb9a1c 100644
--- a/dimos/core/tests/demo_mcp_killtest.py
+++ b/dimos/core/tests/demo_mcp_killtest.py
@@ -95,7 +95,7 @@ def run_blueprint_in_process(ready_event: multiprocessing.synchronize.Event) ->
     from dimos.core.global_config import global_config
     from dimos.core.tests.stress_test_module import StressTestModule
 
-    global_config.update(viewer_backend="none", n_workers=1)
+    global_config.update(viewer="none", n_workers=1)
     bp = autoconnect(StressTestModule.blueprint(), McpServer.blueprint())
     coord = bp.build()
     ready_event.set()
diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index d1e12be394..fd120be784 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -309,7 +309,7 @@ def convert(
 @mcp_app.command("call")
 def mcp_call_tool(
     tool_name: str = typer.Argument(..., help="Tool name to call"),
-    args: list[tuple[str, Any]] = typer.Option(
+    args: list[str] = typer.Option(
         [], "--arg", "-a", click_type=_KeyValueType(), help="Arguments as key=value"
     ),
     json_args: str = typer.Option("", "--json-args", "-j", help="Arguments as JSON string"),
@@ -323,7 +323,7 @@ def mcp_call_tool(
             typer.echo(f"Error: invalid JSON in --json-args: {e}", err=True)
             raise typer.Exit(1)
     else:
-        arguments = dict(args)
+        arguments = dict(args)  # _KeyValueType returns (key, val) tuples
 
     try:
         result = _get_adapter().call_tool(tool_name, arguments)

From 6cf6f270fb264cefd9a403a621ba91be82728430 Mon Sep 17 00:00:00 2001
From: spomichter <pomichterstash@gmail.com>
Date: Mon, 9 Mar 2026 15:18:52 +0000
Subject: [PATCH 37/37] fix: mypy arg-type error for KeyValueType dict(args)

---
 dimos/robot/cli/dimos.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py
index fd120be784..1e95a65dcc 100644
--- a/dimos/robot/cli/dimos.py
+++ b/dimos/robot/cli/dimos.py
@@ -323,7 +323,8 @@ def mcp_call_tool(
             typer.echo(f"Error: invalid JSON in --json-args: {e}", err=True)
             raise typer.Exit(1)
     else:
-        arguments = dict(args)  # _KeyValueType returns (key, val) tuples
+        # _KeyValueType.convert() returns (key, val) tuples at runtime
+        arguments = dict(args)  # type: ignore[arg-type]
 
     try:
         result = _get_adapter().call_tool(tool_name, arguments)