Merged
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,10 @@ lmcode uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Added
- **`max_rounds` safety boundary** — `model.act()` now runs with `max_prediction_rounds` set from `agent.max_rounds` (default `10`, down from the previously unused `50`). Applies to ask and auto modes (strict routes through `model.respond()` which has no round concept). When the limit is hit the agent prints an inline warning — `agent stopped after N rounds — raise the limit with LMCODE_AGENT__MAX_ROUNDS=N, set agent.max_rounds in config, or pass --max-rounds N on the CLI` — instead of letting runaway tool loops burn the context window. Handles both cases: the well-behaved model finishing its final tool-free round at the cap, and the stubborn model that still emits a `tool_call` on the final round (SDK raises `LMStudioPredictionError` with `"final prediction round"` in the message, now caught specifically in `_run_turn` so the broader `LMStudioServerError` catch at `run()` level does not mistake it for a disconnect). (#97)
- **CLI `--max-rounds N` flag is now actually wired** — previously accepted by `lmcode chat` but silently ignored. Now mutates `get_settings().agent.max_rounds` for the session before launching the agent, so `lmcode chat --max-rounds 25` works as expected.
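For context, the SDK behaviour this entry leans on can be sketched with the synchronous lmstudio-python API. This is an illustrative sketch, not lmcode's agent code: the agent uses the async client, its real tool set, and its own callbacks, and the `read_file` helper and prompts below are stand-ins.

```python
# Illustrative sketch of the max_prediction_rounds cap; not lmcode's agent code.
import lmstudio as lms


def read_file(path: str) -> str:
    """Return the contents of a text file (toy tool for illustration)."""
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


model = lms.llm()  # assumes a model is already loaded in LM Studio
chat = lms.Chat("You are a coding assistant.")
chat.add_user_message("Read README.md and summarise it.")

try:
    result = model.act(chat, tools=[read_file], max_prediction_rounds=10)
    # A clean finish that still used every round means the answer may be truncated.
    if getattr(result, "rounds", 0) >= 10:
        print("hit the round cap: answer may be incomplete")
except lms.LMStudioPredictionError as exc:
    # Raised when the model still requests a tool on the capped final round.
    print(f"hit the round cap: {exc}")
```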

### Fixed
- **Strict mode now truly disables tools** — previously the `strict` permission mode label said "no tools — pure chat only" but the runtime still passed the full tool list to `model.act()`, so the model could happily emit tool calls and the runtime would execute them silently. Strict now routes through `model.respond()` — the pure-chat SDK primitive that has no tool concept at all — so the model never even sees a tool schema. (`model.act(tools=[])` is not a viable alternative: the SDK rejects it with `LMStudioValueError`.) (#99)
- **Strict mode hard system prompt (second layer of defence)** — the SDK-level fix stops the model from emitting *real* tool calls, but cannot stop it from *fabricating* tool output in plain text (e.g. Qwen 7B happily replied "Here is the file content:" followed by invented code because the base prompt said "Always call `read_file` before describing file contents"). Strict now passes a *separate* `lms.Chat` seeded with a dedicated `_STRICT_SYSTEM_PROMPT` that explicitly forbids pretending to read files, fabricating output, or describing tool calls — while still listing what the model *can* do (review pasted code, answer general questions, rubber-duck debugging). `self._chat` is untouched so Tab-switching back to ask/auto keeps the base prompt and full history. (#99)
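The two strict-mode fixes above reduce to swapping the SDK primitive and the system prompt. A minimal sketch of the shape, again using the synchronous API; the prompt wording and the `strict_reply` helper are assumptions for illustration, not the PR's literal `_STRICT_SYSTEM_PROMPT` or agent code.

```python
# Sketch of strict-mode routing through respond(); prompt text is assumed.
import lmstudio as lms

_STRICT_SYSTEM_PROMPT = (
    "You have no tools. Never pretend to read files, never fabricate tool "
    "output, and never describe tool calls. You may review pasted code, "
    "answer general questions, and help rubber-duck debugging."
)


def strict_reply(model: lms.LLM, user_message: str) -> str:
    # A separate Chat keeps the strict prompt out of the main session history.
    strict_chat = lms.Chat(_STRICT_SYSTEM_PROMPT)
    strict_chat.add_user_message(user_message)
    # respond() is the pure-chat primitive: no tool schema ever reaches the model.
    return model.respond(strict_chat).content
```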
7 changes: 4 additions & 3 deletions docs/features/config.md
@@ -23,8 +23,9 @@ base_url = "http://localhost:1234"
model = "auto"

[agent]
# Maximum number of agent loop iterations per run
max_rounds = 50
# Max tool-call rounds per turn. Applies to ask and auto modes.
# Override for a session with --max-rounds N or LMCODE_AGENT__MAX_ROUNDS=N.
max_rounds = 10

# Whether to confirm before writing files
confirm_writes = false
@@ -50,7 +51,7 @@ class LMStudioSettings(BaseSettings):
model: str = "auto"

class AgentSettings(BaseSettings):
max_rounds: int = 50
max_rounds: int = 10
confirm_writes: bool = False

class SessionSettings(BaseSettings):
76 changes: 68 additions & 8 deletions src/lmcode/agent/core.py
@@ -75,6 +75,17 @@
# Spinner / tips constants
# ---------------------------------------------------------------------------

#: Substring found in :class:`lms.LMStudioPredictionError` when the model
#: requests a tool call on the final round that was capped by
#: ``max_prediction_rounds``. The SDK raises unconditionally in that case
#: (our ``handle_invalid_tool_request`` callback can only override the
#: message), so we match on this marker to convert the crash into a
#: graceful "limit reached" warning instead of letting the run() loop's
#: broader ``LMStudioServerError`` catch mistake it for a disconnect.
#: Source of truth: ``lmstudio.async_api.AsyncLLM.act`` — the string is
#: literally ``"Model requested tool use on final prediction round."``.
_MAX_ROUNDS_ERR_MARKER: str = "final prediction round"

#: Rich spinner style used during model inference.
_SPINNER: str = "circleHalves"

@@ -382,6 +393,11 @@ def __init__(self, model_id: str = "auto") -> None:
self._show_stats: bool = get_settings().ui.show_stats
self._always_allowed_tools: set[str] = set()
self._inference_config: dict[str, Any] = {} # passed as config= to model.act()
# Set by _run_turn when ``max_prediction_rounds`` was hit this turn
# (either the SDK raised LMStudioPredictionError on the final round,
# or ActResult.rounds reached the configured cap). run() reads this
# after _run_turn returns to print an inline warning to the user.
self._last_turn_limit_reached: bool = False
# Set by _wrap_tool when the user rejects a tool call and types a
# redirect instruction. run() detects this after the turn ends and
# feeds the instruction back as a brand-new user message.
@@ -1185,6 +1201,13 @@ async def _keepalive() -> None:
act_result: Any = None
respond_result: Any = None
strict_start: float | None = None
# Reset the flag at the start of every turn. run() reads it after
# _run_turn returns to decide whether to print the limit warning.
self._last_turn_limit_reached = False
# Pull max_rounds here (not later) so tests can patch
# get_settings() once and see a single call. None disables the cap.
max_rounds = get_settings().agent.max_rounds
max_prediction_rounds: int | None = max_rounds if max_rounds and max_rounds > 0 else None
try:
if self._mode == "strict":
# Strict mode (#99): use ``model.respond()`` — the pure
@@ -1212,18 +1235,43 @@ async def _keepalive() -> None:
on_prediction_fragment=_on_fragment,
)
else:
act_result = await model.act(
chat,
tools=tools,
config=self._inference_config if self._inference_config else None,
on_message=_on_message,
on_prediction_completed=_on_prediction_completed,
on_prediction_fragment=_on_fragment,
)
try:
act_result = await model.act(
chat,
tools=tools,
max_prediction_rounds=max_prediction_rounds,
config=self._inference_config if self._inference_config else None,
on_message=_on_message,
on_prediction_completed=_on_prediction_completed,
on_prediction_fragment=_on_fragment,
)
except lms.LMStudioPredictionError as e:
# Max-rounds hit with a stubborn model: the SDK disabled
# tools on the final round, the model still emitted a
# tool_call, and the SDK raised unconditionally. Convert
# that into a graceful limit-reached signal so run()
# prints a warning instead of the run() top-level catch
# mistaking it for a server disconnect (LMStudioServerError
# is a *parent* of LMStudioPredictionError).
if _MAX_ROUNDS_ERR_MARKER in str(e):
self._last_turn_limit_reached = True
else:
raise
finally:
stop_evt.set()
await keepalive

# Well-behaved model case: the SDK forced a tool-free final round and
# the model produced a normal text response, but rounds == the cap
# still means the task likely got truncated. Flag it so the user
# knows why the answer might feel incomplete.
if (
act_result is not None
and max_prediction_rounds is not None
and getattr(act_result, "rounds", 0) >= max_prediction_rounds
):
self._last_turn_limit_reached = True

# ``respond()`` returns a single ``PredictionResult`` with ``.stats``
# directly, whereas ``act()`` fires ``_on_prediction_completed`` once
# per round. Normalise so the downstream stats code runs identically.
@@ -1398,6 +1446,18 @@ def _cycle_mode() -> None:
f"[{TEXT_MUTED}] — run /compact to summarise "
f"the conversation[/]"
)

if self._last_turn_limit_reached:
cap = get_settings().agent.max_rounds
console.print(
f"\n[{WARNING}]agent stopped after {cap} rounds[/]"
f"[{TEXT_MUTED}] — raise the limit with "
f"LMCODE_AGENT__MAX_ROUNDS=N, set "
f"agent.max_rounds in config, or pass "
f"--max-rounds N on the CLI. "
f"Type to continue the conversation.[/]"
)

console.print(Rule(style=f"dim {ACCENT}"))

# If the user rejected a tool call with redirect
3 changes: 2 additions & 1 deletion src/lmcode/cli/app.py
@@ -42,7 +42,8 @@ def main(
if ctx.invoked_subcommand is None:
from lmcode.cli.chat import chat

chat(model="auto", max_rounds=50)
# max_rounds=None → use agent.max_rounds from config.toml / env var
chat(model="auto", max_rounds=None)


# ---------------------------------------------------------------------------
11 changes: 10 additions & 1 deletion src/lmcode/cli/chat.py
@@ -385,10 +385,19 @@ def _exit_no_model() -> None:
@app.callback(invoke_without_command=True)
def chat(
model: str = typer.Option("auto", "--model", "-m", help="Model ID (default: auto-detect)."),
max_rounds: int = typer.Option(50, "--max-rounds", help="Maximum agent loop iterations."),
max_rounds: int | None = typer.Option(
None,
"--max-rounds",
help="Max tool-call rounds per turn (overrides agent.max_rounds in config).",
),
) -> None:
"""Start an interactive coding agent session in the current directory."""
settings = get_settings()
if max_rounds is not None:
# CLI flag overrides config.toml / env var for this session only.
# The Agent reads get_settings().agent.max_rounds at the start of
# every turn, so mutating it here is enough — no re-init needed.
settings.agent.max_rounds = max_rounds
connected, detected_model = _probe_lmstudio()

if not connected:
10 changes: 9 additions & 1 deletion src/lmcode/config/settings.py
@@ -32,7 +32,15 @@ def base_url(self) -> str:
class AgentSettings(BaseSettings):
"""Runtime behaviour settings for the agent loop."""

max_rounds: int = 50
#: Maximum number of tool-call rounds inside a single ``model.act()``
#: invocation. Passed to the LM Studio SDK as ``max_prediction_rounds``.
#: 10 is the sweet spot for Qwen 7B: enough headroom for multi-file
#: edits, tight enough to stop runaway loops before they burn context.
#: Raise via ``LMCODE_AGENT__MAX_ROUNDS=N`` or ``agent.max_rounds`` in
#: ``config.toml``, or pass ``--max-rounds N`` on the CLI for a single
#: session. Applies to ask and auto modes; strict mode routes through
#: ``model.respond()`` which has no round concept.
max_rounds: int = 10
permission_mode: Literal["ask", "auto", "strict"] = "ask"
timeout_seconds: int = 30
max_file_bytes: int = 100_000 # ~100 KB; see issue #2 for token-aware improvement
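A note on the `LMCODE_AGENT__MAX_ROUNDS` spelling used in the docstring: it implies the usual pydantic-settings nested-env-var wiring on the top-level settings object, roughly like the sketch below. The prefix and delimiter here are inferred from the variable name, not copied from lmcode's actual settings module.

```python
# Hypothetical wiring sketch; the prefix/delimiter are assumptions, and
# lmcode's real top-level Settings class may be structured differently.
from pydantic_settings import BaseSettings, SettingsConfigDict

from lmcode.config.settings import AgentSettings  # class shown in the diff above


class Settings(BaseSettings):
    model_config = SettingsConfigDict(
        env_prefix="LMCODE_",
        env_nested_delimiter="__",
    )

    agent: AgentSettings = AgentSettings()


# With that wiring, LMCODE_AGENT__MAX_ROUNDS=25 lands in settings.agent.max_rounds,
# and `lmcode chat --max-rounds 25` overrides both for a single session by mutating
# get_settings().agent.max_rounds before the agent starts.
```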
166 changes: 163 additions & 3 deletions tests/test_agent/test_core.py
@@ -74,8 +74,15 @@ def test_agent_ensure_chat_returns_same_instance() -> None:
# ---------------------------------------------------------------------------


def _make_mock_model(response_text: str) -> MagicMock:
"""Build a mock model whose act() calls on_message with a fake AssistantResponse."""
def _make_mock_model(response_text: str, rounds: int = 1) -> MagicMock:
"""Build a mock model whose act() calls on_message with a fake AssistantResponse.

The returned ``ActResult``-shaped mock has a real ``rounds`` int
(default ``1``, well below the default ``max_rounds=10`` cap) so
the post-act limit check in :meth:`Agent._run_turn` does not crash
on a ``MagicMock >= int`` comparison. Pass ``rounds=N`` to simulate
a turn that used all N rounds for the limit-warning test.
"""

async def act_side_effect(
chat: object, tools: object, on_message: object = None, **kwargs: object
@@ -88,7 +95,12 @@ async def act_side_effect(
msg.role = "assistant"
msg.tool_calls = None
on_message(msg)
return MagicMock()
# Return an ActResult-shaped mock with concrete numeric fields so
# `getattr(result, "rounds", 0) >= max_prediction_rounds` works.
result = MagicMock()
result.rounds = rounds
result.total_time_seconds = 0.1
return result

mock = MagicMock()
mock.act = AsyncMock(side_effect=act_side_effect)
@@ -294,6 +306,154 @@ async def test_run_turn_non_strict_modes_use_act_with_tools() -> None:
)


# ---------------------------------------------------------------------------
# max_rounds safety boundary (#97)
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_run_turn_passes_max_rounds_to_act() -> None:
"""``max_prediction_rounds`` must flow from the agent config into ``model.act()``.

Pins the config→SDK plumbing: if someone removes the kwarg from the
``act()`` call, auto mode is once again unbounded and can loop
forever — the exact bug #97 was filed to fix.
"""
agent = Agent()
agent._mode = "auto"
mock_model = _make_mock_model("ok")

# Patch get_settings so the test does not depend on ~/.config state.
mock_settings = MagicMock()
mock_settings.agent.max_rounds = 7
mock_settings.agent.max_file_bytes = 100_000
with (
patch("lmcode.agent.core.read_lmcode_md", return_value=None),
patch("lmcode.agent.core.get_settings", return_value=mock_settings),
):
await agent._run_turn(mock_model, "hi")

call_kwargs = mock_model.act.await_args.kwargs
assert call_kwargs["max_prediction_rounds"] == 7


@pytest.mark.asyncio
async def test_run_turn_max_rounds_none_when_config_zero() -> None:
"""A non-positive ``max_rounds`` disables the cap (passes ``None`` to the SDK).

Users who explicitly set ``max_rounds = 0`` in config.toml are opting
out of the safety boundary; we must not send ``0`` to the SDK because
``model.act()`` rejects values < 1 with ``LMStudioValueError``.
"""
agent = Agent()
agent._mode = "auto"
mock_model = _make_mock_model("ok")

mock_settings = MagicMock()
mock_settings.agent.max_rounds = 0
mock_settings.agent.max_file_bytes = 100_000
with (
patch("lmcode.agent.core.read_lmcode_md", return_value=None),
patch("lmcode.agent.core.get_settings", return_value=mock_settings),
):
await agent._run_turn(mock_model, "hi")

call_kwargs = mock_model.act.await_args.kwargs
assert call_kwargs["max_prediction_rounds"] is None


@pytest.mark.asyncio
async def test_run_turn_flags_limit_reached_when_rounds_equal_cap() -> None:
"""Well-behaved model case: ``ActResult.rounds == cap`` → set the flag.

Even when the SDK closes the loop cleanly (model gave up tools on the
final round and produced a text answer), reaching the cap means the
task was likely truncated. ``self._last_turn_limit_reached`` must be
set so ``run()`` prints the warning.
"""
agent = Agent()
agent._mode = "auto"
mock_model = _make_mock_model("done", rounds=5)

mock_settings = MagicMock()
mock_settings.agent.max_rounds = 5
mock_settings.agent.max_file_bytes = 100_000
with (
patch("lmcode.agent.core.read_lmcode_md", return_value=None),
patch("lmcode.agent.core.get_settings", return_value=mock_settings),
):
await agent._run_turn(mock_model, "hi")

assert agent._last_turn_limit_reached is True


@pytest.mark.asyncio
async def test_run_turn_flags_limit_reached_on_final_round_error() -> None:
"""Stubborn model case: SDK raises ``LMStudioPredictionError`` on the final round.

When the model ignores the tools-disabled signal on the final round
and emits a tool_call anyway, the SDK raises unconditionally with a
message containing ``"final prediction round"``. ``_run_turn`` must
catch this *specific* error, set the limit flag, and NOT let it
bubble to the ``run()`` top-level handler (which catches
``LMStudioServerError`` — the parent class — and would misinterpret
the crash as a server disconnect).
"""
import lmstudio as lms

agent = Agent()
agent._mode = "auto"

async def _raise_final_round(
chat: object, tools: object, on_message: object = None, **kwargs: object
) -> None:
raise lms.LMStudioPredictionError("Model requested tool use on final prediction round.")

mock_model = MagicMock()
mock_model.act = AsyncMock(side_effect=_raise_final_round)

mock_settings = MagicMock()
mock_settings.agent.max_rounds = 3
mock_settings.agent.max_file_bytes = 100_000
with (
patch("lmcode.agent.core.read_lmcode_md", return_value=None),
patch("lmcode.agent.core.get_settings", return_value=mock_settings),
):
# Must NOT raise — the error is caught and converted to a flag.
await agent._run_turn(mock_model, "loop forever please")

assert agent._last_turn_limit_reached is True


@pytest.mark.asyncio
async def test_run_turn_reraises_unrelated_prediction_error() -> None:
"""A prediction error that is NOT the final-round case must propagate.

The catch in ``_run_turn`` is narrow on purpose — matching only the
SDK's ``"final prediction round"`` marker. Any other
``LMStudioPredictionError`` (malformed tool schema, invalid
response format, etc.) is a real bug the user needs to see.
"""
import lmstudio as lms

agent = Agent()
agent._mode = "auto"

async def _raise_other(
chat: object, tools: object, on_message: object = None, **kwargs: object
) -> None:
raise lms.LMStudioPredictionError("something else went wrong")

mock_model = MagicMock()
mock_model.act = AsyncMock(side_effect=_raise_other)

with patch("lmcode.agent.core.read_lmcode_md", return_value=None):
with pytest.raises(lms.LMStudioPredictionError, match="something else"):
await agent._run_turn(mock_model, "hi")

assert agent._last_turn_limit_reached is False


# ---------------------------------------------------------------------------
# _wrap_tool_verbose — positional-arg merging
# ---------------------------------------------------------------------------