diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ca4d2f..02b7f9a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ lmcode uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- **`max_rounds` safety boundary** — `model.act()` now runs with `max_prediction_rounds` set from `agent.max_rounds` (default `10`, down from the previously unused `50`). Applies to ask and auto modes (strict routes through `model.respond()` which has no round concept). When the limit is hit the agent prints an inline warning — `agent stopped after N rounds — raise the limit with LMCODE_AGENT__MAX_ROUNDS=N, set agent.max_rounds in config, or pass --max-rounds N on the CLI` — instead of letting runaway tool loops burn the context window. Handles both cases: the well-behaved model finishing its final tool-free round at the cap, and the stubborn model that still emits a `tool_call` on the final round (SDK raises `LMStudioPredictionError` with `"final prediction round"` in the message, now caught specifically in `_run_turn` so the broader `LMStudioServerError` catch at `run()` level does not mistake it for a disconnect). (#97) +- **CLI `--max-rounds N` flag is now actually wired** — previously accepted by `lmcode chat` but silently ignored. Now mutates `get_settings().agent.max_rounds` for the session before launching the agent, so `lmcode chat --max-rounds 25` works as expected. + ### Fixed - **Strict mode now truly disables tools** — previously the `strict` permission mode label said "no tools — pure chat only" but the runtime still passed the full tool list to `model.act()`, so the model could happily emit tool calls and the runtime would execute them silently. Strict now routes through `model.respond()` — the pure-chat SDK primitive that has no tool concept at all — so the model never even sees a tool schema. (`model.act(tools=[])` is not a viable alternative: the SDK rejects it with `LMStudioValueError`.) (#99) - **Strict mode hard system prompt (second layer of defence)** — the SDK-level fix stops the model from emitting *real* tool calls, but cannot stop it from *fabricating* tool output in plain text (e.g. Qwen 7B happily replied "Here is the file content:" followed by invented code because the base prompt said "Always call `read_file` before describing file contents"). Strict now passes a *separate* `lms.Chat` seeded with a dedicated `_STRICT_SYSTEM_PROMPT` that explicitly forbids pretending to read files, fabricating output, or describing tool calls — while still listing what the model *can* do (review pasted code, answer general questions, rubber-duck debugging). `self._chat` is untouched so Tab-switching back to ask/auto keeps the base prompt and full history. (#99) diff --git a/docs/features/config.md b/docs/features/config.md index 666bff6..b4c048b 100644 --- a/docs/features/config.md +++ b/docs/features/config.md @@ -23,8 +23,9 @@ base_url = "http://localhost:1234" model = "auto" [agent] -# Maximum number of agent loop iterations per run -max_rounds = 50 +# Max tool-call rounds per turn. Applies to ask and auto modes. +# Override for a session with --max-rounds N or LMCODE_AGENT__MAX_ROUNDS=N. +max_rounds = 10 # Whether to confirm before writing files confirm_writes = false @@ -50,7 +51,7 @@ class LMStudioSettings(BaseSettings): model: str = "auto" class AgentSettings(BaseSettings): - max_rounds: int = 50 + max_rounds: int = 10 confirm_writes: bool = False class SessionSettings(BaseSettings): diff --git a/src/lmcode/agent/core.py b/src/lmcode/agent/core.py index 8e9c190..e15a407 100644 --- a/src/lmcode/agent/core.py +++ b/src/lmcode/agent/core.py @@ -75,6 +75,17 @@ # Spinner / tips constants # --------------------------------------------------------------------------- +#: Substring found in :class:`lms.LMStudioPredictionError` when the model +#: requests a tool call on the final round that was capped by +#: ``max_prediction_rounds``. The SDK raises unconditionally in that case +#: (our ``handle_invalid_tool_request`` callback can only override the +#: message), so we match on this marker to convert the crash into a +#: graceful "limit reached" warning instead of letting the run() loop's +#: broader ``LMStudioServerError`` catch mistake it for a disconnect. +#: Source of truth: ``lmstudio.async_api.AsyncLLM.act`` — the string is +#: literally ``"Model requested tool use on final prediction round."``. +_MAX_ROUNDS_ERR_MARKER: str = "final prediction round" + #: Rich spinner style used during model inference. _SPINNER: str = "circleHalves" @@ -382,6 +393,11 @@ def __init__(self, model_id: str = "auto") -> None: self._show_stats: bool = get_settings().ui.show_stats self._always_allowed_tools: set[str] = set() self._inference_config: dict[str, Any] = {} # passed as config= to model.act() + # Set by _run_turn when ``max_prediction_rounds`` was hit this turn + # (either the SDK raised LMStudioPredictionError on the final round, + # or ActResult.rounds reached the configured cap). run() reads this + # after _run_turn returns to print an inline warning to the user. + self._last_turn_limit_reached: bool = False # Set by _wrap_tool when the user rejects a tool call and types a # redirect instruction. run() detects this after the turn ends and # feeds the instruction back as a brand-new user message. @@ -1185,6 +1201,13 @@ async def _keepalive() -> None: act_result: Any = None respond_result: Any = None strict_start: float | None = None + # Reset the flag at the start of every turn. run() reads it after + # _run_turn returns to decide whether to print the limit warning. + self._last_turn_limit_reached = False + # Pull max_rounds here (not later) so tests can patch + # get_settings() once and see a single call. None disables the cap. + max_rounds = get_settings().agent.max_rounds + max_prediction_rounds: int | None = max_rounds if max_rounds and max_rounds > 0 else None try: if self._mode == "strict": # Strict mode (#99): use ``model.respond()`` — the pure @@ -1212,18 +1235,43 @@ async def _keepalive() -> None: on_prediction_fragment=_on_fragment, ) else: - act_result = await model.act( - chat, - tools=tools, - config=self._inference_config if self._inference_config else None, - on_message=_on_message, - on_prediction_completed=_on_prediction_completed, - on_prediction_fragment=_on_fragment, - ) + try: + act_result = await model.act( + chat, + tools=tools, + max_prediction_rounds=max_prediction_rounds, + config=self._inference_config if self._inference_config else None, + on_message=_on_message, + on_prediction_completed=_on_prediction_completed, + on_prediction_fragment=_on_fragment, + ) + except lms.LMStudioPredictionError as e: + # Max-rounds hit with a stubborn model: the SDK disabled + # tools on the final round, the model still emitted a + # tool_call, and the SDK raised unconditionally. Convert + # that into a graceful limit-reached signal so run() + # prints a warning instead of the run() top-level catch + # mistaking it for a server disconnect (LMStudioServerError + # is a *parent* of LMStudioPredictionError). + if _MAX_ROUNDS_ERR_MARKER in str(e): + self._last_turn_limit_reached = True + else: + raise finally: stop_evt.set() await keepalive + # Well-behaved model case: the SDK forced a tool-free final round and + # the model produced a normal text response, but rounds == the cap + # still means the task likely got truncated. Flag it so the user + # knows why the answer might feel incomplete. + if ( + act_result is not None + and max_prediction_rounds is not None + and getattr(act_result, "rounds", 0) >= max_prediction_rounds + ): + self._last_turn_limit_reached = True + # ``respond()`` returns a single ``PredictionResult`` with ``.stats`` # directly, whereas ``act()`` fires ``_on_prediction_completed`` once # per round. Normalise so the downstream stats code runs identically. @@ -1398,6 +1446,18 @@ def _cycle_mode() -> None: f"[{TEXT_MUTED}] — run /compact to summarise " f"the conversation[/]" ) + + if self._last_turn_limit_reached: + cap = get_settings().agent.max_rounds + console.print( + f"\n[{WARNING}]agent stopped after {cap} rounds[/]" + f"[{TEXT_MUTED}] — raise the limit with " + f"LMCODE_AGENT__MAX_ROUNDS=N, set " + f"agent.max_rounds in config, or pass " + f"--max-rounds N on the CLI. " + f"Type to continue the conversation.[/]" + ) + console.print(Rule(style=f"dim {ACCENT}")) # If the user rejected a tool call with redirect diff --git a/src/lmcode/cli/app.py b/src/lmcode/cli/app.py index 0358da6..39b45e0 100644 --- a/src/lmcode/cli/app.py +++ b/src/lmcode/cli/app.py @@ -42,7 +42,8 @@ def main( if ctx.invoked_subcommand is None: from lmcode.cli.chat import chat - chat(model="auto", max_rounds=50) + # max_rounds=None → use agent.max_rounds from config.toml / env var + chat(model="auto", max_rounds=None) # --------------------------------------------------------------------------- diff --git a/src/lmcode/cli/chat.py b/src/lmcode/cli/chat.py index 7b2e7f4..f616607 100644 --- a/src/lmcode/cli/chat.py +++ b/src/lmcode/cli/chat.py @@ -385,10 +385,19 @@ def _exit_no_model() -> None: @app.callback(invoke_without_command=True) def chat( model: str = typer.Option("auto", "--model", "-m", help="Model ID (default: auto-detect)."), - max_rounds: int = typer.Option(50, "--max-rounds", help="Maximum agent loop iterations."), + max_rounds: int | None = typer.Option( + None, + "--max-rounds", + help="Max tool-call rounds per turn (overrides agent.max_rounds in config).", + ), ) -> None: """Start an interactive coding agent session in the current directory.""" settings = get_settings() + if max_rounds is not None: + # CLI flag overrides config.toml / env var for this session only. + # The Agent reads get_settings().agent.max_rounds at the start of + # every turn, so mutating it here is enough — no re-init needed. + settings.agent.max_rounds = max_rounds connected, detected_model = _probe_lmstudio() if not connected: diff --git a/src/lmcode/config/settings.py b/src/lmcode/config/settings.py index 4e5eeb7..362c3d7 100644 --- a/src/lmcode/config/settings.py +++ b/src/lmcode/config/settings.py @@ -32,7 +32,15 @@ def base_url(self) -> str: class AgentSettings(BaseSettings): """Runtime behaviour settings for the agent loop.""" - max_rounds: int = 50 + #: Maximum number of tool-call rounds inside a single ``model.act()`` + #: invocation. Passed to the LM Studio SDK as ``max_prediction_rounds``. + #: 10 is the sweet spot for Qwen 7B: enough headroom for multi-file + #: edits, tight enough to stop runaway loops before they burn context. + #: Raise via ``LMCODE_AGENT__MAX_ROUNDS=N`` or ``agent.max_rounds`` in + #: ``config.toml``, or pass ``--max-rounds N`` on the CLI for a single + #: session. Applies to ask and auto modes; strict mode routes through + #: ``model.respond()`` which has no round concept. + max_rounds: int = 10 permission_mode: Literal["ask", "auto", "strict"] = "ask" timeout_seconds: int = 30 max_file_bytes: int = 100_000 # ~100 KB; see issue #2 for token-aware improvement diff --git a/tests/test_agent/test_core.py b/tests/test_agent/test_core.py index 0e97fe1..78f5a7a 100644 --- a/tests/test_agent/test_core.py +++ b/tests/test_agent/test_core.py @@ -74,8 +74,15 @@ def test_agent_ensure_chat_returns_same_instance() -> None: # --------------------------------------------------------------------------- -def _make_mock_model(response_text: str) -> MagicMock: - """Build a mock model whose act() calls on_message with a fake AssistantResponse.""" +def _make_mock_model(response_text: str, rounds: int = 1) -> MagicMock: + """Build a mock model whose act() calls on_message with a fake AssistantResponse. + + The returned ``ActResult``-shaped mock has a real ``rounds`` int + (default ``1``, well below the default ``max_rounds=10`` cap) so + the post-act limit check in :meth:`Agent._run_turn` does not crash + on a ``MagicMock >= int`` comparison. Pass ``rounds=N`` to simulate + a turn that used all N rounds for the limit-warning test. + """ async def act_side_effect( chat: object, tools: object, on_message: object = None, **kwargs: object @@ -88,7 +95,12 @@ async def act_side_effect( msg.role = "assistant" msg.tool_calls = None on_message(msg) - return MagicMock() + # Return an ActResult-shaped mock with concrete numeric fields so + # `getattr(result, "rounds", 0) >= max_prediction_rounds` works. + result = MagicMock() + result.rounds = rounds + result.total_time_seconds = 0.1 + return result mock = MagicMock() mock.act = AsyncMock(side_effect=act_side_effect) @@ -294,6 +306,154 @@ async def test_run_turn_non_strict_modes_use_act_with_tools() -> None: ) +# --------------------------------------------------------------------------- +# max_rounds safety boundary (#97) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_run_turn_passes_max_rounds_to_act() -> None: + """``max_prediction_rounds`` must flow from the agent config into ``model.act()``. + + Pins the config→SDK plumbing: if someone removes the kwarg from the + ``act()`` call, auto mode is once again unbounded and can loop + forever — the exact bug #97 was filed to fix. + """ + agent = Agent() + agent._mode = "auto" + mock_model = _make_mock_model("ok") + + # Patch get_settings so the test does not depend on ~/.config state. + mock_settings = MagicMock() + mock_settings.agent.max_rounds = 7 + mock_settings.agent.max_file_bytes = 100_000 + with ( + patch("lmcode.agent.core.read_lmcode_md", return_value=None), + patch("lmcode.agent.core.get_settings", return_value=mock_settings), + ): + await agent._run_turn(mock_model, "hi") + + call_kwargs = mock_model.act.await_args.kwargs + assert call_kwargs["max_prediction_rounds"] == 7 + + +@pytest.mark.asyncio +async def test_run_turn_max_rounds_none_when_config_zero() -> None: + """A non-positive ``max_rounds`` disables the cap (passes ``None`` to the SDK). + + Users who explicitly set ``max_rounds = 0`` in config.toml are opting + out of the safety boundary; we must not send ``0`` to the SDK because + ``model.act()`` rejects values < 1 with ``LMStudioValueError``. + """ + agent = Agent() + agent._mode = "auto" + mock_model = _make_mock_model("ok") + + mock_settings = MagicMock() + mock_settings.agent.max_rounds = 0 + mock_settings.agent.max_file_bytes = 100_000 + with ( + patch("lmcode.agent.core.read_lmcode_md", return_value=None), + patch("lmcode.agent.core.get_settings", return_value=mock_settings), + ): + await agent._run_turn(mock_model, "hi") + + call_kwargs = mock_model.act.await_args.kwargs + assert call_kwargs["max_prediction_rounds"] is None + + +@pytest.mark.asyncio +async def test_run_turn_flags_limit_reached_when_rounds_equal_cap() -> None: + """Well-behaved model case: ``ActResult.rounds == cap`` → set the flag. + + Even when the SDK closes the loop cleanly (model gave up tools on the + final round and produced a text answer), reaching the cap means the + task was likely truncated. ``self._last_turn_limit_reached`` must be + set so ``run()`` prints the warning. + """ + agent = Agent() + agent._mode = "auto" + mock_model = _make_mock_model("done", rounds=5) + + mock_settings = MagicMock() + mock_settings.agent.max_rounds = 5 + mock_settings.agent.max_file_bytes = 100_000 + with ( + patch("lmcode.agent.core.read_lmcode_md", return_value=None), + patch("lmcode.agent.core.get_settings", return_value=mock_settings), + ): + await agent._run_turn(mock_model, "hi") + + assert agent._last_turn_limit_reached is True + + +@pytest.mark.asyncio +async def test_run_turn_flags_limit_reached_on_final_round_error() -> None: + """Stubborn model case: SDK raises ``LMStudioPredictionError`` on the final round. + + When the model ignores the tools-disabled signal on the final round + and emits a tool_call anyway, the SDK raises unconditionally with a + message containing ``"final prediction round"``. ``_run_turn`` must + catch this *specific* error, set the limit flag, and NOT let it + bubble to the ``run()`` top-level handler (which catches + ``LMStudioServerError`` — the parent class — and would misinterpret + the crash as a server disconnect). + """ + import lmstudio as lms + + agent = Agent() + agent._mode = "auto" + + async def _raise_final_round( + chat: object, tools: object, on_message: object = None, **kwargs: object + ) -> None: + raise lms.LMStudioPredictionError("Model requested tool use on final prediction round.") + + mock_model = MagicMock() + mock_model.act = AsyncMock(side_effect=_raise_final_round) + + mock_settings = MagicMock() + mock_settings.agent.max_rounds = 3 + mock_settings.agent.max_file_bytes = 100_000 + with ( + patch("lmcode.agent.core.read_lmcode_md", return_value=None), + patch("lmcode.agent.core.get_settings", return_value=mock_settings), + ): + # Must NOT raise — the error is caught and converted to a flag. + await agent._run_turn(mock_model, "loop forever please") + + assert agent._last_turn_limit_reached is True + + +@pytest.mark.asyncio +async def test_run_turn_reraises_unrelated_prediction_error() -> None: + """A prediction error that is NOT the final-round case must propagate. + + The catch in ``_run_turn`` is narrow on purpose — matching only the + SDK's ``"final prediction round"`` marker. Any other + ``LMStudioPredictionError`` (malformed tool schema, invalid + response format, etc.) is a real bug the user needs to see. + """ + import lmstudio as lms + + agent = Agent() + agent._mode = "auto" + + async def _raise_other( + chat: object, tools: object, on_message: object = None, **kwargs: object + ) -> None: + raise lms.LMStudioPredictionError("something else went wrong") + + mock_model = MagicMock() + mock_model.act = AsyncMock(side_effect=_raise_other) + + with patch("lmcode.agent.core.read_lmcode_md", return_value=None): + with pytest.raises(lms.LMStudioPredictionError, match="something else"): + await agent._run_turn(mock_model, "hi") + + assert agent._last_turn_limit_reached is False + + # --------------------------------------------------------------------------- # _wrap_tool_verbose — positional-arg merging # ---------------------------------------------------------------------------