Merged
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,10 @@ lmcode uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Added
- **`max_rounds` safety boundary** — `model.act()` now runs with `max_prediction_rounds` set from `agent.max_rounds` (default `10`, down from the previously unused `50`). Applies to ask and auto modes (strict routes through `model.respond()` which has no round concept). When the limit is hit the agent prints an inline warning — `agent stopped after N rounds — raise the limit with LMCODE_AGENT__MAX_ROUNDS=N, set agent.max_rounds in config, or pass --max-rounds N on the CLI` — instead of letting runaway tool loops burn the context window. Handles both cases: the well-behaved model finishing its final tool-free round at the cap, and the stubborn model that still emits a `tool_call` on the final round (SDK raises `LMStudioPredictionError` with `"final prediction round"` in the message, now caught specifically in `_run_turn` so the broader `LMStudioServerError` catch at `run()` level does not mistake it for a disconnect). (#97)
- **CLI `--max-rounds N` flag is now actually wired** — previously accepted by `lmcode chat` but silently ignored. Now mutates `get_settings().agent.max_rounds` for the session before launching the agent, so `lmcode chat --max-rounds 25` works as expected.
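For context, the SDK behaviour this entry leans on can be sketched with the synchronous lmstudio-python API. This is an illustrative sketch, not lmcode's agent code: the agent uses the async client, its real tool set, and its own callbacks, and the `read_file` helper and prompts below are stand-ins.

```python
# Illustrative sketch of the max_prediction_rounds cap; not lmcode's agent code.
import lmstudio as lms


def read_file(path: str) -> str:
    """Return the contents of a text file (toy tool for illustration)."""
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


model = lms.llm()  # assumes a model is already loaded in LM Studio
chat = lms.Chat("You are a coding assistant.")
chat.add_user_message("Read README.md and summarise it.")

try:
    result = model.act(chat, tools=[read_file], max_prediction_rounds=10)
    # A clean finish that still used every round means the answer may be truncated.
    if getattr(result, "rounds", 0) >= 10:
        print("hit the round cap: answer may be incomplete")
except lms.LMStudioPredictionError as exc:
    # Raised when the model still requests a tool on the capped final round.
    print(f"hit the round cap: {exc}")
```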

### Fixed
- **Strict mode now truly disables tools** — previously the `strict` permission mode label said "no tools — pure chat only" but the runtime still passed the full tool list to `model.act()`, so the model could happily emit tool calls and the runtime would execute them silently. Strict now routes through `model.respond()` — the pure-chat SDK primitive that has no tool concept at all — so the model never even sees a tool schema. (`model.act(tools=[])` is not a viable alternative: the SDK rejects it with `LMStudioValueError`.) (#99)
- **Strict mode hard system prompt (second layer of defence)** — the SDK-level fix stops the model from emitting *real* tool calls, but cannot stop it from *fabricating* tool output in plain text (e.g. Qwen 7B happily replied "Here is the file content:" followed by invented code because the base prompt said "Always call `read_file` before describing file contents"). Strict now passes a *separate* `lms.Chat` seeded with a dedicated `_STRICT_SYSTEM_PROMPT` that explicitly forbids pretending to read files, fabricating output, or describing tool calls — while still listing what the model *can* do (review pasted code, answer general questions, rubber-duck debugging). `self._chat` is untouched so Tab-switching back to ask/auto keeps the base prompt and full history. (#99)
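The two strict-mode fixes above reduce to swapping the SDK primitive and the system prompt. A minimal sketch of the shape, again using the synchronous API; the prompt wording and the `strict_reply` helper are assumptions for illustration, not the PR's literal `_STRICT_SYSTEM_PROMPT` or agent code.

```python
# Sketch of strict-mode routing through respond(); prompt text is assumed.
import lmstudio as lms

_STRICT_SYSTEM_PROMPT = (
    "You have no tools. Never pretend to read files, never fabricate tool "
    "output, and never describe tool calls. You may review pasted code, "
    "answer general questions, and help rubber-duck debugging."
)


def strict_reply(model: lms.LLM, user_message: str) -> str:
    # A separate Chat keeps the strict prompt out of the main session history.
    strict_chat = lms.Chat(_STRICT_SYSTEM_PROMPT)
    strict_chat.add_user_message(user_message)
    # respond() is the pure-chat primitive: no tool schema ever reaches the model.
    return model.respond(strict_chat).content
```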
7 changes: 4 additions & 3 deletions docs/features/config.md
@@ -23,8 +23,9 @@ base_url = "http://localhost:1234"
model = "auto"

[agent]
# Maximum number of agent loop iterations per run
max_rounds = 50
# Max tool-call rounds per turn. Applies to ask and auto modes.
# Override for a session with --max-rounds N or LMCODE_AGENT__MAX_ROUNDS=N.
max_rounds = 10

# Whether to confirm before writing files
confirm_writes = false
@@ -50,7 +51,7 @@ class LMStudioSettings(BaseSettings):
model: str = "auto"

class AgentSettings(BaseSettings):
max_rounds: int = 50
max_rounds: int = 10
confirm_writes: bool = False

class SessionSettings(BaseSettings):
76 changes: 68 additions & 8 deletions src/lmcode/agent/core.py
@@ -75,6 +75,17 @@
# Spinner / tips constants
# ---------------------------------------------------------------------------

#: Substring found in :class:`lms.LMStudioPredictionError` when the model
#: requests a tool call on the final round that was capped by
#: ``max_prediction_rounds``. The SDK raises unconditionally in that case
#: (our ``handle_invalid_tool_request`` callback can only override the
#: message), so we match on this marker to convert the crash into a
#: graceful "limit reached" warning instead of letting the run() loop's
#: broader ``LMStudioServerError`` catch mistake it for a disconnect.
#: Source of truth: ``lmstudio.async_api.AsyncLLM.act`` — the string is
#: literally ``"Model requested tool use on final prediction round."``.
_MAX_ROUNDS_ERR_MARKER: str = "final prediction round"

#: Rich spinner style used during model inference.
_SPINNER: str = "circleHalves"

@@ -382,6 +393,11 @@ def __init__(self, model_id: str = "auto") -> None:
self._show_stats: bool = get_settings().ui.show_stats
self._always_allowed_tools: set[str] = set()
self._inference_config: dict[str, Any] = {} # passed as config= to model.act()
# Set by _run_turn when ``max_prediction_rounds`` was hit this turn
# (either the SDK raised LMStudioPredictionError on the final round,
# or ActResult.rounds reached the configured cap). run() reads this
# after _run_turn returns to print an inline warning to the user.
self._last_turn_limit_reached: bool = False
# Set by _wrap_tool when the user rejects a tool call and types a
# redirect instruction. run() detects this after the turn ends and
# feeds the instruction back as a brand-new user message.
@@ -1185,6 +1201,13 @@ async def _keepalive() -> None:
act_result: Any = None
respond_result: Any = None
strict_start: float | None = None
# Reset the flag at the start of every turn. run() reads it after
# _run_turn returns to decide whether to print the limit warning.
self._last_turn_limit_reached = False
# Pull max_rounds here (not later) so tests can patch
# get_settings() once and see a single call. None disables the cap.
max_rounds = get_settings().agent.max_rounds
max_prediction_rounds: int | None = max_rounds if max_rounds and max_rounds > 0 else None
try:
if self._mode == "strict":
# Strict mode (#99): use ``model.respond()`` — the pure
@@ -1212,18 +1235,43 @@ async def _keepalive() -> None:
on_prediction_fragment=_on_fragment,
)
else:
act_result = await model.act(
chat,
tools=tools,
config=self._inference_config if self._inference_config else None,
on_message=_on_message,
on_prediction_completed=_on_prediction_completed,
on_prediction_fragment=_on_fragment,
)
try:
act_result = await model.act(
chat,
tools=tools,
max_prediction_rounds=max_prediction_rounds,
config=self._inference_config if self._inference_config else None,
on_message=_on_message,
on_prediction_completed=_on_prediction_completed,
on_prediction_fragment=_on_fragment,
)
except lms.LMStudioPredictionError as e:
# Max-rounds hit with a stubborn model: the SDK disabled
# tools on the final round, the model still emitted a
# tool_call, and the SDK raised unconditionally. Convert
# that into a graceful limit-reached signal so run()
# prints a warning instead of the run() top-level catch
# mistaking it for a server disconnect (LMStudioServerError
# is a *parent* of LMStudioPredictionError).
if _MAX_ROUNDS_ERR_MARKER in str(e):
self._last_turn_limit_reached = True
else:
raise
finally:
stop_evt.set()
await keepalive

# Well-behaved model case: the SDK forced a tool-free final round and
# the model produced a normal text response, but rounds == the cap
# still means the task likely got truncated. Flag it so the user
# knows why the answer might feel incomplete.
if (
act_result is not None
and max_prediction_rounds is not None
and getattr(act_result, "rounds", 0) >= max_prediction_rounds
):
self._last_turn_limit_reached = True

# ``respond()`` returns a single ``PredictionResult`` with ``.stats``
# directly, whereas ``act()`` fires ``_on_prediction_completed`` once
# per round. Normalise so the downstream stats code runs identically.
@@ -1398,6 +1446,18 @@ def _cycle_mode() -> None:
f"[{TEXT_MUTED}] — run /compact to summarise "
f"the conversation[/]"
)

if self._last_turn_limit_reached:
cap = get_settings().agent.max_rounds
console.print(
f"\n[{WARNING}]agent stopped after {cap} rounds[/]"
f"[{TEXT_MUTED}] — raise the limit with "
f"LMCODE_AGENT__MAX_ROUNDS=N, set "
f"agent.max_rounds in config, or pass "
f"--max-rounds N on the CLI. "
f"Type to continue the conversation.[/]"
)

console.print(Rule(style=f"dim {ACCENT}"))

# If the user rejected a tool call with redirect
3 changes: 2 additions & 1 deletion src/lmcode/cli/app.py
@@ -42,7 +42,8 @@ def main(
if ctx.invoked_subcommand is None:
from lmcode.cli.chat import chat

chat(model="auto", max_rounds=50)
# max_rounds=None → use agent.max_rounds from config.toml / env var
chat(model="auto", max_rounds=None)


# ---------------------------------------------------------------------------
11 changes: 10 additions & 1 deletion src/lmcode/cli/chat.py
@@ -385,10 +385,19 @@ def _exit_no_model() -> None:
@app.callback(invoke_without_command=True)
def chat(
model: str = typer.Option("auto", "--model", "-m", help="Model ID (default: auto-detect)."),
max_rounds: int = typer.Option(50, "--max-rounds", help="Maximum agent loop iterations."),
max_rounds: int | None = typer.Option(
None,
"--max-rounds",
help="Max tool-call rounds per turn (overrides agent.max_rounds in config).",
),
) -> None:
"""Start an interactive coding agent session in the current directory."""
settings = get_settings()
if max_rounds is not None:
# CLI flag overrides config.toml / env var for this session only.
# The Agent reads get_settings().agent.max_rounds at the start of
# every turn, so mutating it here is enough — no re-init needed.
settings.agent.max_rounds = max_rounds
connected, detected_model = _probe_lmstudio()

if not connected:
10 changes: 9 additions & 1 deletion src/lmcode/config/settings.py
@@ -32,7 +32,15 @@ def base_url(self) -> str:
class AgentSettings(BaseSettings):
"""Runtime behaviour settings for the agent loop."""

max_rounds: int = 50
#: Maximum number of tool-call rounds inside a single ``model.act()``
#: invocation. Passed to the LM Studio SDK as ``max_prediction_rounds``.
#: 10 is the sweet spot for Qwen 7B: enough headroom for multi-file
#: edits, tight enough to stop runaway loops before they burn context.
#: Raise via ``LMCODE_AGENT__MAX_ROUNDS=N`` or ``agent.max_rounds`` in
#: ``config.toml``, or pass ``--max-rounds N`` on the CLI for a single
#: session. Applies to ask and auto modes; strict mode routes through
#: ``model.respond()`` which has no round concept.
max_rounds: int = 10
permission_mode: Literal["ask", "auto", "strict"] = "ask"
timeout_seconds: int = 30
max_file_bytes: int = 100_000 # ~100 KB; see issue #2 for token-aware improvement
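A note on the `LMCODE_AGENT__MAX_ROUNDS` spelling used in the docstring: it implies the usual pydantic-settings nested-env-var wiring on the top-level settings object, roughly like the sketch below. The prefix and delimiter here are inferred from the variable name, not copied from lmcode's actual settings module.

```python
# Hypothetical wiring sketch; the prefix/delimiter are assumptions, and
# lmcode's real top-level Settings class may be structured differently.
from pydantic_settings import BaseSettings, SettingsConfigDict

from lmcode.config.settings import AgentSettings  # class shown in the diff above


class Settings(BaseSettings):
    model_config = SettingsConfigDict(
        env_prefix="LMCODE_",
        env_nested_delimiter="__",
    )

    agent: AgentSettings = AgentSettings()


# With that wiring, LMCODE_AGENT__MAX_ROUNDS=25 lands in settings.agent.max_rounds,
# and `lmcode chat --max-rounds 25` overrides both for a single session by mutating
# get_settings().agent.max_rounds before the agent starts.
```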
166 changes: 163 additions & 3 deletions tests/test_agent/test_core.py
@@ -74,8 +74,15 @@ def test_agent_ensure_chat_returns_same_instance() -> None:
# ---------------------------------------------------------------------------


def _make_mock_model(response_text: str) -> MagicMock:
"""Build a mock model whose act() calls on_message with a fake AssistantResponse."""
def _make_mock_model(response_text: str, rounds: int = 1) -> MagicMock:
"""Build a mock model whose act() calls on_message with a fake AssistantResponse.

The returned ``ActResult``-shaped mock has a real ``rounds`` int
(default ``1``, well below the default ``max_rounds=10`` cap) so
the post-act limit check in :meth:`Agent._run_turn` does not crash
on a ``MagicMock >= int`` comparison. Pass ``rounds=N`` to simulate
a turn that used all N rounds for the limit-warning test.
"""

async def act_side_effect(
chat: object, tools: object, on_message: object = None, **kwargs: object
@@ -88,7 +95,12 @@ async def act_side_effect(
msg.role = "assistant"
msg.tool_calls = None
on_message(msg)
return MagicMock()
# Return an ActResult-shaped mock with concrete numeric fields so
# `getattr(result, "rounds", 0) >= max_prediction_rounds` works.
result = MagicMock()
result.rounds = rounds
result.total_time_seconds = 0.1
return result

mock = MagicMock()
mock.act = AsyncMock(side_effect=act_side_effect)
@@ -294,6 +306,154 @@ async def test_run_turn_non_strict_modes_use_act_with_tools() -> None:
)


# ---------------------------------------------------------------------------
# max_rounds safety boundary (#97)
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_run_turn_passes_max_rounds_to_act() -> None:
"""``max_prediction_rounds`` must flow from the agent config into ``model.act()``.

Pins the config→SDK plumbing: if someone removes the kwarg from the
``act()`` call, auto mode is once again unbounded and can loop
forever — the exact bug #97 was filed to fix.
"""
agent = Agent()
agent._mode = "auto"
mock_model = _make_mock_model("ok")

# Patch get_settings so the test does not depend on ~/.config state.
mock_settings = MagicMock()
mock_settings.agent.max_rounds = 7
mock_settings.agent.max_file_bytes = 100_000
with (
patch("lmcode.agent.core.read_lmcode_md", return_value=None),
patch("lmcode.agent.core.get_settings", return_value=mock_settings),
):
await agent._run_turn(mock_model, "hi")

call_kwargs = mock_model.act.await_args.kwargs
assert call_kwargs["max_prediction_rounds"] == 7


@pytest.mark.asyncio
async def test_run_turn_max_rounds_none_when_config_zero() -> None:
"""A non-positive ``max_rounds`` disables the cap (passes ``None`` to the SDK).

Users who explicitly set ``max_rounds = 0`` in config.toml are opting
out of the safety boundary; we must not send ``0`` to the SDK because
``model.act()`` rejects values < 1 with ``LMStudioValueError``.
"""
agent = Agent()
agent._mode = "auto"
mock_model = _make_mock_model("ok")

mock_settings = MagicMock()
mock_settings.agent.max_rounds = 0
mock_settings.agent.max_file_bytes = 100_000
with (
patch("lmcode.agent.core.read_lmcode_md", return_value=None),
patch("lmcode.agent.core.get_settings", return_value=mock_settings),
):
await agent._run_turn(mock_model, "hi")

call_kwargs = mock_model.act.await_args.kwargs
assert call_kwargs["max_prediction_rounds"] is None


@pytest.mark.asyncio
async def test_run_turn_flags_limit_reached_when_rounds_equal_cap() -> None:
"""Well-behaved model case: ``ActResult.rounds == cap`` → set the flag.

Even when the SDK closes the loop cleanly (model gave up tools on the
final round and produced a text answer), reaching the cap means the
task was likely truncated. ``self._last_turn_limit_reached`` must be
set so ``run()`` prints the warning.
"""
agent = Agent()
agent._mode = "auto"
mock_model = _make_mock_model("done", rounds=5)

mock_settings = MagicMock()
mock_settings.agent.max_rounds = 5
mock_settings.agent.max_file_bytes = 100_000
with (
patch("lmcode.agent.core.read_lmcode_md", return_value=None),
patch("lmcode.agent.core.get_settings", return_value=mock_settings),
):
await agent._run_turn(mock_model, "hi")

assert agent._last_turn_limit_reached is True


@pytest.mark.asyncio
async def test_run_turn_flags_limit_reached_on_final_round_error() -> None:
"""Stubborn model case: SDK raises ``LMStudioPredictionError`` on the final round.

When the model ignores the tools-disabled signal on the final round
and emits a tool_call anyway, the SDK raises unconditionally with a
message containing ``"final prediction round"``. ``_run_turn`` must
catch this *specific* error, set the limit flag, and NOT let it
bubble to the ``run()`` top-level handler (which catches
``LMStudioServerError`` — the parent class — and would misinterpret
the crash as a server disconnect).
"""
import lmstudio as lms

agent = Agent()
agent._mode = "auto"

async def _raise_final_round(
chat: object, tools: object, on_message: object = None, **kwargs: object
) -> None:
raise lms.LMStudioPredictionError("Model requested tool use on final prediction round.")

mock_model = MagicMock()
mock_model.act = AsyncMock(side_effect=_raise_final_round)

mock_settings = MagicMock()
mock_settings.agent.max_rounds = 3
mock_settings.agent.max_file_bytes = 100_000
with (
patch("lmcode.agent.core.read_lmcode_md", return_value=None),
patch("lmcode.agent.core.get_settings", return_value=mock_settings),
):
# Must NOT raise — the error is caught and converted to a flag.
await agent._run_turn(mock_model, "loop forever please")

assert agent._last_turn_limit_reached is True


@pytest.mark.asyncio
async def test_run_turn_reraises_unrelated_prediction_error() -> None:
"""A prediction error that is NOT the final-round case must propagate.

The catch in ``_run_turn`` is narrow on purpose — matching only the
SDK's ``"final prediction round"`` marker. Any other
``LMStudioPredictionError`` (malformed tool schema, invalid
response format, etc.) is a real bug the user needs to see.
"""
import lmstudio as lms

agent = Agent()
agent._mode = "auto"

async def _raise_other(
chat: object, tools: object, on_message: object = None, **kwargs: object
) -> None:
raise lms.LMStudioPredictionError("something else went wrong")

mock_model = MagicMock()
mock_model.act = AsyncMock(side_effect=_raise_other)

with patch("lmcode.agent.core.read_lmcode_md", return_value=None):
with pytest.raises(lms.LMStudioPredictionError, match="something else"):
await agent._run_turn(mock_model, "hi")

assert agent._last_turn_limit_reached is False


# ---------------------------------------------------------------------------
# _wrap_tool_verbose — positional-arg merging
# ---------------------------------------------------------------------------