From cc5ed99d0d361dbdfc38c5796eab2cd6e8e16459 Mon Sep 17 00:00:00 2001
From: Nick Roth <nlr06886@gmail.com>
Date: Sun, 1 Mar 2026 16:48:11 -0600
Subject: [PATCH] fix(llm): Add LLM_STREAMING_MODE option for custom endpoints

Adds the LLM_STREAMING_MODE environment variable to enable streaming mode
for LLM proxies that require it. When it is set to "true", "1", or "yes"
(case-insensitive), requests go through LiteLLM with stream=True and the
streamed chunks are accumulated into a single plain-text response.
---
 src/applypilot/llm.py | 80 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 69 insertions(+), 11 deletions(-)

diff --git a/src/applypilot/llm.py b/src/applypilot/llm.py
index 030f2cee..e7b7cce0 100644
--- a/src/applypilot/llm.py
+++ b/src/applypilot/llm.py
@@ -8,6 +8,7 @@
     GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, then LLM_URL.
   - Credentials come from provider env vars or generic LLM_API_KEY.
   - LLM_URL is optional for custom OpenAI-compatible endpoints.
+  - LLM_STREAMING_MODE (true/1/yes) enables streaming mode for LLM proxies that require it.
 """
 
 from __future__ import annotations
@@ -51,6 +52,7 @@ class LLMConfig:
     api_base: str | None
     model: str
     api_key: str
+    use_streaming: bool = False
 
 
 class ChatMessage(TypedDict):
@@ -79,9 +81,7 @@ def _env_get(env: Mapping[str, str], key: str) -> str:
 def _provider_from_model(model: str) -> str:
     provider, _, model_name = model.partition("/")
     if not provider or not model_name:
-        raise RuntimeError(
-            "LLM_MODEL must include a provider prefix (for example 'openai/gpt-4o-mini')."
-        )
+        raise RuntimeError("LLM_MODEL must include a provider prefix (for example 'openai/gpt-4o-mini').")
     return provider
 
 
@@ -106,9 +106,7 @@ def resolve_llm_config(env: Mapping[str, str] | None = None) -> LLMConfig:
             provider, _ = inferred
             model = f"{provider}/{model}"
         else:
-            raise RuntimeError(
-                "LLM_MODEL must include a provider prefix (for example 'openai/gpt-4o-mini')."
-            )
+            raise RuntimeError("LLM_MODEL must include a provider prefix (for example 'openai/gpt-4o-mini').")
     else:
         if not inferred:
             raise RuntimeError(
@@ -130,21 +128,21 @@ def resolve_llm_config(env: Mapping[str, str] | None = None) -> LLMConfig:
     api_key = _env_get(env_map, api_key_env) or _env_get(env_map, "LLM_API_KEY")
 
     if not api_key and not local_url:
-        key_help = (
-            f"{api_key_env} or LLM_API_KEY"
-            if provider in provider_api_key_env
-            else "LLM_API_KEY"
-        )
+        key_help = f"{api_key_env} or LLM_API_KEY" if provider in provider_api_key_env else "LLM_API_KEY"
         raise RuntimeError(
             f"Missing credentials for LLM_MODEL '{model}'. Set {key_help}, or set LLM_URL for "
             "a local OpenAI-compatible endpoint."
         )
 
+    # Check if streaming mode is enabled via environment variable
+    use_streaming = _env_get(env_map, "LLM_STREAMING_MODE").lower() in ("true", "1", "yes")
+
     return LLMConfig(
         provider=provider,
         api_base=local_url.rstrip("/") if local_url else None,
         model=model,
         api_key=api_key,
+        use_streaming=use_streaming,
     )
 
 
@@ -155,6 +153,7 @@ def __init__(self, config: LLMConfig) -> None:
         self.config = config
         self.provider = config.provider
         self.model = config.model
+        self._use_streaming = config.use_streaming
         litellm.suppress_debug_info = True
 
     def chat(
@@ -169,6 +168,18 @@ def chat(
         **extra: Unpack[LiteLLMExtra],
     ) -> str:
         """Send a completion request and return plain text content."""
+        # Use streaming mode when configured (required by some LLM proxies)
+        if self._use_streaming:
+            return self._chat_streaming(
+                messages=messages,
+                max_output_tokens=max_output_tokens,
+                temperature=temperature,
+                num_retries=num_retries,
+                drop_params=drop_params,
+                **extra,
+            )
+
+        # Standard non-streaming call
         try:
             if temperature is None:
                 response = litellm.completion(
@@ -208,6 +219,53 @@ def chat(
         except Exception as exc:  # pragma: no cover - provider SDK exception types vary by backend/version.
             raise RuntimeError(f"LLM request failed ({self.provider}/{self.model}): {exc}") from exc
 
+    def _chat_streaming(
+        self,
+        messages: list[ChatMessage],
+        *,
+        max_output_tokens: int = 10000,
+        temperature: float | None = None,
+        num_retries: int = _MAX_RETRIES,
+        drop_params: bool = True,
+        **extra: Unpack[LiteLLMExtra],
+    ) -> str:
+        """Send a streaming completion request and return the accumulated text.
+
+        Some LLM proxies require stream=True. Chunks are consumed eagerly and
+        their delta content is joined and stripped into one plain-text reply.
+        Raises RuntimeError if the request fails or yields no text content.
+        """
+        try:
+            kwargs: dict[str, Any] = {
+                **extra,  # forward caller extras; the explicit keys below take precedence
+                "model": self.model,
+                "messages": messages,
+                "max_tokens": max_output_tokens,
+                "num_retries": num_retries,
+                "drop_params": drop_params,
+                "api_key": self.config.api_key or None,
+                "api_base": self.config.api_base or None,
+                "stream": True,
+            }
+            if temperature is not None:
+                kwargs["temperature"] = temperature
+
+            # Chunks with no choices, no delta, or empty content contribute nothing.
+            content_parts: list[str] = []
+            for chunk in litellm.completion(**kwargs):
+                choices = getattr(chunk, "choices", None)
+                delta = getattr(choices[0], "delta", None) if choices else None
+                if piece := getattr(delta, "content", None):
+                    content_parts.append(piece)
+
+            text = "".join(content_parts).strip()
+
+            if not text:
+                raise RuntimeError("LLM response contained no text content.")
+            return text
+        except Exception as exc:  # pragma: no cover - provider SDK exception types vary by backend/version.
+            raise RuntimeError(f"LLM request failed ({self.provider}/{self.model}): {exc}") from exc
+
     def close(self) -> None:
         """No-op. LiteLLM completion() is stateless per call."""
         return None