-
Notifications
You must be signed in to change notification settings - Fork 262
Description
Streaming endpoint hangs on stream: true requests (Critical)
Repo: https://github.com/microsoft/foundry-local
Title: stream: true requests to /v1/chat/completions hang indefinitely (v0.5)
Description: When sending a POST to /v1/chat/completions with "stream": true, the TCP connection hangs forever — no SSE chunks, no response headers, no timeout. The same request with "stream": false returns correctly.
Workaround (build a proxy): A full HTTP proxy (streaming-proxy.ts) that intercepts stream: true requests, rewrites them to stream: false, waits for the complete response, and then re-encodes it as SSE chunks (data: {...}\n\n followed by data: [DONE]\n\n), including proper finish_reason, tool-call chunking, and role deltas.
// ---------------------------------------------------------------------------
// src/streaming-proxy.ts — Streaming‐to‐non‐streaming proxy for Foundry Local
//
// Foundry Local (as of v0.5.0) sometimes doesn't respond to streaming
// requests at all (the TCP connection hangs forever once "stream": true is
// in the body). Non-streaming requests, on the other hand, work perfectly.
//
// This tiny HTTP proxy transparently converts streaming requests into
// non-streaming ones, then re-encodes the single JSON response as
// server-sent events (SSE) — the format the OpenAI SDK expects.
//
// Non-streaming requests and non-chat-completions endpoints are proxied
// through unchanged.
//
// Usage:
// const { proxyBaseUrl, close } = await startStreamingProxy(foundryEndpoint);
// // Point the Copilot SDK BYOK provider at proxyBaseUrl instead of foundryEndpoint
// ---------------------------------------------------------------------------
import http from "http";
/**
 * Handle resolved by `startStreamingProxy` once the proxy server is listening.
 * It exposes the URL clients should target and a way to stop the server.
 */
export interface StreamingProxyHandle {
/**
 * Base URL for the proxy, e.g. "http://127.0.0.1:54321/v1".
 * Includes the `/v1` suffix.
 */
proxyBaseUrl: string;
/**
 * Shut down the proxy server. The returned promise resolves once the
 * underlying HTTP server reports that it has closed.
 */
close: () => Promise<void>;
}
/** Maximum number of UTF-16 code units carried by one synthetic SSE delta. */
const SSE_PIECE_SIZE = 80;

/**
 * Upstream response headers that must not be copied to the client.
 * `fetch` returns an already-decoded body, so the upstream encoding and
 * length no longer describe the bytes this proxy writes; the remaining
 * entries are hop-by-hop headers that only apply to the upstream connection.
 */
const UNFORWARDABLE_RESPONSE_HEADERS: ReadonlySet<string> = new Set([
  "connection",
  "content-encoding",
  "content-length",
  "keep-alive",
  "transfer-encoding",
]);

/** Loose shape of one tool call in a non-streaming chat completion. */
interface UpstreamToolCall {
  id?: string;
  function?: { name?: string; arguments?: unknown };
}

/** Loose shape of one choice in a non-streaming chat completion. */
interface UpstreamChoice {
  index?: number;
  finish_reason?: string | null;
  message?: { role?: string; content?: unknown; tool_calls?: unknown };
}

/** Loose shape of the non-streaming chat completion returned by upstream. */
interface UpstreamCompletion {
  id?: string;
  created?: number;
  model?: string;
  choices?: UpstreamChoice[];
  usage?: unknown;
}

/** What to send upstream and how to treat the answer. */
interface ForwardPlan {
  /** Request body to forward (rewritten only for streaming chat requests). */
  outBody: string;
  /** True when the client asked for `stream: true` on a chat-completions URL. */
  isStreamingChat: boolean;
  /** True when the client set `stream_options.include_usage` to `true`. */
  includeUsage: boolean;
}

/** Narrow an unknown value to a plain (non-array, non-null) object. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/** Split `text` into consecutive pieces of at most `size` code units. */
function splitIntoPieces(text: string, size: number): string[] {
  const pieces: string[] = [];
  for (let start = 0; start < text.length; start += size) {
    pieces.push(text.slice(start, start + size));
  }
  return pieces;
}

/** Decide whether a request must be rewritten from streaming to non-streaming. */
function planForward(url: string, rawBody: string): ForwardPlan {
  const passThrough: ForwardPlan = {
    outBody: rawBody,
    isStreamingChat: false,
    includeUsage: false,
  };
  if (!rawBody || !url.includes("/chat/completions")) return passThrough;

  let parsed: unknown;
  try {
    parsed = JSON.parse(rawBody);
  } catch {
    // Not valid JSON — forward as-is and let upstream report the problem.
    return passThrough;
  }
  if (!isRecord(parsed) || parsed.stream !== true) return passThrough;

  const streamOptions = parsed.stream_options;
  const includeUsage =
    isRecord(streamOptions) && streamOptions.include_usage === true;

  // The key conversion: ask upstream for a single JSON response. The spread
  // keeps the original key order. `stream_options` is only meaningful for
  // streaming requests, so it is dropped together with `stream: true`.
  const rewritten: Record<string, unknown> = { ...parsed, stream: false };
  delete rewritten.stream_options;

  return {
    outBody: JSON.stringify(rewritten),
    isStreamingChat: true,
    includeUsage,
  };
}

/**
 * Re-encode a complete chat completion as the sequence of SSE events a
 * streaming client expects: per choice a role delta, content deltas,
 * tool-call deltas and a finish chunk; then an optional usage chunk and the
 * terminating `[DONE]` event.
 */
function encodeCompletionAsSse(
  completion: UpstreamCompletion,
  includeUsage: boolean,
): string[] {
  const id = completion.id ?? `chatcmpl-proxy-${Date.now()}`;
  const created = completion.created ?? Math.floor(Date.now() / 1000);
  const model = completion.model ?? "";

  const toEvent = (
    choices: unknown[],
    extra: Record<string, unknown> = {},
  ): string =>
    `data: ${JSON.stringify({
      id,
      object: "chat.completion.chunk",
      created,
      model,
      choices,
      ...extra,
    })}\n\n`;

  const events: string[] = [];
  const choices: UpstreamChoice[] = Array.isArray(completion.choices)
    ? completion.choices
    : [];

  for (const choice of choices) {
    const message = choice.message ?? {};
    const index = choice.index ?? 0;
    const push = (
      delta: Record<string, unknown>,
      finishReason: string | null = null,
    ): void => {
      events.push(toEvent([{ index, delta, finish_reason: finishReason }]));
    };

    // 1. Role chunk
    push({ role: message.role ?? "assistant" });

    // 2. Content chunks. Only string content can be sliced into deltas.
    if (typeof message.content === "string") {
      for (const piece of splitIntoPieces(message.content, SSE_PIECE_SIZE)) {
        push({ content: piece });
      }
    }

    // 3. Tool-call chunks (if any)
    const toolCalls: UpstreamToolCall[] = Array.isArray(message.tool_calls)
      ? message.tool_calls
      : [];
    toolCalls.forEach((call, callIndex) => {
      // First chunk carries the call id and function name.
      push({
        tool_calls: [
          {
            index: callIndex,
            id: call.id ?? `call_${callIndex}`,
            type: "function",
            function: { name: call.function?.name ?? "", arguments: "" },
          },
        ],
      });
      // Streaming clients concatenate argument fragments as text, so an
      // object-valued `arguments` is serialised before it is split.
      const rawArgs = call.function?.arguments ?? "{}";
      const args =
        typeof rawArgs === "string" ? rawArgs : JSON.stringify(rawArgs);
      for (const piece of splitIntoPieces(args, SSE_PIECE_SIZE)) {
        push({
          tool_calls: [{ index: callIndex, function: { arguments: piece } }],
        });
      }
    });

    // 4. Finish chunk
    push({}, choice.finish_reason ?? "stop");
  }

  // 5. Usage chunk, only when the client opted in and upstream reported it.
  if (includeUsage && completion.usage != null) {
    events.push(toEvent([], { usage: completion.usage }));
  }

  // 6. [DONE]
  events.push("data: [DONE]\n\n");
  return events;
}

/**
 * Start the streaming proxy.
 *
 * Requests whose URL contains `/chat/completions` and whose JSON body has
 * `stream: true` are forwarded upstream with `stream: false`; the single JSON
 * answer is then replayed to the client as server-sent events. Every other
 * request is forwarded with its body unchanged and the upstream response is
 * relayed as-is, apart from headers that no longer match the decoded body.
 * Only the `content-type` and `authorization` request headers are forwarded.
 *
 * @param upstreamBaseUrl Foundry Local endpoint including `/v1`, e.g.
 * `http://127.0.0.1:51995/v1`
 * @returns A handle with `proxyBaseUrl` and `close()`. The promise rejects if
 * the server cannot start listening.
 */
export async function startStreamingProxy(
  upstreamBaseUrl: string,
): Promise<StreamingProxyHandle> {
  // Normalise: strip trailing slash if any
  const upstream = upstreamBaseUrl.replace(/\/+$/, "");
  // Strip /v1 suffix from upstream so it's not doubled when the SDK's
  // request URL already includes /v1 (e.g. /v1/chat/completions).
  const upstreamOrigin = upstream.replace(/\/v1$/, "");

  const server = http.createServer(async (req, res) => {
    // Everything, including reading the request, runs inside the try block:
    // a rejection escaping this async handler would be unhandled.
    try {
      const url = req.url ?? "";
      const method = req.method ?? "POST";

      // ── Read the incoming body ────────────────────────────────────────
      const chunks: Buffer[] = [];
      for await (const chunk of req) chunks.push(chunk as Buffer);
      const rawBody = Buffer.concat(chunks).toString("utf-8");
      const plan = planForward(url, rawBody);

      // ── Forward to upstream Foundry Local ─────────────────────────────
      const headers: Record<string, string> = {
        "content-type": req.headers["content-type"] ?? "application/json",
      };
      if (req.headers.authorization) {
        headers.authorization = req.headers.authorization;
      }
      // fetch rejects any body (even an empty string) on GET and HEAD.
      const mayHaveBody = method !== "GET" && method !== "HEAD";
      const upRes = await fetch(`${upstreamOrigin}${url}`, {
        method,
        headers,
        body: mayHaveBody ? plan.outBody : undefined,
      });

      if (!upRes.ok) {
        const errText = await upRes.text();
        res.writeHead(upRes.status, { "content-type": "application/json" });
        res.end(errText);
        return;
      }

      // ── If this was a streaming request: re-encode as SSE ─────────────
      if (plan.isStreamingChat) {
        const completion: unknown = await upRes.json();
        if (!isRecord(completion)) {
          throw new Error("upstream returned a non-object chat completion");
        }
        // Encode before sending headers so a malformed completion still
        // yields a clean 502 instead of a truncated event stream.
        const events = encodeCompletionAsSse(
          completion as UpstreamCompletion,
          plan.includeUsage,
        );
        res.writeHead(200, {
          "content-type": "text/event-stream",
          "cache-control": "no-cache",
          connection: "keep-alive",
        });
        for (const event of events) res.write(event);
        res.end();
        return;
      }

      // ── Non-streaming: pass through ───────────────────────────────────
      const resHeaders: Record<string, string> = {};
      upRes.headers.forEach((value, key) => {
        if (!UNFORWARDABLE_RESPONSE_HEADERS.has(key)) resHeaders[key] = value;
      });
      // Relay raw bytes so non-text payloads are not mangled by decoding.
      const payload = Buffer.from(await upRes.arrayBuffer());
      res.writeHead(upRes.status, resHeaders);
      res.end(payload);
    } catch (err: unknown) {
      const message = err instanceof Error ? err.message : String(err);
      console.error("[streaming-proxy] upstream error:", message);
      if (res.headersSent) {
        // Too late for a status code; abort so the client sees the failure.
        res.destroy();
        return;
      }
      res.writeHead(502, { "content-type": "application/json" });
      res.end(JSON.stringify({ error: "proxy_error", message }));
    }
  });

  // Listen on a random port
  return new Promise<StreamingProxyHandle>((resolve, reject) => {
    // Without this listener a failed listen() would leave the promise pending.
    server.once("error", reject);
    server.listen(0, "127.0.0.1", () => {
      server.removeListener("error", reject);
      const addr = server.address();
      if (!addr || typeof addr === "string") {
        reject(new Error("Failed to start streaming proxy"));
        return;
      }
      const proxyBaseUrl = `http://127.0.0.1:${addr.port}/v1`;
      console.log(` [streaming-proxy] Listening on ${proxyBaseUrl} → ${upstream}`);
      resolve({
        proxyBaseUrl,
        close: () =>
          new Promise<void>((done) => {
            server.close(() => done());
          }),
      });
    });
  });
}
Impact: Any SDK consumer expecting standard OpenAI streaming (which is the default for most clients) will hang. This is the single biggest friction point for Foundry Local adoption with the GitHub Copilot SDK.
Repro:
curl -X POST http://127.0.0.1:<port>/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{"model":"qwen2.5-coder-1.5b","messages":[{"role":"user","content":"Hi"}],"stream":true}'
# ← Hangs forever. Change stream to false and it works.