From 9a76657431970bf22a190e5e31cb90b241499a38 Mon Sep 17 00:00:00 2001 From: Jonathan Santilli <1774227+jonathansantilli@users.noreply.github.com> Date: Wed, 22 Apr 2026 12:16:50 +0100 Subject: [PATCH 1/2] fix(layer3): produce clean resource IDs for remote HTTP/SSE URLs Layer 3 resource discovery composed IDs as `${kind}:${url}`, which for http/sse kinds collided with the URL's own scheme and produced malformed values like `http:https://mcp.linear.app/mcp`. These IDs leaked into `rule_id` and `file_path` on Layer 3 findings. Introduce `url-validation.ts` with `normalizeRemoteUrl` (reject non http/https schemes, missing hosts, canonicalise trailing slashes) and `buildResourceId` (use the URL itself for http/sse kinds, preserve the `:` shape for npm/pypi/git). Wire both into the scan resource collector so IDs are clean and URLs validated before becoming findings. --- src/layer3-dynamic/url-validation.ts | 93 ++++++++++++++++++++ src/scan.ts | 38 ++++---- tests/cli/deep-scan-command.test.ts | 24 ++--- tests/commands/clawhub-wrapper.test.ts | 4 +- tests/commands/skills-wrapper.test.ts | 4 +- tests/layer1/user-scope-discovery.test.ts | 2 +- tests/layer3/consent-flow.test.ts | 6 +- tests/layer3/deep-resource-discovery.test.ts | 10 +-- tests/layer3/layer3-integration.test.ts | 4 +- tests/layer3/no-outbound-calls.test.ts | 4 +- tests/layer3/url-validation.test.ts | 91 +++++++++++++++++++ tests/path-display.test.ts | 4 +- 12 files changed, 238 insertions(+), 46 deletions(-) create mode 100644 src/layer3-dynamic/url-validation.ts create mode 100644 tests/layer3/url-validation.test.ts diff --git a/src/layer3-dynamic/url-validation.ts b/src/layer3-dynamic/url-validation.ts new file mode 100644 index 0000000..68d2f03 --- /dev/null +++ b/src/layer3-dynamic/url-validation.ts @@ -0,0 +1,93 @@ +/** + * URL validation and normalisation helpers for Layer 3 remote resource handling. + * + * These helpers guarantee that remote resources (HTTP/SSE MCP endpoints, + * skill-referenced URLs) use a safe, canonical form before they are fed into + * finding `rule_id` / `file_path` fields or into the fetcher. + * + * Historically, L3 resource IDs were composed as `${kind}:${url}` which, for + * http/sse kinds, produced malformed values like `http:https://mcp.linear.app/mcp` + * (the kind collides with the URL's own scheme). `buildResourceId` avoids that + * double-scheme shape by reusing the URL itself as the id for http/sse kinds. + */ +export type RemoteScheme = "http" | "https"; + +export interface NormalizeRemoteUrlResult { + ok: true; + url: string; + scheme: RemoteScheme; +} + +export interface NormalizeRemoteUrlError { + ok: false; + reason: "empty" | "unsupported_scheme" | "missing_host" | "missing_scheme" | "invalid_url"; +} + +/** + * Validate and canonicalise a remote URL. Rejects non http/https schemes, + * missing hosts, and malformed inputs. Normalises a bare-host path to a + * single trailing slash and strips trailing slashes from longer paths. + */ +export function normalizeRemoteUrl( + input: string, +): NormalizeRemoteUrlResult | NormalizeRemoteUrlError { + if (typeof input !== "string" || input.trim().length === 0) { + return { ok: false, reason: "empty" }; + } + + const trimmed = input.trim(); + + // Quick reject for bare `http:` / `https:` without `//` and host. + if (/^https?:\/?$/iu.test(trimmed)) { + return { ok: false, reason: "missing_host" }; + } + + // Must start with http:// or https:// (case-insensitive). + if (!/^https?:\/\//iu.test(trimmed)) { + return { ok: false, reason: "missing_scheme" }; + } + + let parsed: URL; + try { + parsed = new URL(trimmed); + } catch { + return { ok: false, reason: "invalid_url" }; + } + + const scheme = parsed.protocol.replace(":", "").toLowerCase(); + if (scheme !== "http" && scheme !== "https") { + return { ok: false, reason: "unsupported_scheme" }; + } + + if (parsed.hostname.length === 0) { + return { ok: false, reason: "missing_host" }; + } + + // Normalise trailing slashes: keep `/` for root paths, strip for others. + if (parsed.pathname.length > 1 && parsed.pathname.endsWith("/")) { + parsed.pathname = parsed.pathname.replace(/\/+$/u, ""); + } + + return { + ok: true, + url: parsed.toString(), + scheme: scheme as RemoteScheme, + }; +} + +export type DeepScanResourceKind = "npm" | "pypi" | "git" | "http" | "sse"; + +/** + * Build a canonical resource id used for findings (`rule_id`, `file_path`) and + * for consent prompts. For http/sse kinds the id is the URL itself (no + * `http:` / `sse:` prefix) to avoid the malformed `http:https://...` shape. + * For npm/pypi/git, the `:` prefix is preserved because those + * locators are not URLs and other code (e.g. `isRegistryMetadataResource`) + * keys on that prefix. + */ +export function buildResourceId(kind: DeepScanResourceKind, locator: string): string { + if (kind === "http" || kind === "sse") { + return locator; + } + return `${kind}:${locator}`; +} diff --git a/src/scan.ts b/src/scan.ts index 2ef1e15..6175259 100644 --- a/src/scan.ts +++ b/src/scan.ts @@ -5,6 +5,7 @@ import { collectLocalTextAnalysisTargets, type LocalTextAnalysisTarget, } from "./layer3-dynamic/local-text-analysis.js"; +import { buildResourceId, normalizeRemoteUrl } from "./layer3-dynamic/url-validation.js"; import { runStaticPipeline } from "./pipeline.js"; import type { StaticFileInput } from "./layer2-static/engine.js"; import { applyReportSummary } from "./report-summary.js"; @@ -690,18 +691,21 @@ function collectDeepScanResourcesFromParsed( } if (typeof config.url === "string" && isHttpLikeUrl(config.url)) { - const kind = inferHttpKind(config.url); - const id = `${kind}:${config.url}`; - if (!resources.has(id)) { - resources.set(id, { - id, - request: { + const normalized = normalizeRemoteUrl(config.url); + if (normalized.ok) { + const kind = inferHttpKind(normalized.url); + const id = buildResourceId(kind, normalized.url); + if (!resources.has(id)) { + resources.set(id, { id, - kind, - locator: config.url, - }, - commandPreview: `GET ${config.url} (from ${filePath} -> ${container.key}.${serverName}.url)`, - }); + request: { + id, + kind, + locator: normalized.url, + }, + commandPreview: `GET ${normalized.url} (from ${filePath} -> ${container.key}.${serverName}.url)`, + }); + } } } @@ -733,8 +737,12 @@ function collectDeepScanResourcesFromParsed( if (typeof config.url !== "string" || !isHttpLikeUrl(config.url)) { return; } - const kind = inferHttpKind(config.url); - const id = `${kind}:${config.url}`; + const normalized = normalizeRemoteUrl(config.url); + if (!normalized.ok) { + return; + } + const kind = inferHttpKind(normalized.url); + const id = buildResourceId(kind, normalized.url); if (resources.has(id)) { return; } @@ -743,9 +751,9 @@ function collectDeepScanResourcesFromParsed( request: { id, kind, - locator: config.url, + locator: normalized.url, }, - commandPreview: `GET ${config.url} (from ${filePath} -> ${remoteArray.key}.${index}.url)`, + commandPreview: `GET ${normalized.url} (from ${filePath} -> ${remoteArray.key}.${index}.url)`, }); }); } diff --git a/tests/cli/deep-scan-command.test.ts b/tests/cli/deep-scan-command.test.ts index 0051ec7..988ab1f 100644 --- a/tests/cli/deep-scan-command.test.ts +++ b/tests/cli/deep-scan-command.test.ts @@ -63,9 +63,9 @@ describe("scan --deep behavior", () => { it("executes deep scan resources with consent and merges findings into exit code", async () => { const deepResources: DeepScanResource[] = [ { - id: "http:https://mcp.example/tools", + id: "https://mcp.example/tools", request: { - id: "http:https://mcp.example/tools", + id: "https://mcp.example/tools", kind: "http", locator: "https://mcp.example/tools", }, @@ -115,9 +115,9 @@ describe("scan --deep behavior", () => { async () => [ { - id: "http:https://mcp.example/tools", + id: "https://mcp.example/tools", request: { - id: "http:https://mcp.example/tools", + id: "https://mcp.example/tools", kind: "http", locator: "https://mcp.example/tools", }, @@ -190,9 +190,9 @@ describe("scan --deep behavior", () => { const stdout: string[] = []; const deepResources: DeepScanResource[] = [ { - id: "http:https://mcp.example/tools", + id: "https://mcp.example/tools", request: { - id: "http:https://mcp.example/tools", + id: "https://mcp.example/tools", kind: "http", locator: "https://mcp.example/tools", }, @@ -240,9 +240,9 @@ describe("scan --deep behavior", () => { it("allows choosing a deep agent and runs approved meta-agent commands", async () => { const deepResources: DeepScanResource[] = [ { - id: "http:https://mcp.example/tools", + id: "https://mcp.example/tools", request: { - id: "http:https://mcp.example/tools", + id: "https://mcp.example/tools", kind: "http", locator: "https://mcp.example/tools", }, @@ -327,9 +327,9 @@ describe("scan --deep behavior", () => { it("unwraps Claude JSON result envelopes from meta-agent output", async () => { const deepResources: DeepScanResource[] = [ { - id: "http:https://mcp.example/tools", + id: "https://mcp.example/tools", request: { - id: "http:https://mcp.example/tools", + id: "https://mcp.example/tools", kind: "http", locator: "https://mcp.example/tools", }, @@ -406,9 +406,9 @@ describe("scan --deep behavior", () => { it("disables deep-scan prompts when --no-tui is passed", async () => { const deepResources: DeepScanResource[] = [ { - id: "http:https://mcp.example/tools", + id: "https://mcp.example/tools", request: { - id: "http:https://mcp.example/tools", + id: "https://mcp.example/tools", kind: "http", locator: "https://mcp.example/tools", }, diff --git a/tests/commands/clawhub-wrapper.test.ts b/tests/commands/clawhub-wrapper.test.ts index 37c73d3..7e6180d 100644 --- a/tests/commands/clawhub-wrapper.test.ts +++ b/tests/commands/clawhub-wrapper.test.ts @@ -456,9 +456,9 @@ describe("clawhub wrapper execution", () => { it("does not invoke deep consent callbacks in non-interactive mode", async () => { const discoverDeepResources = vi.fn(async () => [ { - id: "http:https://mcp.example/tools", + id: "https://mcp.example/tools", request: { - id: "http:https://mcp.example/tools", + id: "https://mcp.example/tools", kind: "http", locator: "https://mcp.example/tools", }, diff --git a/tests/commands/skills-wrapper.test.ts b/tests/commands/skills-wrapper.test.ts index 1070c2c..6873a23 100644 --- a/tests/commands/skills-wrapper.test.ts +++ b/tests/commands/skills-wrapper.test.ts @@ -683,9 +683,9 @@ describe("skills wrapper execution", () => { it("does not invoke deep consent callbacks in non-interactive mode", async () => { const discoverDeepResources = vi.fn(async () => [ { - id: "http:https://mcp.example/tools", + id: "https://mcp.example/tools", request: { - id: "http:https://mcp.example/tools", + id: "https://mcp.example/tools", kind: "http", locator: "https://mcp.example/tools", }, diff --git a/tests/layer1/user-scope-discovery.test.ts b/tests/layer1/user-scope-discovery.test.ts index 7a3c1c4..96f5519 100644 --- a/tests/layer1/user-scope-discovery.test.ts +++ b/tests/layer1/user-scope-discovery.test.ts @@ -100,7 +100,7 @@ describe("user-scope discovery", () => { includeUserScope: true, homeDir: home, }); - expect(resources.map((resource) => resource.id)).toEqual(["sse:https://example.com/sse"]); + expect(resources.map((resource) => resource.id)).toEqual(["https://example.com/sse"]); }); it("includes user-scope paths for additional tools (roo)", async () => { diff --git a/tests/layer3/consent-flow.test.ts b/tests/layer3/consent-flow.test.ts index c602e48..4b4677a 100644 --- a/tests/layer3/consent-flow.test.ts +++ b/tests/layer3/consent-flow.test.ts @@ -8,8 +8,8 @@ const RESOURCES: DeepScanResource[] = [ commandPreview: "fetch npm metadata", }, { - id: "http:https://example.com", - request: { id: "http:https://example.com", kind: "http", locator: "https://example.com" }, + id: "https://example.com", + request: { id: "https://example.com", kind: "http", locator: "https://example.com" }, commandPreview: "fetch remote metadata", }, ]; @@ -25,7 +25,7 @@ describe("task 26 deep scan consent flow", () => { const outcomes = await runDeepScanWithConsent( RESOURCES, - async (resource) => resource.id === "http:https://example.com", + async (resource) => resource.id === "https://example.com", execute, ); diff --git a/tests/layer3/deep-resource-discovery.test.ts b/tests/layer3/deep-resource-discovery.test.ts index 32a8f7e..56860b8 100644 --- a/tests/layer3/deep-resource-discovery.test.ts +++ b/tests/layer3/deep-resource-discovery.test.ts @@ -36,10 +36,10 @@ describe("deep resource discovery", () => { const resources = discoverDeepScanResources(root); expect(resources.map((resource) => resource.id)).toEqual([ - "http:https://example.com/mcp/tools", + "https://example.com/mcp/tools", + "https://example.com/sse", "npm:@example/local-mcp", "pypi:example-python-mcp", - "sse:https://example.com/sse", ]); expect( resources.every( @@ -80,8 +80,8 @@ describe("deep resource discovery", () => { const resources = discoverDeepScanResources(root); expect(resources.map((resource) => resource.id)).toEqual([ + "https://example.com/eventstream", "npm:@example/alias-mcp", - "sse:https://example.com/eventstream", ]); }); @@ -117,8 +117,8 @@ describe("deep resource discovery", () => { }); expect(resources.map((resource) => resource.id)).toEqual([ - "http:https://internal.example.com/mcp", - "sse:https://internal.example.com/eventstream", + "https://internal.example.com/eventstream", + "https://internal.example.com/mcp", ]); }); }); diff --git a/tests/layer3/layer3-integration.test.ts b/tests/layer3/layer3-integration.test.ts index 7339701..e0faaa3 100644 --- a/tests/layer3/layer3-integration.test.ts +++ b/tests/layer3/layer3-integration.test.ts @@ -27,7 +27,7 @@ describe("task 28 layer3 integration", () => { }, }, { - resourceId: "http:https://example.invalid/schema", + resourceId: "https://example.invalid/schema", approved: true, status: "ok", result: { @@ -121,7 +121,7 @@ describe("task 28 layer3 integration", () => { it("derives tool-description and toxic-flow findings from metadata tools", () => { const outcomes: DeepScanOutcome[] = [ { - resourceId: "http:https://mcp.example/tools", + resourceId: "https://mcp.example/tools", approved: true, status: "ok", result: { diff --git a/tests/layer3/no-outbound-calls.test.ts b/tests/layer3/no-outbound-calls.test.ts index 608c238..e5dfa8c 100644 --- a/tests/layer3/no-outbound-calls.test.ts +++ b/tests/layer3/no-outbound-calls.test.ts @@ -37,11 +37,11 @@ describe("deep scan makes no outbound HTTP calls", () => { const testCases = [ { - id: "http:https://mcp.evil.com/tools", + id: "https://mcp.evil.com/tools", request: { kind: "http", locator: "https://mcp.evil.com/tools" }, }, { - id: "sse:https://mcp.evil.com/sse", + id: "https://mcp.evil.com/sse", request: { kind: "sse", locator: "https://mcp.evil.com/sse" }, }, { id: "npm:@evil/backdoor", request: { kind: "npm", locator: "@evil/backdoor" } }, diff --git a/tests/layer3/url-validation.test.ts b/tests/layer3/url-validation.test.ts new file mode 100644 index 0000000..c22eca7 --- /dev/null +++ b/tests/layer3/url-validation.test.ts @@ -0,0 +1,91 @@ +import { describe, expect, it } from "vitest"; +import { buildResourceId, normalizeRemoteUrl } from "../../src/layer3-dynamic/url-validation"; + +describe("normalizeRemoteUrl", () => { + it("accepts http and https URLs and returns them in canonical form", () => { + const http = normalizeRemoteUrl("http://0.0.0.0:8007/mcp"); + expect(http.ok).toBe(true); + if (http.ok) { + expect(http.url).toBe("http://0.0.0.0:8007/mcp"); + expect(http.scheme).toBe("http"); + } + + const https = normalizeRemoteUrl("https://mcp.linear.app/mcp"); + expect(https.ok).toBe(true); + if (https.ok) { + expect(https.url).toBe("https://mcp.linear.app/mcp"); + expect(https.scheme).toBe("https"); + } + }); + + it("rejects non http/https schemes", () => { + expect(normalizeRemoteUrl("file:///etc/passwd").ok).toBe(false); + expect(normalizeRemoteUrl("ftp://ftp.example.com/pub").ok).toBe(false); + expect(normalizeRemoteUrl("ssh://user@example.com").ok).toBe(false); + const result = normalizeRemoteUrl("javascript:alert(1)"); + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.reason).toBe("missing_scheme"); + } + }); + + it("rejects URLs without a host", () => { + const bare = normalizeRemoteUrl("http://"); + expect(bare.ok).toBe(false); + const justScheme = normalizeRemoteUrl("http:"); + expect(justScheme.ok).toBe(false); + if (!justScheme.ok) { + expect(justScheme.reason).toBe("missing_host"); + } + }); + + it("rejects empty / whitespace input", () => { + expect(normalizeRemoteUrl("").ok).toBe(false); + expect(normalizeRemoteUrl(" ").ok).toBe(false); + }); + + it("strips trailing slashes from non-root paths and preserves root slash", () => { + const withSlash = normalizeRemoteUrl("https://example.com/api/v1/"); + expect(withSlash.ok).toBe(true); + if (withSlash.ok) { + expect(withSlash.url).toBe("https://example.com/api/v1"); + } + + // Root path: URL class already canonicalises to trailing `/` which we keep. + const root = normalizeRemoteUrl("https://example.com"); + expect(root.ok).toBe(true); + if (root.ok) { + expect(root.url).toBe("https://example.com/"); + } + }); +}); + +describe("buildResourceId", () => { + it("uses the URL itself (no double scheme) for http and sse kinds", () => { + expect(buildResourceId("http", "https://mcp.linear.app/mcp")).toBe( + "https://mcp.linear.app/mcp", + ); + expect(buildResourceId("http", "http://0.0.0.0:8007/mcp")).toBe("http://0.0.0.0:8007/mcp"); + expect(buildResourceId("sse", "https://example.com/sse")).toBe("https://example.com/sse"); + }); + + it("preserves the kind prefix for registry metadata kinds", () => { + expect(buildResourceId("npm", "@org/pkg")).toBe("npm:@org/pkg"); + expect(buildResourceId("pypi", "requests")).toBe("pypi:requests"); + expect(buildResourceId("git", "https://github.com/org/repo")).toBe( + "git:https://github.com/org/repo", + ); + }); + + it("produces resource IDs that do not start with http:http or http:https", () => { + for (const url of [ + "http://0.0.0.0:8007/mcp", + "https://mcp.linear.app/mcp", + "https://example.com/sse", + ]) { + const id = buildResourceId("http", url); + expect(id.startsWith("http:http://")).toBe(false); + expect(id.startsWith("http:https://")).toBe(false); + } + }); +}); diff --git a/tests/path-display.test.ts b/tests/path-display.test.ts index f5fe522..f3dcea3 100644 --- a/tests/path-display.test.ts +++ b/tests/path-display.test.ts @@ -9,8 +9,8 @@ describe("path display", () => { }); it("keeps URI-like resource identifiers unchanged", () => { - expect(toAbsoluteDisplayPath("/tmp/codegate-case3", "http:https://mcp.linear.app/mcp")).toBe( - "http:https://mcp.linear.app/mcp", + expect(toAbsoluteDisplayPath("/tmp/codegate-case3", "https://mcp.linear.app/mcp")).toBe( + "https://mcp.linear.app/mcp", ); }); }); From 846398905d0b6a9280d5470d155ef7db357a8e99 Mon Sep 17 00:00:00 2001 From: Jonathan Santilli <1774227+jonathansantilli@users.noreply.github.com> Date: Wed, 22 Apr 2026 12:21:07 +0100 Subject: [PATCH 2/2] fix(layer3): add configurable timeout and byte-size guard for remote fetches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A slow or malicious remote host could previously hang a scan or flood it with gigabytes of data through a Layer 3 resource probe. Harden the remote fetcher with two new limits: * `layer3_remote_fetch_timeout_ms` (default 5000) — per-attempt timeout enforced via AbortController. Overridable by `CODEGATE_LAYER3_REMOTE_FETCH_TIMEOUT_MS`. * `layer3_remote_fetch_max_bytes` (default 1_048_576) — rejects declared Content-Length above the cap immediately, and aborts the streaming read once bytes exceed it (so servers that lie about / omit the header cannot bypass the guard). Overridable by `CODEGATE_LAYER3_REMOTE_FETCH_MAX_BYTES`. Both limits are exposed only via config file and env var — no CLI flag. The fetcher exports `resourceFetcherOptionsFromConfig` as a small helper for callers plumbing the limits from `CodeGateConfig`. --- src/config.ts | 60 +++++++++ src/layer3-dynamic/resource-fetcher.ts | 126 ++++++++++++++++- .../tool-description-acquisition.ts | 12 +- .../config/layer3-remote-fetch-limits.test.ts | 101 ++++++++++++++ tests/layer3/resource-fetcher-limits.test.ts | 127 ++++++++++++++++++ 5 files changed, 417 insertions(+), 9 deletions(-) create mode 100644 tests/config/layer3-remote-fetch-limits.test.ts create mode 100644 tests/layer3/resource-fetcher-limits.test.ts diff --git a/src/config.ts b/src/config.ts index 685774a..adab95c 100644 --- a/src/config.ts +++ b/src/config.ts @@ -76,6 +76,22 @@ export interface CodeGateConfig { workflow_audits?: WorkflowAuditConfig; suppress_findings: string[]; suppression_rules?: SuppressionRule[]; + /** + * Timeout (in milliseconds) applied to Layer 3 remote resource fetches + * (npm/PyPI registry lookups, git ls-remote, and any http/sse MCP probes). + * Kept deliberately low so a slow or deliberately stalling host cannot + * hang a scan. Overridable via `CODEGATE_LAYER3_REMOTE_FETCH_TIMEOUT_MS`. + */ + layer3_remote_fetch_timeout_ms: number; + /** + * Maximum response size (in bytes) accepted from a Layer 3 remote fetch. + * A declared `Content-Length` above this value is rejected immediately, + * and the streaming reader aborts once the running byte count exceeds + * this limit (defends against servers that lie about or omit + * `Content-Length`). Overridable via + * `CODEGATE_LAYER3_REMOTE_FETCH_MAX_BYTES`. + */ + layer3_remote_fetch_max_bytes: number; } interface PartialTuiConfig { @@ -122,6 +138,8 @@ interface PartialCodeGateConfig { }; suppress_findings?: string[]; suppression_rules?: SuppressionRule[]; + layer3_remote_fetch_timeout_ms?: number; + layer3_remote_fetch_max_bytes?: number; } export interface CliConfigOverrides { @@ -175,8 +193,36 @@ export const DEFAULT_CONFIG: CodeGateConfig = { workflow_audits: { enabled: false }, suppress_findings: [], suppression_rules: [], + layer3_remote_fetch_timeout_ms: 5000, + layer3_remote_fetch_max_bytes: 1_048_576, }; +/** Env var name that overrides `layer3_remote_fetch_timeout_ms`. */ +export const LAYER3_REMOTE_FETCH_TIMEOUT_ENV = "CODEGATE_LAYER3_REMOTE_FETCH_TIMEOUT_MS"; +/** Env var name that overrides `layer3_remote_fetch_max_bytes`. */ +export const LAYER3_REMOTE_FETCH_MAX_BYTES_ENV = "CODEGATE_LAYER3_REMOTE_FETCH_MAX_BYTES"; + +function normalizePositiveInteger(value: unknown): number | undefined { + if (typeof value === "number" && Number.isFinite(value) && value > 0) { + return Math.floor(value); + } + if (typeof value === "string" && value.trim().length > 0) { + const parsed = Number(value.trim()); + if (Number.isFinite(parsed) && parsed > 0) { + return Math.floor(parsed); + } + } + return undefined; +} + +function readEnvOverride(name: string): number | undefined { + const raw = process.env[name]; + if (raw === undefined) { + return undefined; + } + return normalizePositiveInteger(raw); +} + interface PartialRulePolicyConfig { disable?: boolean; ignore?: string[]; @@ -627,6 +673,20 @@ export function resolveEffectiveConfig(options: ResolveConfigOptions): CodeGateC ...(globalConfig.suppression_rules ?? []), ...(projectConfig.suppression_rules ?? []), ], + layer3_remote_fetch_timeout_ms: + pickFirst( + readEnvOverride(LAYER3_REMOTE_FETCH_TIMEOUT_ENV), + normalizePositiveInteger(projectConfig.layer3_remote_fetch_timeout_ms), + normalizePositiveInteger(globalConfig.layer3_remote_fetch_timeout_ms), + DEFAULT_CONFIG.layer3_remote_fetch_timeout_ms, + ) ?? DEFAULT_CONFIG.layer3_remote_fetch_timeout_ms, + layer3_remote_fetch_max_bytes: + pickFirst( + readEnvOverride(LAYER3_REMOTE_FETCH_MAX_BYTES_ENV), + normalizePositiveInteger(projectConfig.layer3_remote_fetch_max_bytes), + normalizePositiveInteger(globalConfig.layer3_remote_fetch_max_bytes), + DEFAULT_CONFIG.layer3_remote_fetch_max_bytes, + ) ?? DEFAULT_CONFIG.layer3_remote_fetch_max_bytes, }; } diff --git a/src/layer3-dynamic/resource-fetcher.ts b/src/layer3-dynamic/resource-fetcher.ts index d1f3275..d6a3a74 100644 --- a/src/layer3-dynamic/resource-fetcher.ts +++ b/src/layer3-dynamic/resource-fetcher.ts @@ -11,6 +11,29 @@ export interface ResourceRequest { export interface ResourceFetcherOptions { maxRetries?: number; timeoutMs?: number; + /** + * Maximum number of bytes accepted in the response body. Enforced against + * both the declared `Content-Length` header (if present) and the running + * byte count during streaming read. Defaults to 1 MiB. + */ + maxBytes?: number; +} + +export const DEFAULT_FETCH_TIMEOUT_MS = 5000; +export const DEFAULT_FETCH_MAX_BYTES = 1_048_576; + +/** + * Extract Layer 3 remote-fetch limits from the resolved CodeGate config. + * Kept here so callers don't have to remember the config field names. + */ +export function resourceFetcherOptionsFromConfig(config: { + layer3_remote_fetch_timeout_ms: number; + layer3_remote_fetch_max_bytes: number; +}): ResourceFetcherOptions { + return { + timeoutMs: config.layer3_remote_fetch_timeout_ms, + maxBytes: config.layer3_remote_fetch_max_bytes, + }; } export interface ResourceFetcherDeps { @@ -58,12 +81,80 @@ function endpointFor(request: ResourceRequest): string { return request.locator; } -async function parseResponse(response: Response): Promise { +/** + * Read a response body while enforcing `maxBytes`. Returns the collected + * string, or throws a tagged error if the declared `Content-Length` or the + * streamed size exceeds the cap. + */ +async function readBodyWithLimit(response: Response, maxBytes: number): Promise { + const declared = response.headers.get("content-length"); + if (declared !== null) { + const parsed = Number(declared); + if (Number.isFinite(parsed) && parsed > maxBytes) { + // Drain & release the stream without reading bytes. + try { + await response.body?.cancel(); + } catch { + // no-op: cancel failures are non-fatal. + } + throw new Error( + `response_too_large: declared Content-Length ${parsed} exceeds limit ${maxBytes}`, + ); + } + } + + const body = response.body; + if (!body) { + // No stream (e.g., HEAD or empty body): fall back to text(). + const text = await response.text(); + if (Buffer.byteLength(text, "utf8") > maxBytes) { + throw new Error(`response_too_large: body ${Buffer.byteLength(text, "utf8")} > ${maxBytes}`); + } + return text; + } + + const reader = body.getReader(); + const chunks: Uint8Array[] = []; + let total = 0; + try { + while (true) { + const { done, value } = await reader.read(); + if (done) { + break; + } + if (!value) { + continue; + } + total += value.byteLength; + if (total > maxBytes) { + try { + await reader.cancel(); + } catch { + // no-op + } + throw new Error(`response_too_large: streamed ${total} > ${maxBytes}`); + } + chunks.push(value); + } + } finally { + try { + reader.releaseLock(); + } catch { + // releaseLock throws if the reader was already cancelled; ignore. + } + } + + const buffer = Buffer.concat(chunks.map((chunk) => Buffer.from(chunk))); + return buffer.toString("utf8"); +} + +async function parseResponse(response: Response, maxBytes: number): Promise { const contentType = response.headers.get("content-type") ?? ""; + const text = await readBodyWithLimit(response, maxBytes); if (contentType.includes("application/json")) { - return (await response.json()) as unknown; + return JSON.parse(text) as unknown; } - return await response.text(); + return text; } function timeoutError(error: unknown): boolean { @@ -72,6 +163,11 @@ function timeoutError(error: unknown): boolean { return message.includes("timeout") || message.includes("aborted"); } +function isResponseTooLarge(error: unknown): boolean { + const message = error instanceof Error ? error.message : String(error); + return message.startsWith("response_too_large"); +} + export async function fetchResourceMetadata( request: ResourceRequest, customDeps: ResourceFetcherDeps = defaultDeps(), @@ -104,14 +200,19 @@ export async function fetchResourceMetadata( } const endpoint = endpointFor(request); - const timeoutMs = options.timeoutMs ?? 5000; + const timeoutMs = options.timeoutMs ?? DEFAULT_FETCH_TIMEOUT_MS; + const maxBytes = options.maxBytes ?? DEFAULT_FETCH_MAX_BYTES; for (let attempt = 0; attempt <= maxRetries; attempt += 1) { try { const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), timeoutMs); - const response = await deps.fetch(endpoint, { signal: controller.signal }); - clearTimeout(timer); + let response: Response; + try { + response = await deps.fetch(endpoint, { signal: controller.signal }); + } finally { + clearTimeout(timer); + } if (response.status === 401 || response.status === 403) { return { @@ -135,13 +236,24 @@ export async function fetchResourceMetadata( }; } + const metadata = await parseResponse(response, maxBytes); return { status: "ok", attempts: attempt + 1, elapsedMs: deps.now() - startedAt, - metadata: await parseResponse(response), + metadata, }; } catch (error) { + // Size-limit breaches are deterministic — do not retry, surface as network_error. + if (isResponseTooLarge(error)) { + return { + status: "network_error", + attempts: attempt + 1, + elapsedMs: deps.now() - startedAt, + error: error instanceof Error ? error.message : String(error), + }; + } + if (attempt < maxRetries) { await deps.sleep(100 * (attempt + 1)); continue; diff --git a/src/layer3-dynamic/tool-description-acquisition.ts b/src/layer3-dynamic/tool-description-acquisition.ts index 7566b71..e9a435d 100644 --- a/src/layer3-dynamic/tool-description-acquisition.ts +++ b/src/layer3-dynamic/tool-description-acquisition.ts @@ -1,6 +1,7 @@ import { fetchResourceMetadata, type ResourceFetchResult, + type ResourceFetcherOptions, type ResourceKind, type ResourceRequest, } from "./resource-fetcher.js"; @@ -37,9 +38,16 @@ export interface ToolDescriptionAcquisitionDeps { fetchMetadata: (request: ResourceRequest) => Promise; } -function defaultDeps(): ToolDescriptionAcquisitionDeps { +export interface ToolDescriptionAcquisitionOptions { + fetchOptions?: ResourceFetcherOptions; +} + +function defaultDeps( + options: ToolDescriptionAcquisitionOptions = {}, +): ToolDescriptionAcquisitionDeps { return { - fetchMetadata: async (request) => fetchResourceMetadata(request), + fetchMetadata: async (request) => + fetchResourceMetadata(request, undefined, options.fetchOptions), }; } diff --git a/tests/config/layer3-remote-fetch-limits.test.ts b/tests/config/layer3-remote-fetch-limits.test.ts new file mode 100644 index 0000000..16c2056 --- /dev/null +++ b/tests/config/layer3-remote-fetch-limits.test.ts @@ -0,0 +1,101 @@ +import { mkdtempSync, mkdirSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, describe, expect, it } from "vitest"; +import { + DEFAULT_CONFIG, + LAYER3_REMOTE_FETCH_MAX_BYTES_ENV, + LAYER3_REMOTE_FETCH_TIMEOUT_ENV, + resolveEffectiveConfig, +} from "../../src/config"; + +const tempDirs: string[] = []; + +function makeTempDir(prefix: string): string { + const dir = mkdtempSync(join(tmpdir(), prefix)); + tempDirs.push(dir); + return dir; +} + +afterEach(() => { + delete process.env[LAYER3_REMOTE_FETCH_TIMEOUT_ENV]; + delete process.env[LAYER3_REMOTE_FETCH_MAX_BYTES_ENV]; + tempDirs.splice(0); +}); + +describe("layer3 remote fetch limits config", () => { + it("defaults to 5000ms / 1 MiB", () => { + const home = makeTempDir("codegate-l3-defaults-home-"); + const project = makeTempDir("codegate-l3-defaults-project-"); + const resolved = resolveEffectiveConfig({ scanTarget: project, homeDir: home }); + expect(resolved.layer3_remote_fetch_timeout_ms).toBe(5000); + expect(resolved.layer3_remote_fetch_max_bytes).toBe(1_048_576); + expect(DEFAULT_CONFIG.layer3_remote_fetch_timeout_ms).toBe(5000); + expect(DEFAULT_CONFIG.layer3_remote_fetch_max_bytes).toBe(1_048_576); + }); + + it("honours project config > global config > defaults", () => { + const home = makeTempDir("codegate-l3-files-home-"); + const project = makeTempDir("codegate-l3-files-project-"); + mkdirSync(join(home, ".codegate"), { recursive: true }); + writeFileSync( + join(home, ".codegate", "config.json"), + JSON.stringify({ + layer3_remote_fetch_timeout_ms: 7000, + layer3_remote_fetch_max_bytes: 2048, + }), + "utf8", + ); + writeFileSync( + join(project, ".codegate.json"), + JSON.stringify({ layer3_remote_fetch_timeout_ms: 2500 }), + "utf8", + ); + + const resolved = resolveEffectiveConfig({ scanTarget: project, homeDir: home }); + // Project override wins on timeout. + expect(resolved.layer3_remote_fetch_timeout_ms).toBe(2500); + // No project override on max_bytes => global value applies. + expect(resolved.layer3_remote_fetch_max_bytes).toBe(2048); + }); + + it("env var takes precedence over project and global config", () => { + const home = makeTempDir("codegate-l3-env-home-"); + const project = makeTempDir("codegate-l3-env-project-"); + mkdirSync(join(home, ".codegate"), { recursive: true }); + writeFileSync( + join(home, ".codegate", "config.json"), + JSON.stringify({ + layer3_remote_fetch_timeout_ms: 4000, + layer3_remote_fetch_max_bytes: 2048, + }), + "utf8", + ); + writeFileSync( + join(project, ".codegate.json"), + JSON.stringify({ + layer3_remote_fetch_timeout_ms: 3000, + layer3_remote_fetch_max_bytes: 4096, + }), + "utf8", + ); + + process.env[LAYER3_REMOTE_FETCH_TIMEOUT_ENV] = "1500"; + process.env[LAYER3_REMOTE_FETCH_MAX_BYTES_ENV] = "16384"; + + const resolved = resolveEffectiveConfig({ scanTarget: project, homeDir: home }); + expect(resolved.layer3_remote_fetch_timeout_ms).toBe(1500); + expect(resolved.layer3_remote_fetch_max_bytes).toBe(16384); + }); + + it("ignores invalid env var values and falls back to config/defaults", () => { + const home = makeTempDir("codegate-l3-bad-env-home-"); + const project = makeTempDir("codegate-l3-bad-env-project-"); + process.env[LAYER3_REMOTE_FETCH_TIMEOUT_ENV] = "not-a-number"; + process.env[LAYER3_REMOTE_FETCH_MAX_BYTES_ENV] = "-42"; + + const resolved = resolveEffectiveConfig({ scanTarget: project, homeDir: home }); + expect(resolved.layer3_remote_fetch_timeout_ms).toBe(5000); + expect(resolved.layer3_remote_fetch_max_bytes).toBe(1_048_576); + }); +}); diff --git a/tests/layer3/resource-fetcher-limits.test.ts b/tests/layer3/resource-fetcher-limits.test.ts new file mode 100644 index 0000000..a5ed6d2 --- /dev/null +++ b/tests/layer3/resource-fetcher-limits.test.ts @@ -0,0 +1,127 @@ +import { describe, expect, it, vi } from "vitest"; +import { + DEFAULT_FETCH_MAX_BYTES, + DEFAULT_FETCH_TIMEOUT_MS, + fetchResourceMetadata, + resourceFetcherOptionsFromConfig, + type ResourceFetcherDeps, +} from "../../src/layer3-dynamic/resource-fetcher"; + +function depsWithFetch(fetchImpl: ResourceFetcherDeps["fetch"]): ResourceFetcherDeps { + return { + fetch: fetchImpl, + runCommand: async () => ({ code: 0, stdout: "ok", stderr: "" }), + sleep: async () => {}, + now: () => Date.now(), + }; +} + +describe("resource-fetcher size limits", () => { + it("rejects responses whose declared Content-Length exceeds maxBytes", async () => { + const fetch = vi.fn(async () => { + // Body that would exceed the limit; the fetcher should bail out before reading. + return new Response("x".repeat(10), { + status: 200, + headers: { + "content-type": "application/json", + "content-length": "5000000", + }, + }); + }); + + const result = await fetchResourceMetadata( + { id: "npm:big", kind: "npm", locator: "big-package" }, + depsWithFetch(fetch), + { maxBytes: 1024, maxRetries: 0 }, + ); + + expect(result.status).toBe("network_error"); + expect(result.error).toContain("response_too_large"); + }); + + it("aborts mid-stream when bytes exceed maxBytes despite a missing Content-Length", async () => { + // A stream that emits chunks larger than the configured limit. + const body = new ReadableStream({ + start(controller) { + controller.enqueue(new TextEncoder().encode("aaaaaaaaaa")); // 10 bytes + controller.enqueue(new TextEncoder().encode("bbbbbbbbbb")); // 20 bytes total + controller.close(); + }, + }); + const fetch = vi.fn(async () => { + return new Response(body, { + status: 200, + headers: { "content-type": "application/json" }, + }); + }); + + const result = await fetchResourceMetadata( + { id: "http:stream", kind: "http", locator: "https://example.com/stream" }, + depsWithFetch(fetch), + { maxBytes: 15, maxRetries: 0 }, + ); + + expect(result.status).toBe("network_error"); + expect(result.error).toContain("response_too_large"); + }); + + it("accepts responses within the size limit and returns parsed metadata", async () => { + const payload = JSON.stringify({ name: "@org/pkg", version: "1.0.0" }); + const fetch = vi.fn(async () => { + return new Response(payload, { + status: 200, + headers: { + "content-type": "application/json", + "content-length": String(Buffer.byteLength(payload, "utf8")), + }, + }); + }); + + const result = await fetchResourceMetadata( + { id: "npm:pkg", kind: "npm", locator: "@org/pkg" }, + depsWithFetch(fetch), + { maxBytes: 4096 }, + ); + + expect(result.status).toBe("ok"); + expect(result.metadata).toEqual({ name: "@org/pkg", version: "1.0.0" }); + }); + + it("passes the configured timeout to AbortController", async () => { + let receivedSignal: AbortSignal | null = null; + const fetch = vi.fn(async (_input, init?: RequestInit) => { + receivedSignal = init?.signal ?? null; + return new Response("{}", { + status: 200, + headers: { "content-type": "application/json" }, + }); + }); + + await fetchResourceMetadata( + { id: "npm:ok", kind: "npm", locator: "ok" }, + depsWithFetch(fetch), + { timeoutMs: 123, maxRetries: 0 }, + ); + + // We cannot observe the timeout directly, but we can verify the abort + // signal plumbed through the fetch call. + expect(receivedSignal).not.toBeNull(); + }); +}); + +describe("resourceFetcherOptionsFromConfig", () => { + it("maps CodeGate config fields to fetcher options", () => { + const options = resourceFetcherOptionsFromConfig({ + layer3_remote_fetch_timeout_ms: 7500, + layer3_remote_fetch_max_bytes: 2048, + }); + expect(options).toEqual({ timeoutMs: 7500, maxBytes: 2048 }); + }); +}); + +describe("fetcher defaults", () => { + it("exposes sensible defaults", () => { + expect(DEFAULT_FETCH_TIMEOUT_MS).toBe(5000); + expect(DEFAULT_FETCH_MAX_BYTES).toBe(1_048_576); + }); +});