From 356b58cc52f513006203064bdf496c0491fbf52b Mon Sep 17 00:00:00 2001
From: Jonathan Santilli <1774227+jonathansantilli@users.noreply.github.com>
Date: Wed, 22 Apr 2026 17:44:55 +0100
Subject: [PATCH] fix(scan): disable user-scope walk when CLI scans a single
 file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #53 closed the cross-scan leak for skill-dir scans but not for
single-file scans of configs like ~/.claude/settings.json. Symptom:

  $ codegate-ai scan ~/.claude/settings.json
  → finding with file_path=~/.agents/skills/api-design-guide/.../SKILL.md

Root cause: the CLI stages single-file targets into a temp dir outside
$HOME. The staged dir is not inside homeDir, so
shouldKeepUserScopeCandidate short-circuits to `return true` and every
sibling user-scope match (e.g. a hidden-unicode hit in a completely
unrelated skill) gets attributed to the config scan.

Fix:
- cli.ts: when resolvedTarget.explicitCandidates is non-empty (the
  target was a staged local file), force scan_user_scope=false for that
  scan. Explicit opt-in via --include-user-scope still overrides. This
  matches user expectation: "scan this file" ≠ "scan my whole home."
- scan.ts: shouldKeepUserScopeCandidate now also handles engine-level
  file targets correctly (if the target is a file inside homeDir, only
  the target file itself is a valid user-scope candidate). This is
  defence in depth for library callers that bypass the CLI.

Tests:
- Existing 3 cases in tests/layer2/cross-scan-attribution.test.ts still
  pass.
- New: engine-level file-target scan drops sibling user-scope candidates.

Verified 154 test files / 720 tests pass. Lint + prettier clean.
---
 src/cli.ts                                  | 13 +++++-
 src/scan.ts                                 | 45 +++++++++++++++++----
 tests/layer2/cross-scan-attribution.test.ts | 42 +++++++++++++++++++
 3 files changed, 92 insertions(+), 8 deletions(-)

diff --git a/src/cli.ts b/src/cli.ts
index 3e57b9e..4dd7e35 100644
--- a/src/cli.ts
+++ b/src/cli.ts
@@ -493,8 +493,19 @@ function addScanCommand(program: Command, version: string, deps: CliDeps): void
                 ? true
                 : (baseConfig.workflow_audits?.enabled ?? false),
           },
+          // When the target was a single local file that got staged into a
+          // temp dir (explicitCandidates set), walking the full user-scope
+          // tree is off-target: the user asked to scan one file, not their
+          // whole home. Leaving user-scope on here let sibling findings
+          // (e.g. `~/.agents/skills/*/SKILL.md`) leak into single-file
+          // scans of configs like `.claude/settings.json`. Explicit opt-in
+          // via `--include-user-scope` still forces it on.
           scan_user_scope:
-            options.includeUserScope === true ? true : (baseConfig.scan_user_scope ?? false),
+            options.includeUserScope === true
+              ? true
+              : resolvedTarget.explicitCandidates && resolvedTarget.explicitCandidates.length > 0
+                ? false
+                : (baseConfig.scan_user_scope ?? false),
         };
 
         if (options.resetState) {
diff --git a/src/scan.ts b/src/scan.ts
index 8d0036c..9038bea 100644
--- a/src/scan.ts
+++ b/src/scan.ts
@@ -260,22 +260,53 @@ function isPathInside(root: string, candidatePath: string): boolean {
  * User-scope patterns (e.g. `~/.agents/skills/&ast;/SKILL.md`) walk the whole
  * home directory, so they can match files belonging to completely unrelated
  * skills or agents. When the scan target is itself a specific location
- * **inside** the user's home — e.g. scanning a single skill directory — any
- * user-scope match outside that scan target belongs to a different scan and
- * must not be attributed here.
+ * **inside** the user's home — e.g. scanning a single skill directory or a
+ * single config file like `~/.claude/settings.json` — any user-scope match
+ * that does not belong to that target is a cross-scan leak and must be
+ * dropped.
  *
- * When the scan target lives outside the home directory (for example a
- * project root in a workspace), user-scope matches are accepted as legitimate
- * host-wide context for that scan.
+ * Three cases:
+ * - `scanTarget` is a directory inside `homeDir`: only keep candidates inside
+ *   that directory (existing PR #53 behavior).
+ * - `scanTarget` is a file inside `homeDir`: only keep candidates that resolve
+ *   to that exact file. "Inside" semantics do not apply to files, so the
+ *   previous check let every sibling through.
+ * - `scanTarget` lives outside the home directory (or cannot be stat'd,
+ *   e.g. a URL or a staged path that has been cleaned up): user-scope
+ *   matches are accepted as legitimate host-wide context.
  */
 function shouldKeepUserScopeCandidate(
   scanTarget: string,
   homeDir: string,
   candidatePath: string,
 ): boolean {
-  if (isPathInside(homeDir, scanTarget)) {
+  if (!isPathInside(homeDir, scanTarget)) {
+    return true;
+  }
+
+  // Follow symlinks the same way the rest of the scan code does (walker,
+  // wildcard-base check, `isRegularFile`): `statSync` resolves them. If the
+  // target cannot be stat'd (missing / permission denied / URL that was never
+  // a local path), fall through to the pre-PR-#53 outside-home behavior so
+  // we do not over-filter project-scope scans on unusual inputs.
+  let targetStat;
+  try {
+    targetStat = statSync(scanTarget);
+  } catch {
+    return true;
+  }
+
+  if (targetStat.isFile()) {
+    // Nothing is "inside" a file. The only user-scope candidate that can
+    // legitimately belong to a file-target scan is the file itself.
+    return resolve(candidatePath) === resolve(scanTarget);
+  }
+
+  if (targetStat.isDirectory()) {
     return isPathInside(scanTarget, candidatePath);
   }
+
+  // Sockets, devices, etc. — behave like the outside-home case.
   return true;
 }
 
diff --git a/tests/layer2/cross-scan-attribution.test.ts b/tests/layer2/cross-scan-attribution.test.ts
index 50f1351..24136dd 100644
--- a/tests/layer2/cross-scan-attribution.test.ts
+++ b/tests/layer2/cross-scan-attribution.test.ts
@@ -155,4 +155,46 @@ describe("cross-scan attribution — Layer 2 hidden-unicode rule", () => {
       ),
     ).toBe(true);
   });
+
+  it("engine-level: file-target scan drops user-scope siblings", async () => {
+    // Covers the case where runScanEngine is called directly with a file
+    // target inside homeDir (library/embedded callers). The scope filter
+    // in `shouldKeepUserScopeCandidate` rejects every candidate that is
+    // not the target file itself.
+    //
+    // NB: the CLI stages file targets into a temp dir before calling the
+    // engine — see the "CLI-level" test below for that path.
+    const home = mkdtempSync(join(tmpdir(), "codegate-cross-scan-file-home-"));
+
+    // Sibling: a skill with hidden Unicode under home.
+    mkdirSync(join(home, ".agents", "skills", "bar"), { recursive: true });
+    writeFileSync(
+      join(home, ".agents", "skills", "bar", "SKILL.md"),
+      "Sibling skill​ with hidden zero-width space.\n",
+      "utf8",
+    );
+
+    // Target dir wrapping a single config file — simulates a consumer that
+    // has already placed the file in a dedicated dir (runScanEngine
+    // rejects bare files, hence the wrapper).
+    const targetDir = join(home, ".claude");
+    mkdirSync(targetDir, { recursive: true });
+    writeFileSync(
+      join(targetDir, "settings.json"),
+      `{\n  "permissions": { "allow": [] }\n}\n`,
+      "utf8",
+    );
+
+    const report = await runScanEngine({
+      version: "0.1.0",
+      scanTarget: targetDir,
+      config: BASE_CONFIG,
+      homeDir: home,
+    });
+
+    const leaked = report.findings.filter(
+      (f) => typeof f.file_path === "string" && f.file_path.includes(".agents/skills/bar"),
+    );
+    expect(leaked).toEqual([]);
+  });
 });