diff --git a/docs/checks/content-discoverability.md b/docs/checks/content-discoverability.md
index 546b5b3..85534e9 100644
--- a/docs/checks/content-discoverability.md
+++ b/docs/checks/content-discoverability.md
@@ -41,6 +41,21 @@ If any of these redirect cross-host (e.g., `example.com` redirects to `docs.exam
 
 If your `llms.txt` lives at a location not covered by these candidates, AFDocs won't find it. You can either move it to one of the candidate locations or [open an issue](https://github.com/agent-ecosystem/afdocs/issues) to suggest expanding the candidate list.
 
+### Canonical selection
+
+When more than one candidate returns a file (e.g. an apex `llms.txt` for the marketing site _and_ a `/docs/llms.txt` for the docs section), AFDocs picks one as **canonical**. The canonical file is the single source of truth for downstream checks: link sampling, size, validation, freshness, and link-resolution all operate on it alone. Other discovered files still appear in `details.discoveredFiles` for visibility, and `cache-header-hygiene` still verifies headers on every llms.txt found.
+
+The selection rule is _most-specific-to-the-baseUrl wins_. AFDocs picks the file whose directory is the longest prefix of the URL you passed. For example:
+
+| You passed            | Files found                                 | Canonical                       |
+| --------------------- | ------------------------------------------- | ------------------------------- |
+| `example.com/docs`    | `/llms.txt` and `/docs/llms.txt`            | `/docs/llms.txt`                |
+| `example.com`         | `/llms.txt` and `/docs/llms.txt`            | `/llms.txt`                     |
+| `example.com/docs/v1` | `/llms.txt`, `/docs/llms.txt`, `/docs/v1/…` | `/docs/v1/llms.txt`             |
+| `example.com/docs/v1` | `/llms.txt` and `/docs/llms.txt`            | `/docs/llms.txt` (longer match) |
+
+Use `--llms-txt-url` (or the `llmsTxtUrl` config option) to override the heuristic when the canonical file lives at a non-standard path. See the [CLI reference](/reference/cli#llms-txt-selection) for details.
+
 ### How to fix
 
 **If this check fails**, create an `llms.txt` at one of the candidate locations above. The file should contain an H1 title, a blockquote summary, and markdown links to your key documentation pages. See the [llms.txt specification](https://llmstxt.org/) for the format.
diff --git a/docs/reference/cli.md b/docs/reference/cli.md
index 7938d9a..c9a9190 100644
--- a/docs/reference/cli.md
+++ b/docs/reference/cli.md
@@ -166,6 +166,27 @@ Use `--canonical-origin` when your site's URLs in `sitemap.xml` and `llms.txt` d
 afdocs check https://preview-xyz-example.app/docs --canonical-origin https://example.com
 ```
 
+### llms.txt selection
+
+| Flag                   | Default | Description                                                              |
+| ---------------------- | ------- | ------------------------------------------------------------------------ |
+| `--llms-txt-url <url>` |         | Explicit llms.txt URL to use as canonical (bypasses discovery heuristic) |
+
+By default, `afdocs` discovers llms.txt at three candidate locations: `{baseUrl}/llms.txt`, `{origin}/llms.txt`, and `{origin}/docs/llms.txt`. When more than one of these returns a file, the most-specific one — the one whose directory is the longest prefix of the URL you passed — is used as canonical. Downstream checks (size, validity, link sampling) all operate on the canonical file.
+
+For most sites this heuristic does the right thing. Use `--llms-txt-url` to override it when:
+
+- The canonical llms.txt lives at a non-standard path (e.g. `/docs/v3/llms.txt`)
+- A single origin serves multiple docs surfaces and you want to score one specifically
+- You want to verify a specific file before publishing
+
+```bash
+# Score a docs section explicitly, ignoring an apex /llms.txt
+afdocs check https://example.com/docs --llms-txt-url https://example.com/docs/llms.txt
+```
+
+When the override is set, `llms-txt-exists` probes only that URL and reports failure if it isn't reachable. The cross-host redirect fallback is skipped.
+
 ### Size thresholds
 
 | Flag                   | Default  | Description                            |
diff --git a/docs/reference/config-file.md b/docs/reference/config-file.md
index ddde547..f6d2919 100644
--- a/docs/reference/config-file.md
+++ b/docs/reference/config-file.md
@@ -32,6 +32,7 @@ options:
   preferredLocale: en
   preferredVersion: v3
   canonicalOrigin: https://example.com
+  llmsTxtUrl: https://example.com/docs/llms.txt
   thresholds:
     pass: 50000
     fail: 100000
@@ -70,18 +71,19 @@ skipChecks:
 
 Override default runner options. All fields are optional:
 
-| Field              | Default     | Description                                                |
-| ------------------ | ----------- | ---------------------------------------------------------- |
-| `maxLinksToTest`   | `50`        | Maximum number of pages to sample                          |
-| `samplingStrategy` | `random`    | `random`, `deterministic`, `curated`, or `none`            |
-| `maxConcurrency`   | `3`         | Maximum concurrent HTTP requests                           |
-| `requestDelay`     | `200`       | Delay between requests in milliseconds                     |
-| `requestTimeout`   | `30000`     | Timeout for individual HTTP requests in milliseconds       |
-| `preferredLocale`  | auto-detect | Preferred locale for URL discovery (e.g. `en`, `fr`, `ja`) |
-| `preferredVersion` | auto-detect | Preferred version for URL discovery (e.g. `v3`, `2.x`)     |
-| `canonicalOrigin`  |             | The production domain your content links to                |
-| `thresholds.pass`  | `50000`     | Page size pass threshold in characters                     |
-| `thresholds.fail`  | `100000`    | Page size fail threshold in characters                     |
+| Field              | Default     | Description                                                                                 |
+| ------------------ | ----------- | ------------------------------------------------------------------------------------------- |
+| `maxLinksToTest`   | `50`        | Maximum number of pages to sample                                                           |
+| `samplingStrategy` | `random`    | `random`, `deterministic`, `curated`, or `none`                                             |
+| `maxConcurrency`   | `3`         | Maximum concurrent HTTP requests                                                            |
+| `requestDelay`     | `200`       | Delay between requests in milliseconds                                                      |
+| `requestTimeout`   | `30000`     | Timeout for individual HTTP requests in milliseconds                                        |
+| `preferredLocale`  | auto-detect | Preferred locale for URL discovery (e.g. `en`, `fr`, `ja`)                                  |
+| `preferredVersion` | auto-detect | Preferred version for URL discovery (e.g. `v3`, `2.x`)                                      |
+| `canonicalOrigin`  |             | The production domain your content links to                                                 |
+| `llmsTxtUrl`       |             | Explicit llms.txt URL to use as canonical (overrides the discovery heuristic; see CLI docs) |
+| `thresholds.pass`  | `50000`     | Page size pass threshold in characters                                                      |
+| `thresholds.fail`  | `100000`    | Page size fail threshold in characters                                                      |
 
 ### `pages` (optional)
 
diff --git a/src/checks/content-discoverability/llms-txt-exists.ts b/src/checks/content-discoverability/llms-txt-exists.ts
index c6b4104..fc45816 100644
--- a/src/checks/content-discoverability/llms-txt-exists.ts
+++ b/src/checks/content-discoverability/llms-txt-exists.ts
@@ -1,4 +1,5 @@
 import { registerCheck } from '../registry.js';
+import { selectCanonicalLlmsTxt } from '../../helpers/llms-txt.js';
 import { isCrossHostRedirect } from '../../helpers/to-md-urls.js';
 import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js';
 
@@ -16,7 +17,8 @@ function getCandidateUrls(baseUrl: string, origin: string): string[] {
 }
 
 async function checkLlmsTxtExists(ctx: CheckContext): Promise<CheckResult> {
-  const candidates = getCandidateUrls(ctx.baseUrl, ctx.origin);
+  const explicitUrl = ctx.options.llmsTxtUrl;
+  const candidates = explicitUrl ? [explicitUrl] : getCandidateUrls(ctx.baseUrl, ctx.origin);
   const discovered: DiscoveredFile[] = [];
   const checkedUrls: Array<{
     url: string;
@@ -68,8 +70,11 @@ async function checkLlmsTxtExists(ctx: CheckContext): Promise<CheckResult> {
 
   // When no llms.txt found, check if any candidates redirected cross-host.
   // If so, try {redirected_origin}/llms.txt as a fallback.
+  // Skip the fallback when the user explicitly specified an llmsTxtUrl —
+  // they told us exactly where to look, so silently probing other origins
+  // would defeat the purpose of the override.
   const redirectedOrigins: string[] = [];
-  if (discovered.length === 0) {
+  if (discovered.length === 0 && !explicitUrl) {
     const checkedSet = new Set(checkedUrls.map((u) => u.url));
     const seenOrigins = new Set<string>();
     for (const checked of checkedUrls) {
@@ -134,6 +139,12 @@ async function checkLlmsTxtExists(ctx: CheckContext): Promise<CheckResult> {
     (fetchErrors > 0 ? `; ${fetchErrors} failed to fetch` : '') +
     (rateLimited > 0 ? `; ${rateLimited} rate-limited (HTTP 429)` : '');
 
+  // Pick the canonical llms.txt — the one downstream checks should use as the
+  // single source of truth for sampling links, measuring size, validating
+  // structure, etc. When multiple llms.txt files exist (apex + docs section),
+  // the heuristic prefers the most-specific one relative to the baseUrl.
+  const canonical = selectCanonicalLlmsTxt(discovered, ctx.baseUrl);
+
   // Store discovered files for downstream checks
   const details: Record<string, unknown> = {
     candidateUrls: checkedUrls,
@@ -142,6 +153,16 @@ async function checkLlmsTxtExists(ctx: CheckContext): Promise<CheckResult> {
     rateLimited,
   };
 
+  if (canonical) {
+    details.canonicalLlmsTxt = canonical;
+    details.canonicalUrl = canonical.url;
+    if (explicitUrl) {
+      details.canonicalSource = 'explicit';
+    } else if (discovered.length > 1) {
+      details.canonicalSource = 'heuristic';
+    }
+  }
+
   if (redirectedOrigins.length > 0) {
     details.redirectedOrigins = redirectedOrigins;
   }
@@ -174,11 +195,14 @@ async function checkLlmsTxtExists(ctx: CheckContext): Promise<CheckResult> {
       redirectedOrigins.length > 0
         ? `; candidates redirected cross-host to ${redirectedOrigins.join(', ')} (agents can't follow cross-host redirects)`
         : '';
+    const message = explicitUrl
+      ? `No llms.txt found at the URL specified via --llms-txt-url (${explicitUrl})${redirectNote}${suffix}`
+      : `No llms.txt found at any candidate location (${candidates.join(', ')})${redirectNote}${suffix}`;
     return {
       id: 'llms-txt-exists',
       category: 'content-discoverability',
       status: 'fail',
-      message: `No llms.txt found at any candidate location (${candidates.join(', ')})${redirectNote}${suffix}`,
+      message,
       details,
     };
   }
@@ -203,11 +227,24 @@ async function checkLlmsTxtExists(ctx: CheckContext): Promise<CheckResult> {
     details.sameContent = allSame;
   }
 
+  // Build a message that surfaces which file was picked as canonical, so users
+  // can see at a glance which one drives the rest of the report.
+  let message: string;
+  if (explicitUrl && canonical) {
+    message = `llms.txt found at ${canonical.url} (specified via --llms-txt-url)`;
+  } else if (discovered.length === 1) {
+    message = `llms.txt found at ${discovered[0].url}`;
+  } else if (canonical) {
+    message = `llms.txt found at ${discovered.length} locations; using ${canonical.url} as canonical`;
+  } else {
+    message = `llms.txt found at ${discovered.length} location(s)`;
+  }
+
   return {
     id: 'llms-txt-exists',
     category: 'content-discoverability',
     status: 'pass',
-    message: `llms.txt found at ${discovered.length} location(s)${suffix}`,
+    message: message + suffix,
     details,
   };
 }
diff --git a/src/checks/content-discoverability/llms-txt-links-markdown.ts b/src/checks/content-discoverability/llms-txt-links-markdown.ts
index fd8ff5a..83cb206 100644
--- a/src/checks/content-discoverability/llms-txt-links-markdown.ts
+++ b/src/checks/content-discoverability/llms-txt-links-markdown.ts
@@ -1,9 +1,10 @@
 import { registerCheck } from '../registry.js';
 import { extractMarkdownLinks } from './llms-txt-valid.js';
 import { filterByPathPrefix, getPathFilterBase } from '../../helpers/get-page-urls.js';
+import { getLlmsTxtFilesForAnalysis } from '../../helpers/llms-txt.js';
 import { toMdUrls } from '../../helpers/to-md-urls.js';
 import { looksLikeMarkdown } from '../../helpers/detect-markdown.js';
-import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js';
+import type { CheckContext, CheckResult } from '../../types.js';
 
 interface LinkMarkdownResult {
   url: string;
@@ -25,7 +26,7 @@ function hasMarkdownExtension(url: string): boolean {
 
 async function checkLlmsTxtLinksMarkdown(ctx: CheckContext): Promise<CheckResult> {
   const existsResult = ctx.previousResults.get('llms-txt-exists');
-  const discovered = (existsResult?.details?.discoveredFiles ?? []) as DiscoveredFile[];
+  const discovered = getLlmsTxtFilesForAnalysis(existsResult);
 
   if (discovered.length === 0) {
     return {
diff --git a/src/checks/content-discoverability/llms-txt-links-resolve.ts b/src/checks/content-discoverability/llms-txt-links-resolve.ts
index a09c10d..fee728c 100644
--- a/src/checks/content-discoverability/llms-txt-links-resolve.ts
+++ b/src/checks/content-discoverability/llms-txt-links-resolve.ts
@@ -2,7 +2,8 @@ import { registerCheck } from '../registry.js';
 import { LINK_RESOLVE_THRESHOLD } from '../../constants.js';
 import { extractMarkdownLinks } from './llms-txt-valid.js';
 import { filterByPathPrefix, getPathFilterBase } from '../../helpers/get-page-urls.js';
-import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js';
+import { getLlmsTxtFilesForAnalysis } from '../../helpers/llms-txt.js';
+import type { CheckContext, CheckResult } from '../../types.js';
 
 interface LinkCheckResult {
   url: string;
@@ -13,7 +14,7 @@ interface LinkCheckResult {
 
 async function checkLlmsTxtLinksResolve(ctx: CheckContext): Promise<CheckResult> {
   const existsResult = ctx.previousResults.get('llms-txt-exists');
-  const discovered = (existsResult?.details?.discoveredFiles ?? []) as DiscoveredFile[];
+  const discovered = getLlmsTxtFilesForAnalysis(existsResult);
 
   if (discovered.length === 0) {
     return {
diff --git a/src/checks/content-discoverability/llms-txt-size.ts b/src/checks/content-discoverability/llms-txt-size.ts
index 8929149..93ac3dc 100644
--- a/src/checks/content-discoverability/llms-txt-size.ts
+++ b/src/checks/content-discoverability/llms-txt-size.ts
@@ -1,9 +1,10 @@
 import { registerCheck } from '../registry.js';
-import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js';
+import { getLlmsTxtFilesForAnalysis } from '../../helpers/llms-txt.js';
+import type { CheckContext, CheckResult } from '../../types.js';
 
 async function checkLlmsTxtSize(ctx: CheckContext): Promise<CheckResult> {
   const existsResult = ctx.previousResults.get('llms-txt-exists');
-  const discovered = (existsResult?.details?.discoveredFiles ?? []) as DiscoveredFile[];
+  const discovered = getLlmsTxtFilesForAnalysis(existsResult);
 
   if (discovered.length === 0) {
     return {
diff --git a/src/checks/content-discoverability/llms-txt-valid.ts b/src/checks/content-discoverability/llms-txt-valid.ts
index d919952..d53dea7 100644
--- a/src/checks/content-discoverability/llms-txt-valid.ts
+++ b/src/checks/content-discoverability/llms-txt-valid.ts
@@ -1,5 +1,6 @@
 import { registerCheck } from '../registry.js';
-import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js';
+import { getLlmsTxtFilesForAnalysis } from '../../helpers/llms-txt.js';
+import type { CheckContext, CheckResult } from '../../types.js';
 
 interface ValidationResult {
   url: string;
@@ -48,7 +49,7 @@ function validateLlmsTxt(content: string, url: string): ValidationResult {
 
 async function checkLlmsTxtValid(ctx: CheckContext): Promise<CheckResult> {
   const existsResult = ctx.previousResults.get('llms-txt-exists');
-  const discovered = (existsResult?.details?.discoveredFiles ?? []) as DiscoveredFile[];
+  const discovered = getLlmsTxtFilesForAnalysis(existsResult);
 
   if (discovered.length === 0) {
     return {
diff --git a/src/checks/observability/cache-header-hygiene.ts b/src/checks/observability/cache-header-hygiene.ts
index a8f9884..2a6f2b4 100644
--- a/src/checks/observability/cache-header-hygiene.ts
+++ b/src/checks/observability/cache-header-hygiene.ts
@@ -118,7 +118,9 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
   // Collect URLs to check: llms.txt files + sampled page URLs
   const urlsToCheck: string[] = [];
 
-  // llms.txt URLs
+  // llms.txt URLs — intentionally checks ALL discovered files (not just the
+  // canonical): each llms.txt location (apex + docs) is expected to have
+  // appropriate cache headers of its own.
   const existsResult = ctx.previousResults.get('llms-txt-exists');
   const discovered = (existsResult?.details?.discoveredFiles ?? []) as DiscoveredFile[];
   for (const file of discovered) {
diff --git a/src/cli/commands/check.ts b/src/cli/commands/check.ts
index 713dfbc..36cd7b2 100644
--- a/src/cli/commands/check.ts
+++ b/src/cli/commands/check.ts
@@ -42,6 +42,10 @@ export function registerCheckCommand(program: Command): void {
       '--canonical-origin <url>',
       'The production domain your content links to (for preview/staging testing)',
     )
+    .option(
+      '--llms-txt-url <url>',
+      'Explicit llms.txt URL to use as canonical (bypasses discovery heuristic)',
+    )
     .action(async (rawUrl: string | undefined, opts: Record<string, unknown>) => {
       // Load config: explicit path or auto-discover
       let config;
@@ -199,6 +203,24 @@ export function registerCheckCommand(program: Command): void {
         }
       }
 
+      let llmsTxtUrl: string | undefined;
+      const rawLlmsTxtUrl = (opts.llmsTxtUrl as string | undefined) ?? config?.options?.llmsTxtUrl;
+      if (rawLlmsTxtUrl) {
+        try {
+          llmsTxtUrl = new URL(normalizeUrl(rawLlmsTxtUrl)).toString();
+        } catch {
+          process.stderr.write(`Error: Invalid --llms-txt-url "${rawLlmsTxtUrl}".\n`);
+          process.exitCode = 1;
+          return;
+        }
+        const targetOrigin = new URL(url).origin;
+        if (new URL(llmsTxtUrl).origin !== targetOrigin) {
+          process.stderr.write(
+            `Warning: --llms-txt-url origin (${new URL(llmsTxtUrl).origin}) differs from target origin (${targetOrigin}). The flag will still be used as canonical.\n`,
+          );
+        }
+      }
+
       const report = await runChecks(url, {
         checkIds,
         skipCheckIds,
@@ -214,6 +236,7 @@ export function registerCheckCommand(program: Command): void {
         ...(preferredLocale && { preferredLocale }),
         ...(preferredVersion && { preferredVersion }),
         ...(canonicalOrigin && { canonicalOrigin }),
+        ...(llmsTxtUrl && { llmsTxtUrl }),
       });
 
       let output: string;
diff --git a/src/helpers/get-page-urls.ts b/src/helpers/get-page-urls.ts
index fb58206..1bdcb72 100644
--- a/src/helpers/get-page-urls.ts
+++ b/src/helpers/get-page-urls.ts
@@ -1,6 +1,7 @@
 import { extractMarkdownLinks } from '../checks/content-discoverability/llms-txt-valid.js';
 import { MAX_SITEMAP_URLS } from '../constants.js';
-import { isNonPageUrl, isMdUrl, toHtmlUrl } from './to-md-urls.js';
+import { getLlmsTxtFilesForAnalysis, selectCanonicalLlmsTxt } from './llms-txt.js';
+import { isMdUrl, toHtmlUrl } from './to-md-urls.js';
 import type { CheckContext, DiscoveredFile } from '../types.js';
 
 /**
@@ -38,7 +39,7 @@ export function parseSitemapUrls(xml: string): { urls: string[]; sitemapIndexUrl
 
 export async function getUrlsFromCachedLlmsTxt(ctx: CheckContext): Promise<string[]> {
   const existsResult = ctx.previousResults.get('llms-txt-exists');
-  const discovered = (existsResult?.details?.discoveredFiles ?? []) as DiscoveredFile[];
+  const discovered = getLlmsTxtFilesForAnalysis(existsResult);
 
   const urls = extractLinksFromLlmsTxtFiles(discovered);
   return walkAggregateLinks(ctx, urls);
@@ -75,43 +76,78 @@ function extractLinksFromLlmsTxtFiles(files: DiscoveredFile[]): string[] {
   return Array.from(urls);
 }
 
+/**
+ * Maximum depth of nested aggregate (.txt) files to follow when expanding
+ * an llms.txt index. Most sites use 1-2 levels (e.g. a top-level index that
+ * points to per-section indexes). 5 covers deeper trees like Alchemy's
+ * `/docs/llms.txt → /docs/chains/llms.txt → /docs/chains/{chain}/llms.txt`
+ * while still terminating on pathological cycles.
+ */
+const MAX_AGGREGATE_DEPTH = 5;
+
+/**
+ * Hard cap on the number of aggregate .txt files handled in a single walk, regardless of
+ * depth. Protects against very large sites (Alchemy has ~80 per-chain files) ballooning
+ * into hundreds of HTTP requests; 200 leaves headroom for the largest realistic case.
+ * NOTE(review): the walk loop compares this against the number of *queued* aggregates, so
+ * once more than 200 are queued it stops fetching altogether — confirm that is intended.
+ */
+const MAX_AGGREGATE_FILES = 200;
+
 /**
  * Identify .txt links that are likely aggregate/index files (progressive
- * disclosure pattern) and walk them one level deep to find page URLs.
+ * disclosure pattern) and recursively walk them to find page URLs.
  *
  * A link is considered walkable when it ends in .txt and is on the same
- * origin as the site being tested. This covers both sub-product llms.txt
- * files (Cloudflare) and aggregate content files (Supabase).
+ * origin as the site being tested. This covers:
+ *   - sub-product llms.txt files (Cloudflare)
+ *   - aggregate content files (Supabase)
+ *   - multi-level nested indexes (Alchemy: top → section → sub-section)
+ *
+ * Walking is bounded by `MAX_AGGREGATE_DEPTH` and `MAX_AGGREGATE_FILES`
+ * so a malformed or extremely large index can't make us fetch unboundedly.
+ * Each aggregate is fetched at most once even if referenced multiple times.
  */
 async function walkAggregateLinks(ctx: CheckContext, urls: string[]): Promise<string[]> {
   const pageUrls: string[] = [];
-  const aggregateUrls: string[] = [];
+  const visitedAggregates = new Set<string>();
+  const queue: Array<{ url: string; depth: number }> = [];
 
   const siteOrigin = ctx.effectiveOrigin ?? ctx.origin;
 
-  for (const url of urls) {
+  function isSameOrigin(parsed: URL): boolean {
+    return parsed.origin === ctx.origin || parsed.origin === siteOrigin;
+  }
+
+  /** Sort a discovered URL into the page bucket or the aggregate-walk queue. */
+  function classify(url: string, parentDepth: number): void {
     try {
       const parsed = new URL(url);
       if (/\.txt$/i.test(parsed.pathname)) {
-        // .txt files are either aggregate indexes to walk (same origin)
-        // or external resources to skip — never page URLs themselves
-        if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) {
-          aggregateUrls.push(url);
+        // Aggregates: same-origin, not yet visited, within depth budget.
+        // Cross-origin .txt links are external resources we don't control.
+        if (
+          isSameOrigin(parsed) &&
+          !visitedAggregates.has(url) &&
+          parentDepth + 1 <= MAX_AGGREGATE_DEPTH
+        ) {
+          visitedAggregates.add(url);
+          queue.push({ url, depth: parentDepth + 1 });
         }
-      } else if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) {
-        // Only include same-origin page URLs; cross-origin links are
-        // external resources the site owner doesn't control.
+      } else if (isSameOrigin(parsed)) {
         pageUrls.push(normalizePageUrl(url));
       }
     } catch {
+      // Unparseable URL: keep it so the caller's later filtering can decide.
       pageUrls.push(normalizePageUrl(url));
     }
   }
 
-  if (aggregateUrls.length === 0) return pageUrls;
+  // Seed from the canonical llms.txt's links (parent depth 0).
+  for (const url of urls) classify(url, 0);
 
-  // Fetch aggregate files and extract their links
-  for (const aggUrl of aggregateUrls) {
+  while (queue.length > 0 && visitedAggregates.size <= MAX_AGGREGATE_FILES) {
+    const { url: aggUrl, depth } = queue.shift()!;
     try {
       const response = await ctx.http.fetch(aggUrl);
       if (!response.ok) continue;
@@ -130,20 +166,7 @@ async function walkAggregateLinks(ctx: CheckContext, urls: string[]): Promise<st
       };
       const subUrls = extractLinksFromLlmsTxtFiles([subFile]);
 
-      for (const subUrl of subUrls) {
-        // Only keep same-origin page URLs (skip further .txt nesting)
-        try {
-          const parsed = new URL(subUrl);
-          if (
-            (parsed.origin === ctx.origin || parsed.origin === siteOrigin) &&
-            !isNonPageUrl(subUrl)
-          ) {
-            pageUrls.push(subUrl);
-          }
-        } catch {
-          // Skip malformed URLs
-        }
-      }
+      for (const subUrl of subUrls) classify(subUrl, depth);
     } catch {
       // Skip failed fetches
     }
@@ -155,12 +178,21 @@ async function walkAggregateLinks(ctx: CheckContext, urls: string[]): Promise<st
 /**
  * Directly fetch llms.txt candidate URLs and extract links.
  * Used when `llms-txt-exists` hasn't run (e.g. standalone check mode).
+ *
+ * Mirrors the canonical-selection logic in `llms-txt-exists` so that the same
+ * single source of truth drives sampling whether or not `llms-txt-exists` ran.
  */
 async function fetchLlmsTxtUrls(ctx: CheckContext): Promise<string[]> {
-  const candidates = new Set<string>();
-  candidates.add(`${ctx.baseUrl}/llms.txt`);
-  candidates.add(`${ctx.origin}/llms.txt`);
-  candidates.add(`${ctx.origin}/docs/llms.txt`);
+  const explicitUrl = ctx.options.llmsTxtUrl;
+  const candidates = explicitUrl
+    ? [explicitUrl]
+    : Array.from(
+        new Set([
+          `${ctx.baseUrl}/llms.txt`,
+          `${ctx.origin}/llms.txt`,
+          `${ctx.origin}/docs/llms.txt`,
+        ]),
+      );
 
   const discovered: DiscoveredFile[] = [];
   for (const url of candidates) {
@@ -184,7 +216,9 @@ async function fetchLlmsTxtUrls(ctx: CheckContext): Promise<string[]> {
     }
   }
 
-  const urls = extractLinksFromLlmsTxtFiles(discovered);
+  const canonical = selectCanonicalLlmsTxt(discovered, ctx.baseUrl);
+  const filesForAnalysis = canonical ? [canonical] : [];
+  const urls = extractLinksFromLlmsTxtFiles(filesForAnalysis);
   return walkAggregateLinks(ctx, urls);
 }
 
diff --git a/src/helpers/index.ts b/src/helpers/index.ts
index acdbc2d..4179f1c 100644
--- a/src/helpers/index.ts
+++ b/src/helpers/index.ts
@@ -8,6 +8,7 @@ export {
   parseSitemapDirectives,
 } from './get-page-urls.js';
 export type { PageUrlResult, SampledPages } from './get-page-urls.js';
+export { selectCanonicalLlmsTxt, getLlmsTxtFilesForAnalysis } from './llms-txt.js';
 export { toMdUrls, isNonPageUrl } from './to-md-urls.js';
 export { htmlToMarkdown } from './html-to-markdown.js';
 export { fetchPage } from './fetch-page.js';
diff --git a/src/helpers/llms-txt.ts b/src/helpers/llms-txt.ts
new file mode 100644
index 0000000..7570551
--- /dev/null
+++ b/src/helpers/llms-txt.ts
@@ -0,0 +1,104 @@
+import type { CheckResult, DiscoveredFile } from '../types.js';
+
+/**
+ * Get the directory portion of a URL's pathname (everything before the last `/`),
+ * without a trailing slash. Returns '' for root-level files and for unparseable URLs.
+ *
+ *   /llms.txt          -> ''
+ *   /docs/llms.txt     -> '/docs'
+ *   /docs/v1/llms.txt  -> '/docs/v1'
+ */
+function getFileDir(fileUrl: string): string {
+  try {
+    const path = new URL(fileUrl).pathname;
+    const dir = path.replace(/\/[^/]*$/, '');
+    return dir === '/' ? '' : dir;
+  } catch {
+    return '';
+  }
+}
+
+/**
+ * Returns true when the file's directory is a (non-strict) path-segment prefix
+ * of the baseUrl's pathname AND the origins match. Files on a different origin
+ * (e.g. discovered via cross-host redirect fallback) or unparseable URLs never qualify.
+ */
+function fileDirIsPrefixOfBase(fileUrl: string, baseUrl: string): boolean {
+  try {
+    const f = new URL(fileUrl);
+    const b = new URL(baseUrl);
+    if (f.origin !== b.origin) return false;
+    const fileDir = getFileDir(fileUrl);
+    const basePath = b.pathname.replace(/\/$/, '');
+    if (fileDir === '') return true; // root-level file: prefix of every base path
+    return fileDir === basePath || basePath.startsWith(fileDir + '/'); // whole segments only
+  } catch {
+    return false;
+  }
+}
+
+function scoreCandidate(fileUrl: string, baseUrl: string): number {
+  // Returns 0 when the file's directory is not a prefix of baseUrl; otherwise
+  // 1 + the directory's string length, so deeper (more specific) dirs score higher.
+  // The +1 lets a root-level prefix match (dir '') still outrank a non-match (0).
+  if (!fileDirIsPrefixOfBase(fileUrl, baseUrl)) return 0;
+  return 1 + getFileDir(fileUrl).length;
+}
+
+/**
+ * Pick the canonical llms.txt from a set of discovered files (`undefined` if none).
+ *
+ * Priority:
+ *   1. The file whose directory is the longest prefix of the baseUrl's
+ *      pathname (most specific to what the user passed).
+ *   2. Files whose directory is *not* a prefix of the baseUrl rank below
+ *      any prefix-matching file (they cover different parts of the site).
+ *   3. Ties keep the earliest entry in `discovered` — i.e. the order returned by
+ *      the candidate discovery, which already lists baseUrl > origin > docs.
+ *
+ * Examples (assuming both files exist):
+ *   baseUrl https://example.com/docs        -> /docs/llms.txt wins over /llms.txt
+ *   baseUrl https://example.com             -> /llms.txt wins over /docs/llms.txt
+ *   baseUrl https://example.com/docs/v1     -> /docs/v1/llms.txt > /docs/llms.txt > /llms.txt
+ */
+export function selectCanonicalLlmsTxt(
+  discovered: DiscoveredFile[],
+  baseUrl: string,
+): DiscoveredFile | undefined {
+  if (discovered.length === 0) return undefined;
+  if (discovered.length === 1) return discovered[0]; // single file: no scoring needed
+
+  let best = discovered[0];
+  let bestScore = scoreCandidate(best.url, baseUrl);
+
+  for (let i = 1; i < discovered.length; i++) {
+    const score = scoreCandidate(discovered[i].url, baseUrl);
+    if (score > bestScore) { // strict comparison: on a tie the earlier file stays
+      best = discovered[i];
+      bestScore = score;
+    }
+  }
+
+  return best;
+}
+
+/**
+ * Pick the discovered llms.txt file(s) that downstream checks should treat
+ * as the source of truth for sampling links, measuring size, validating
+ * structure, etc.
+ *
+ * When `details.canonicalLlmsTxt` is set on the `llms-txt-exists` result, only
+ * that file is returned. Falls back to the full `discoveredFiles` array for
+ * backward compatibility with callers (e.g. unit tests) that populate
+ * `previousResults` directly without going through `llms-txt-exists`.
+ *
+ * Returns an empty array when the result, its details, or both fields are missing.
+ */
+export function getLlmsTxtFilesForAnalysis(
+  existsResult: CheckResult | undefined,
+): DiscoveredFile[] {
+  if (!existsResult?.details) return [];
+  const canonical = existsResult.details.canonicalLlmsTxt as DiscoveredFile | undefined; // unchecked cast
+  if (canonical) return [canonical];
+  return (existsResult.details.discoveredFiles as DiscoveredFile[] | undefined) ?? [];
+}
diff --git a/src/types.ts b/src/types.ts
index a5e773d..ed03263 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -84,6 +84,16 @@ export interface CheckOptions {
   preferredVersion?: string;
   /** Canonical origin to rewrite in fetched content (for preview/staging testing). */
   canonicalOrigin?: string;
+  /**
+   * Explicit URL to use as the canonical llms.txt for downstream sampling and
+   * analysis. When set, the standard candidate-discovery heuristic is bypassed
+   * and only this URL is probed.
+   *
+   * Useful when a site has both an apex llms.txt (e.g. for marketing) and a
+   * docs-section llms.txt, and the heuristic would otherwise pick the wrong
+   * one.
+   */
+  llmsTxtUrl?: string;
 }
 
 export interface SizeThresholds {
diff --git a/test/integration/check-pipeline.test.ts b/test/integration/check-pipeline.test.ts
index 12cbb01..12bd04c 100644
--- a/test/integration/check-pipeline.test.ts
+++ b/test/integration/check-pipeline.test.ts
@@ -1086,6 +1086,101 @@ describe('check pipeline: markdown-content-parity reads from pageCache', () => {
   });
 });
 
+describe('check pipeline: canonical llms.txt selection', () => {
+  it('docs llms.txt drives downstream sampling when both apex and docs exist', async () => {
+    // Mirrors the scenario from issue #53: the apex llms.txt lists 20 marketing
+    // (blog) links, while the docs llms.txt lists a single documentation link.
+    // Because the target URL is under /docs, the docs llms.txt must be chosen as
+    // canonical and downstream checks must use its links — not the apex's.
+    const apexContent = `# Marketing\n\n> Apex marketing page.\n\n## Links\n\n${Array.from(
+      { length: 20 },
+      (_, i) => `- [Marketing ${i}](http://canon-pipe.local/blog/post-${i}): Blog post ${i}`,
+    ).join('\n')}\n`;
+    const docsContent =
+      '# Docs\n\n> Docs index.\n\n## Links\n\n- [Guide](http://canon-pipe.local/docs/guide): Guide\n';
+
+    server.use(
+      http.get('http://canon-pipe.local/llms.txt', () => HttpResponse.text(apexContent)),
+      http.get('http://canon-pipe.local/docs/llms.txt', () => HttpResponse.text(docsContent)),
+      http.get(
+        'http://canon-pipe.local/docs/guide',
+        () =>
+          new HttpResponse('<html><body><h1>Guide</h1></body></html>', {
+            status: 200,
+            headers: { 'Content-Type': 'text/html' },
+          }),
+      ),
+      http.head(
+        'http://canon-pipe.local/docs/guide',
+        () => new HttpResponse(null, { status: 200 }),
+      ),
+    );
+    mockSitemapNotFound(server, 'http://canon-pipe.local/docs');
+
+    const report = await runChecks('http://canon-pipe.local/docs', {
+      checkIds: ['llms-txt-exists', 'llms-txt-size', 'llms-txt-valid', 'llms-txt-links-resolve'],
+      requestDelay: 0,
+    });
+
+    const existsResult = report.results.find((r) => r.id === 'llms-txt-exists')!;
+    const sizeResult = report.results.find((r) => r.id === 'llms-txt-size')!;
+    const validResult = report.results.find((r) => r.id === 'llms-txt-valid')!;
+    const resolveResult = report.results.find((r) => r.id === 'llms-txt-links-resolve')!;
+
+    // llms-txt-exists passes and reports the docs file as the canonical URL
+    expect(existsResult.status).toBe('pass');
+    expect(existsResult.details?.canonicalUrl).toBe('http://canon-pipe.local/docs/llms.txt');
+
+    // Downstream checks should report on the docs file only, not the apex.
+    // size: exactly one file is measured and it is the docs llms.txt, so the
+    // much larger apex file (20 links) cannot influence the result.
+    expect(sizeResult.status).toBe('pass');
+    const sizes = sizeResult.details?.sizes as Array<{ url: string; characters: number }>;
+    expect(sizes).toHaveLength(1);
+    expect(sizes[0].url).toBe('http://canon-pipe.local/docs/llms.txt');
+
+    // valid: the docs file has an H1, a blockquote, a section heading and a link
+    expect(validResult.status).toBe('pass');
+
+    // links-resolve: only tests the single docs link, not the 20 marketing links
+    expect(resolveResult.status).toBe('pass');
+    expect(resolveResult.details?.totalLinks).toBe(1);
+  });
+
+  it('--llms-txt-url forces a specific file even when others would be discovered', async () => {
+    const apexContent =
+      '# Apex\n\n> Apex.\n\n## Links\n\n- [Blog](http://override-pipe.local/blog): Blog\n';
+    const explicitContent =
+      '# Explicit\n\n> Explicit.\n\n## Links\n\n- [Guide](http://override-pipe.local/docs/guide): Guide\n';
+
+    server.use(
+      http.get('http://override-pipe.local/llms.txt', () => HttpResponse.text(apexContent)),
+      http.get('http://override-pipe.local/custom/llms.txt', () =>
+        HttpResponse.text(explicitContent),
+      ),
+      http.get(
+        'http://override-pipe.local/docs/llms.txt',
+        () => new HttpResponse(null, { status: 404 }),
+      ),
+    );
+
+    const report = await runChecks('http://override-pipe.local', {
+      checkIds: ['llms-txt-exists', 'llms-txt-valid'],
+      requestDelay: 0,
+      llmsTxtUrl: 'http://override-pipe.local/custom/llms.txt',
+    });
+
+    const existsResult = report.results.find((r) => r.id === 'llms-txt-exists')!;
+    expect(existsResult.details?.canonicalUrl).toBe('http://override-pipe.local/custom/llms.txt');
+
+    const validResult = report.results.find((r) => r.id === 'llms-txt-valid')!;
+    const validations = validResult.details?.validations as Array<{ url: string }>;
+    // Only the explicit file is validated; the served apex file must be ignored
+    expect(validations).toHaveLength(1);
+    expect(validations[0].url).toBe('http://override-pipe.local/custom/llms.txt');
+  });
+});
+
 describe('check pipeline: effectiveOrigin propagation', () => {
   it('llms-txt-exists sets effectiveOrigin which llms-txt-freshness uses', async () => {
     // llms.txt redirects cross-host; sitemap lives at the redirected host
diff --git a/test/unit/checks/llms-txt-exists.test.ts b/test/unit/checks/llms-txt-exists.test.ts
index b68b217..65890ac 100644
--- a/test/unit/checks/llms-txt-exists.test.ts
+++ b/test/unit/checks/llms-txt-exists.test.ts
@@ -233,4 +233,109 @@ describe('llms-txt-exists', () => {
     expect(report.results[0].details?.rateLimited).toBe(1);
     expect(report.results[0].message).toContain('rate-limited (HTTP 429)');
   });
+
+  describe('canonical selection', () => {
+    const APEX_LLMS_TXT = `# Apex marketing\n\n> Apex.\n\n## Links\n\n- [Blog](http://canon.local/blog/post): Blog\n`;
+    const DOCS_LLMS_TXT = `# Docs\n\n> Docs index.\n\n## Links\n\n- [Guide](http://canon.local/docs/guide): Guide\n`;
+
+    it('picks /docs/llms.txt as canonical when baseUrl is the docs path', async () => {
+      server.use(
+        http.get('http://canon.local/llms.txt', () => HttpResponse.text(APEX_LLMS_TXT)),
+        http.get('http://canon.local/docs/llms.txt', () => HttpResponse.text(DOCS_LLMS_TXT)),
+      );
+
+      const report = await runChecks('http://canon.local/docs', {
+        checkIds: ['llms-txt-exists'],
+        requestDelay: 0,
+      });
+
+      expect(report.results[0].status).toBe('pass');
+      expect(report.results[0].details?.canonicalUrl).toBe('http://canon.local/docs/llms.txt');
+      expect(report.results[0].details?.canonicalSource).toBe('heuristic'); // two files found, no override given
+      expect(report.results[0].message).toContain('using http://canon.local/docs/llms.txt');
+    });
+
+    it('picks the apex llms.txt as canonical when baseUrl is the origin', async () => {
+      server.use(
+        http.get('http://canon-apex.local/llms.txt', () => HttpResponse.text(APEX_LLMS_TXT)),
+        http.get('http://canon-apex.local/docs/llms.txt', () => HttpResponse.text(DOCS_LLMS_TXT)),
+      );
+
+      const report = await runChecks('http://canon-apex.local', {
+        checkIds: ['llms-txt-exists'],
+        requestDelay: 0,
+      });
+
+      expect(report.results[0].status).toBe('pass');
+      expect(report.results[0].details?.canonicalUrl).toBe('http://canon-apex.local/llms.txt');
+    });
+
+    it('omits canonicalSource when only one file is discovered', async () => {
+      server.use(
+        http.get('http://canon-single.local/llms.txt', () => HttpResponse.text(APEX_LLMS_TXT)),
+        http.get(
+          'http://canon-single.local/docs/llms.txt',
+          () => new HttpResponse(null, { status: 404 }), // the docs-level file is absent
+        ),
+      );
+
+      const report = await runChecks('http://canon-single.local', {
+        checkIds: ['llms-txt-exists'],
+        requestDelay: 0,
+      });
+
+      expect(report.results[0].status).toBe('pass');
+      expect(report.results[0].details?.canonicalUrl).toBe('http://canon-single.local/llms.txt');
+      expect(report.results[0].details?.canonicalSource).toBeUndefined(); // a single file leaves nothing to choose between
+      expect(report.results[0].message).toBe(
+        'llms.txt found at http://canon-single.local/llms.txt',
+      );
+    });
+  });
+
+  describe('--llms-txt-url override', () => {
+    const VALID = `# Override\n\n> Override docs.\n\n## Links\n\n- [Page](http://override.local/x): X\n`;
+
+    it('probes only the explicit URL and uses it as canonical', async () => {
+      server.use(
+        // Only the explicit URL gets a handler: with the override set, the usual
+        // /llms.txt and /docs/llms.txt candidates are expected not to be probed.
+        http.get('http://override.local/custom/llms.txt', () => HttpResponse.text(VALID)),
+      );
+
+      const report = await runChecks('http://override.local', {
+        checkIds: ['llms-txt-exists'],
+        requestDelay: 0,
+        llmsTxtUrl: 'http://override.local/custom/llms.txt',
+      });
+
+      expect(report.results[0].status).toBe('pass');
+      expect(report.results[0].details?.canonicalUrl).toBe('http://override.local/custom/llms.txt');
+      expect(report.results[0].details?.canonicalSource).toBe('explicit');
+      expect(report.results[0].message).toContain('specified via --llms-txt-url');
+      // candidateUrls should only include the explicit URL
+      const candidates = report.results[0].details?.candidateUrls as Array<{ url: string }>;
+      expect(candidates).toHaveLength(1);
+      expect(candidates[0].url).toBe('http://override.local/custom/llms.txt');
+    });
+
+    it('reports an explicit-URL-aware failure when the override 404s', async () => {
+      server.use(
+        http.get(
+          'http://override-missing.local/custom/llms.txt',
+          () => new HttpResponse(null, { status: 404 }),
+        ),
+      );
+
+      const report = await runChecks('http://override-missing.local', {
+        checkIds: ['llms-txt-exists'],
+        requestDelay: 0,
+        llmsTxtUrl: 'http://override-missing.local/custom/llms.txt',
+      });
+
+      expect(report.results[0].status).toBe('fail'); // a 404 on the explicit URL fails the check
+      expect(report.results[0].message).toContain('--llms-txt-url'); // message names the flag …
+      expect(report.results[0].message).toContain('http://override-missing.local/custom/llms.txt'); // … and the URL
+    });
+  });
 });
diff --git a/test/unit/cli/check-command.test.ts b/test/unit/cli/check-command.test.ts
index 1c43d7e..00b8858 100644
--- a/test/unit/cli/check-command.test.ts
+++ b/test/unit/cli/check-command.test.ts
@@ -576,6 +576,106 @@ describe('check command config integration', () => {
     stderrSpy.mockRestore();
   });
 
+  it('accepts --llms-txt-url and uses it as canonical', async () => {
+    const customLlmsTxt = `# Custom\n\n> Custom docs.\n\n## Links\n\n- [Page](http://cmd-llms-url.local/x): X\n`;
+    server.use(
+      // No handler for the default /llms.txt candidate is registered: with
+      // --llms-txt-url set, the explicit URL should be the only thing probed.
+      http.get('http://cmd-llms-url.local/custom/llms.txt', () => HttpResponse.text(customLlmsTxt)),
+    );
+
+    const writeSpy = vi.spyOn(process.stdout, 'write').mockImplementation(() => true);
+
+    const { run } = await import('../../../src/cli/index.js');
+    await run([
+      'node',
+      'afdocs',
+      'check',
+      'http://cmd-llms-url.local',
+      '--checks',
+      'llms-txt-exists',
+      '--format',
+      'json',
+      '--llms-txt-url',
+      'http://cmd-llms-url.local/custom/llms.txt',
+      '--request-delay',
+      '0',
+    ]);
+    await new Promise((r) => setTimeout(r, 100)); // NOTE(review): presumably lets async CLI output settle — confirm
+
+    const output = writeSpy.mock.calls.map((c) => c[0]).join(''); // everything written to stdout
+    const parsed = JSON.parse(output.trim());
+    expect(parsed.results[0].status).toBe('pass');
+    expect(parsed.results[0].details.canonicalUrl).toBe(
+      'http://cmd-llms-url.local/custom/llms.txt',
+    );
+    expect(parsed.results[0].details.canonicalSource).toBe('explicit');
+
+    writeSpy.mockRestore();
+  });
+
+  it('rejects invalid --llms-txt-url', async () => {
+    const stderrSpy = vi.spyOn(process.stderr, 'write').mockImplementation(() => true);
+
+    const { run } = await import('../../../src/cli/index.js');
+    await run([
+      'node',
+      'afdocs',
+      'check',
+      'http://cmd-llms-url-bad.local',
+      '--llms-txt-url',
+      ':::not-a-url:::', // deliberately malformed value for the flag
+      '--request-delay',
+      '0',
+    ]);
+    await new Promise((r) => setTimeout(r, 100));
+
+    const output = stderrSpy.mock.calls.map((c) => c[0]).join(''); // everything written to stderr
+    expect(output).toContain('Invalid --llms-txt-url');
+    expect(process.exitCode).toBe(1); // NOTE(review): exitCode is not reset within this test — confirm a shared hook does so
+
+    stderrSpy.mockRestore();
+  });
+
+  it('warns when --llms-txt-url origin differs from target origin', async () => {
+    const customLlmsTxt = `# Other\n\n> Other.\n\n## Links\n\n- [P](http://other.local/x): X\n`;
+    server.use(
+      http.get('http://other.local/custom/llms.txt', () => HttpResponse.text(customLlmsTxt)), // host differs from the target
+    );
+
+    const stdoutSpy = vi.spyOn(process.stdout, 'write').mockImplementation(() => true);
+    const stderrSpy = vi.spyOn(process.stderr, 'write').mockImplementation(() => true);
+
+    const { run } = await import('../../../src/cli/index.js');
+    await run([
+      'node',
+      'afdocs',
+      'check',
+      'http://cmd-llms-url-cross.local', // target origin
+      '--checks',
+      'llms-txt-exists',
+      '--format',
+      'json',
+      '--llms-txt-url',
+      'http://other.local/custom/llms.txt', // different origin, so a warning is expected
+      '--request-delay',
+      '0',
+    ]);
+    await new Promise((r) => setTimeout(r, 100));
+
+    const stderr = stderrSpy.mock.calls.map((c) => c[0]).join('');
+    expect(stderr).toContain('--llms-txt-url origin');
+    expect(stderr).toContain('differs from target origin');
+
+    // The mismatch is only a warning: the check still runs and uses the explicit URL
+    const stdout = stdoutSpy.mock.calls.map((c) => c[0]).join('');
+    const parsed = JSON.parse(stdout.trim());
+    expect(parsed.results[0].details.canonicalUrl).toBe('http://other.local/custom/llms.txt');
+
+    stdoutSpy.mockRestore();
+    stderrSpy.mockRestore();
+  });
+
   it('infers base URL from config pages when url field is omitted', async () => {
     server.use(
       http.get('http://cfg-infer.local/llms.txt', () => HttpResponse.text(VALID_LLMS_TXT)),
diff --git a/test/unit/helpers/get-page-urls.test.ts b/test/unit/helpers/get-page-urls.test.ts
index 3726bae..7703c47 100644
--- a/test/unit/helpers/get-page-urls.test.ts
+++ b/test/unit/helpers/get-page-urls.test.ts
@@ -1297,6 +1297,75 @@ describe('getPageUrls', () => {
     expect(result.urls).toEqual(['http://walk-empty.local/docs/page']);
   });
 
+  it('walks nested aggregate .txt files recursively (Alchemy pattern)', async () => {
+    // Three-level nested structure: root index → chains index → per-chain index
+    // with leaf pages. Modeled on Alchemy's docs layout (paths here omit /docs):
+    //   root llms.txt → /chains/llms.txt → /chains/{ethereum,solana}/llms.txt
+    const rootContent = `# Docs\n- [Chains](http://nested-walk.local/chains/llms.txt)\n`;
+    const chainsContent = `# Chains\n- [Ethereum](http://nested-walk.local/chains/ethereum/llms.txt)\n- [Solana](http://nested-walk.local/chains/solana/llms.txt)\n`;
+    const ethContent = `# Ethereum\n- [eth_call](http://nested-walk.local/chains/ethereum/eth-call.md): Call\n- [eth_chainId](http://nested-walk.local/chains/ethereum/eth-chain-id.md): Chain ID\n`;
+    const solContent = `# Solana\n- [getBalance](http://nested-walk.local/chains/solana/get-balance.md): Balance\n`;
+
+    server.use(
+      http.get('http://nested-walk.local/chains/llms.txt', () => HttpResponse.text(chainsContent)),
+      http.get('http://nested-walk.local/chains/ethereum/llms.txt', () =>
+        HttpResponse.text(ethContent),
+      ),
+      http.get('http://nested-walk.local/chains/solana/llms.txt', () =>
+        HttpResponse.text(solContent),
+      ),
+    );
+
+    const ctx = makeCtx('http://nested-walk.local', rootContent);
+    const result = await getPageUrls(ctx);
+    expect(result.urls).toContain('http://nested-walk.local/chains/ethereum/eth-call'); // expected without the .md suffix
+    expect(result.urls).toContain('http://nested-walk.local/chains/ethereum/eth-chain-id');
+    expect(result.urls).toContain('http://nested-walk.local/chains/solana/get-balance');
+    expect(result.urls).toHaveLength(3); // only the three leaf pages, no aggregate URLs
+  });
+
+  it('does not refetch the same aggregate when referenced from multiple parents', async () => {
+    let sharedFetches = 0; // number of GETs served for /shared/llms.txt
+    const rootContent = `# Docs\n- [SectionA](http://nested-dup.local/a/llms.txt)\n- [SectionB](http://nested-dup.local/b/llms.txt)\n`;
+    const aContent = `# A\n- [Shared](http://nested-dup.local/shared/llms.txt)\n- [PageA](http://nested-dup.local/a/page.md)\n`;
+    const bContent = `# B\n- [Shared](http://nested-dup.local/shared/llms.txt)\n- [PageB](http://nested-dup.local/b/page.md)\n`;
+    const sharedContent = `# Shared\n- [SharedPage](http://nested-dup.local/shared/page.md)\n`;
+
+    server.use(
+      http.get('http://nested-dup.local/a/llms.txt', () => HttpResponse.text(aContent)),
+      http.get('http://nested-dup.local/b/llms.txt', () => HttpResponse.text(bContent)),
+      http.get('http://nested-dup.local/shared/llms.txt', () => {
+        sharedFetches++;
+        return HttpResponse.text(sharedContent);
+      }),
+    );
+
+    const ctx = makeCtx('http://nested-dup.local', rootContent);
+    const result = await getPageUrls(ctx);
+
+    // The shared aggregate is linked from both A and B but must be fetched only once.
+    expect(sharedFetches).toBe(1);
+    expect(result.urls).toContain('http://nested-dup.local/a/page');
+    expect(result.urls).toContain('http://nested-dup.local/b/page');
+    expect(result.urls).toContain('http://nested-dup.local/shared/page');
+  });
+
+  it('terminates on cycles in the aggregate graph', async () => {
+    const rootContent = `# Docs\n- [A](http://nested-cycle.local/a/llms.txt)\n`;
+    // A links to B and B links back to A: a → b → a → … would never end without cycle detection.
+    const aContent = `# A\n- [B](http://nested-cycle.local/b/llms.txt)\n- [Page](http://nested-cycle.local/a/page.md)\n`;
+    const bContent = `# B\n- [A](http://nested-cycle.local/a/llms.txt)\n`;
+
+    server.use(
+      http.get('http://nested-cycle.local/a/llms.txt', () => HttpResponse.text(aContent)),
+      http.get('http://nested-cycle.local/b/llms.txt', () => HttpResponse.text(bContent)),
+    );
+
+    const ctx = makeCtx('http://nested-cycle.local', rootContent);
+    const result = await getPageUrls(ctx); // resolving at all means the walk terminated
+    expect(result.urls).toContain('http://nested-cycle.local/a/page');
+  });
+
   // ── .md URL normalization ──
 
   it('normalizes .md URLs from llms.txt to HTML equivalents', async () => {
diff --git a/test/unit/helpers/llms-txt.test.ts b/test/unit/helpers/llms-txt.test.ts
new file mode 100644
index 0000000..a697fa9
--- /dev/null
+++ b/test/unit/helpers/llms-txt.test.ts
@@ -0,0 +1,120 @@
+import { describe, it, expect } from 'vitest';
+import {
+  selectCanonicalLlmsTxt,
+  getLlmsTxtFilesForAnalysis,
+} from '../../../src/helpers/llms-txt.js';
+import type { CheckResult, DiscoveredFile } from '../../../src/types.js';
+
+function file(url: string, content = '# stub'): DiscoveredFile {
+  return { url, content, status: 200, redirected: false }; // test stub: status 200, not redirected
+}
+
+describe('selectCanonicalLlmsTxt', () => {
+  it('returns undefined for empty input', () => {
+    expect(selectCanonicalLlmsTxt([], 'https://example.com')).toBeUndefined();
+  });
+
+  it('returns the only file when one is provided', () => {
+    const f = file('https://example.com/llms.txt');
+    expect(selectCanonicalLlmsTxt([f], 'https://example.com')).toBe(f); // toBe: the very same object
+  });
+
+  it('prefers /docs/llms.txt over apex when baseUrl is /docs', () => {
+    const apex = file('https://example.com/llms.txt', '# Apex');
+    const docs = file('https://example.com/docs/llms.txt', '# Docs');
+    const picked = selectCanonicalLlmsTxt([apex, docs], 'https://example.com/docs'); // apex is listed first yet must lose
+    expect(picked).toBe(docs);
+  });
+
+  it('prefers apex over /docs/llms.txt when baseUrl is the origin', () => {
+    const apex = file('https://example.com/llms.txt', '# Apex');
+    const docs = file('https://example.com/docs/llms.txt', '# Docs');
+    const picked = selectCanonicalLlmsTxt([apex, docs], 'https://example.com');
+    expect(picked).toBe(apex);
+  });
+
+  it('prefers the deepest matching prefix when several files cover baseUrl', () => {
+    const apex = file('https://example.com/llms.txt');
+    const docs = file('https://example.com/docs/llms.txt');
+    const v1 = file('https://example.com/docs/v1/llms.txt');
+    const picked = selectCanonicalLlmsTxt([apex, docs, v1], 'https://example.com/docs/v1');
+    expect(picked).toBe(v1);
+  });
+
+  it('falls back to /docs/llms.txt when /docs/v1/llms.txt is missing', () => {
+    const apex = file('https://example.com/llms.txt');
+    const docs = file('https://example.com/docs/llms.txt');
+    const picked = selectCanonicalLlmsTxt([apex, docs], 'https://example.com/docs/v1');
+    expect(picked).toBe(docs);
+  });
+
+  it('ignores files on a different origin (treats them as non-prefix matches)', () => {
+    const sameOrigin = file('https://example.com/llms.txt');
+    const otherOrigin = file('https://other.com/docs/llms.txt');
+    const picked = selectCanonicalLlmsTxt([otherOrigin, sameOrigin], 'https://example.com/docs'); // other origin listed first on purpose
+    expect(picked).toBe(sameOrigin);
+  });
+
+  it('handles trailing slashes on baseUrl gracefully', () => {
+    const apex = file('https://example.com/llms.txt');
+    const docs = file('https://example.com/docs/llms.txt');
+    const picked = selectCanonicalLlmsTxt([apex, docs], 'https://example.com/docs/');
+    expect(picked).toBe(docs);
+  });
+
+  it('does not pick /docs/llms.txt when baseUrl is /api (different subtree)', () => {
+    const apex = file('https://example.com/llms.txt');
+    const docs = file('https://example.com/docs/llms.txt');
+    const picked = selectCanonicalLlmsTxt([apex, docs], 'https://example.com/api');
+    expect(picked).toBe(apex);
+  });
+
+  it('falls back to non-prefix file when nothing matches', () => {
+    const apiFile = file('https://example.com/api/llms.txt');
+    const picked = selectCanonicalLlmsTxt([apiFile], 'https://example.com/docs'); // NOTE(review): single-file input takes the length-1 early return; scoring never runs
+    expect(picked).toBe(apiFile);
+  });
+});
+
+describe('getLlmsTxtFilesForAnalysis', () => {
+  function makeResult(details: Record<string, unknown>): CheckResult {
+    return {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'ok',
+      details, // the only field that varies between cases
+    };
+  }
+
+  it('returns empty array when result is undefined', () => {
+    expect(getLlmsTxtFilesForAnalysis(undefined)).toEqual([]);
+  });
+
+  it('returns empty array when result has no details', () => {
+    const res: CheckResult = {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'fail',
+      message: 'no', // note: no `details` property at all
+    };
+    expect(getLlmsTxtFilesForAnalysis(res)).toEqual([]);
+  });
+
+  it('returns canonical when present', () => {
+    const canonical = file('https://example.com/docs/llms.txt');
+    const other = file('https://example.com/llms.txt');
+    const res = makeResult({
+      canonicalLlmsTxt: canonical,
+      discoveredFiles: [other, canonical], // must be ignored because canonicalLlmsTxt is set
+    });
+    expect(getLlmsTxtFilesForAnalysis(res)).toEqual([canonical]);
+  });
+
+  it('falls back to discoveredFiles when no canonical (legacy callers)', () => {
+    const a = file('https://example.com/llms.txt');
+    const b = file('https://example.com/docs/llms.txt');
+    const res = makeResult({ discoveredFiles: [a, b] }); // no canonicalLlmsTxt key
+    expect(getLlmsTxtFilesForAnalysis(res)).toEqual([a, b]); // whole list, original order
+  });
+});