diff --git a/docs/checks/content-discoverability.md b/docs/checks/content-discoverability.md index 546b5b3..85534e9 100644 --- a/docs/checks/content-discoverability.md +++ b/docs/checks/content-discoverability.md @@ -41,6 +41,21 @@ If any of these redirect cross-host (e.g., `example.com` redirects to `docs.exam If your `llms.txt` lives at a location not covered by these candidates, AFDocs won't find it. You can either move it to one of the candidate locations or [open an issue](https://github.com/agent-ecosystem/afdocs/issues) to suggest expanding the candidate list. +### Canonical selection + +When more than one candidate returns a file (e.g. an apex `llms.txt` for the marketing site _and_ a `/docs/llms.txt` for the docs section), AFDocs picks one as **canonical**. The canonical file is the single source of truth for downstream checks: link sampling, size, validation, freshness, and link-resolution all operate on it alone. Other discovered files still appear in `details.discoveredFiles` for visibility, and `cache-header-hygiene` still verifies headers on every llms.txt found. + +The selection rule is _most-specific-to-the-baseUrl wins_. AFDocs picks the file whose directory is the longest prefix of the URL you passed. For example: + +| You passed | Files found | Canonical | +| --------------------- | ------------------------------------------- | ------------------------------- | +| `example.com/docs` | `/llms.txt` and `/docs/llms.txt` | `/docs/llms.txt` | +| `example.com` | `/llms.txt` and `/docs/llms.txt` | `/llms.txt` | +| `example.com/docs/v1` | `/llms.txt`, `/docs/llms.txt`, `/docs/v1/…` | `/docs/v1/llms.txt` | +| `example.com/docs/v1` | `/llms.txt` and `/docs/llms.txt` | `/docs/llms.txt` (longer match) | + +Use `--llms-txt-url` (or the `llmsTxtUrl` config option) to override the heuristic when the canonical lives at a non-standard path. See the [CLI reference](/reference/cli#llms-txt-selection) for details. 
+ ### How to fix **If this check fails**, create an `llms.txt` at one of the candidate locations above. The file should contain an H1 title, a blockquote summary, and markdown links to your key documentation pages. See the [llms.txt specification](https://llmstxt.org/) for the format. diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 7938d9a..c9a9190 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -166,6 +166,27 @@ Use `--canonical-origin` when your site's URLs in `sitemap.xml` and `llms.txt` d afdocs check https://preview-xyz-example.app/docs --canonical-origin https://example.com ``` +### llms.txt selection + +| Flag | Default | Description | +| ---------------------- | ------- | ------------------------------------------------------------------------ | +| `--llms-txt-url <url>` | | Explicit llms.txt URL to use as canonical (bypasses discovery heuristic) | + +By default, `afdocs` discovers llms.txt at three candidate locations: `{baseUrl}/llms.txt`, `{origin}/llms.txt`, and `{origin}/docs/llms.txt`. When more than one of these returns a file, the most-specific one — the one whose directory is the longest prefix of the URL you passed — is used as canonical. Downstream checks (size, validity, link sampling) all operate on the canonical file. + +For most sites this heuristic does the right thing. Use `--llms-txt-url` to override it when: + +- The canonical llms.txt lives at a non-standard path (e.g. `/docs/v3/llms.txt`) +- A monorepo serves multiple docs surfaces at one origin and you want to score one specifically +- You want to verify a specific file before publishing + +```bash +# Score a docs section explicitly, ignoring an apex /llms.txt +afdocs check https://example.com/docs --llms-txt-url https://example.com/docs/llms.txt +``` + +When the override is set, `llms-txt-exists` probes only that URL and reports failure if it isn't reachable. The cross-host redirect fallback is skipped. 
+ ### Size thresholds | Flag | Default | Description | diff --git a/docs/reference/config-file.md b/docs/reference/config-file.md index ddde547..f6d2919 100644 --- a/docs/reference/config-file.md +++ b/docs/reference/config-file.md @@ -32,6 +32,7 @@ options: preferredLocale: en preferredVersion: v3 canonicalOrigin: https://example.com + llmsTxtUrl: https://example.com/docs/llms.txt thresholds: pass: 50000 fail: 100000 @@ -70,18 +71,19 @@ skipChecks: Override default runner options. All fields are optional: -| Field | Default | Description | -| ------------------ | ----------- | ---------------------------------------------------------- | -| `maxLinksToTest` | `50` | Maximum number of pages to sample | -| `samplingStrategy` | `random` | `random`, `deterministic`, `curated`, or `none` | -| `maxConcurrency` | `3` | Maximum concurrent HTTP requests | -| `requestDelay` | `200` | Delay between requests in milliseconds | -| `requestTimeout` | `30000` | Timeout for individual HTTP requests in milliseconds | -| `preferredLocale` | auto-detect | Preferred locale for URL discovery (e.g. `en`, `fr`, `ja`) | -| `preferredVersion` | auto-detect | Preferred version for URL discovery (e.g. 
`v3`, `2.x`) | -| `canonicalOrigin` | | The production domain your content links to | -| `thresholds.pass` | `50000` | Page size pass threshold in characters | -| `thresholds.fail` | `100000` | Page size fail threshold in characters | +| Field | Default | Description | +| ------------------ | ----------- | ------------------------------------------------------------------------------------------- | +| `maxLinksToTest` | `50` | Maximum number of pages to sample | +| `samplingStrategy` | `random` | `random`, `deterministic`, `curated`, or `none` | +| `maxConcurrency` | `3` | Maximum concurrent HTTP requests | +| `requestDelay` | `200` | Delay between requests in milliseconds | +| `requestTimeout` | `30000` | Timeout for individual HTTP requests in milliseconds | +| `preferredLocale` | auto-detect | Preferred locale for URL discovery (e.g. `en`, `fr`, `ja`) | +| `preferredVersion` | auto-detect | Preferred version for URL discovery (e.g. `v3`, `2.x`) | +| `canonicalOrigin` | | The production domain your content links to | +| `llmsTxtUrl` | | Explicit llms.txt URL to use as canonical (overrides the discovery heuristic; see CLI docs) | +| `thresholds.pass` | `50000` | Page size pass threshold in characters | +| `thresholds.fail` | `100000` | Page size fail threshold in characters | ### `pages` (optional) diff --git a/src/checks/content-discoverability/llms-txt-exists.ts b/src/checks/content-discoverability/llms-txt-exists.ts index c6b4104..fc45816 100644 --- a/src/checks/content-discoverability/llms-txt-exists.ts +++ b/src/checks/content-discoverability/llms-txt-exists.ts @@ -1,4 +1,5 @@ import { registerCheck } from '../registry.js'; +import { selectCanonicalLlmsTxt } from '../../helpers/llms-txt.js'; import { isCrossHostRedirect } from '../../helpers/to-md-urls.js'; import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js'; @@ -16,7 +17,8 @@ function getCandidateUrls(baseUrl: string, origin: string): string[] { } async function 
checkLlmsTxtExists(ctx: CheckContext): Promise { - const candidates = getCandidateUrls(ctx.baseUrl, ctx.origin); + const explicitUrl = ctx.options.llmsTxtUrl; + const candidates = explicitUrl ? [explicitUrl] : getCandidateUrls(ctx.baseUrl, ctx.origin); const discovered: DiscoveredFile[] = []; const checkedUrls: Array<{ url: string; @@ -68,8 +70,11 @@ async function checkLlmsTxtExists(ctx: CheckContext): Promise { // When no llms.txt found, check if any candidates redirected cross-host. // If so, try {redirected_origin}/llms.txt as a fallback. + // Skip the fallback when the user explicitly specified an llmsTxtUrl — + // they told us exactly where to look, so silently probing other origins + // would defeat the purpose of the override. const redirectedOrigins: string[] = []; - if (discovered.length === 0) { + if (discovered.length === 0 && !explicitUrl) { const checkedSet = new Set(checkedUrls.map((u) => u.url)); const seenOrigins = new Set(); for (const checked of checkedUrls) { @@ -134,6 +139,12 @@ async function checkLlmsTxtExists(ctx: CheckContext): Promise { (fetchErrors > 0 ? `; ${fetchErrors} failed to fetch` : '') + (rateLimited > 0 ? `; ${rateLimited} rate-limited (HTTP 429)` : ''); + // Pick the canonical llms.txt — the one downstream checks should use as the + // single source of truth for sampling links, measuring size, validating + // structure, etc. When multiple llms.txt files exist (apex + docs section), + // the heuristic prefers the most-specific one relative to the baseUrl. 
+ const canonical = selectCanonicalLlmsTxt(discovered, ctx.baseUrl); + // Store discovered files for downstream checks const details: Record = { candidateUrls: checkedUrls, @@ -142,6 +153,16 @@ async function checkLlmsTxtExists(ctx: CheckContext): Promise { rateLimited, }; + if (canonical) { + details.canonicalLlmsTxt = canonical; + details.canonicalUrl = canonical.url; + if (explicitUrl) { + details.canonicalSource = 'explicit'; + } else if (discovered.length > 1) { + details.canonicalSource = 'heuristic'; + } + } + if (redirectedOrigins.length > 0) { details.redirectedOrigins = redirectedOrigins; } @@ -174,11 +195,14 @@ async function checkLlmsTxtExists(ctx: CheckContext): Promise { redirectedOrigins.length > 0 ? `; candidates redirected cross-host to ${redirectedOrigins.join(', ')} (agents can't follow cross-host redirects)` : ''; + const message = explicitUrl + ? `No llms.txt found at the URL specified via --llms-txt-url (${explicitUrl})${redirectNote}${suffix}` + : `No llms.txt found at any candidate location (${candidates.join(', ')})${redirectNote}${suffix}`; return { id: 'llms-txt-exists', category: 'content-discoverability', status: 'fail', - message: `No llms.txt found at any candidate location (${candidates.join(', ')})${redirectNote}${suffix}`, + message, details, }; } @@ -203,11 +227,24 @@ async function checkLlmsTxtExists(ctx: CheckContext): Promise { details.sameContent = allSame; } + // Build a message that surfaces which file was picked as canonical, so users + // can see at a glance which one drives the rest of the report. 
+ let message: string; + if (explicitUrl && canonical) { + message = `llms.txt found at ${canonical.url} (specified via --llms-txt-url)`; + } else if (discovered.length === 1) { + message = `llms.txt found at ${discovered[0].url}`; + } else if (canonical) { + message = `llms.txt found at ${discovered.length} locations; using ${canonical.url} as canonical`; + } else { + message = `llms.txt found at ${discovered.length} location(s)`; + } + return { id: 'llms-txt-exists', category: 'content-discoverability', status: 'pass', - message: `llms.txt found at ${discovered.length} location(s)${suffix}`, + message: message + suffix, details, }; } diff --git a/src/checks/content-discoverability/llms-txt-links-markdown.ts b/src/checks/content-discoverability/llms-txt-links-markdown.ts index fd8ff5a..83cb206 100644 --- a/src/checks/content-discoverability/llms-txt-links-markdown.ts +++ b/src/checks/content-discoverability/llms-txt-links-markdown.ts @@ -1,9 +1,10 @@ import { registerCheck } from '../registry.js'; import { extractMarkdownLinks } from './llms-txt-valid.js'; import { filterByPathPrefix, getPathFilterBase } from '../../helpers/get-page-urls.js'; +import { getLlmsTxtFilesForAnalysis } from '../../helpers/llms-txt.js'; import { toMdUrls } from '../../helpers/to-md-urls.js'; import { looksLikeMarkdown } from '../../helpers/detect-markdown.js'; -import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js'; +import type { CheckContext, CheckResult } from '../../types.js'; interface LinkMarkdownResult { url: string; @@ -25,7 +26,7 @@ function hasMarkdownExtension(url: string): boolean { async function checkLlmsTxtLinksMarkdown(ctx: CheckContext): Promise { const existsResult = ctx.previousResults.get('llms-txt-exists'); - const discovered = (existsResult?.details?.discoveredFiles ?? 
[]) as DiscoveredFile[]; + const discovered = getLlmsTxtFilesForAnalysis(existsResult); if (discovered.length === 0) { return { diff --git a/src/checks/content-discoverability/llms-txt-links-resolve.ts b/src/checks/content-discoverability/llms-txt-links-resolve.ts index a09c10d..fee728c 100644 --- a/src/checks/content-discoverability/llms-txt-links-resolve.ts +++ b/src/checks/content-discoverability/llms-txt-links-resolve.ts @@ -2,7 +2,8 @@ import { registerCheck } from '../registry.js'; import { LINK_RESOLVE_THRESHOLD } from '../../constants.js'; import { extractMarkdownLinks } from './llms-txt-valid.js'; import { filterByPathPrefix, getPathFilterBase } from '../../helpers/get-page-urls.js'; -import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js'; +import { getLlmsTxtFilesForAnalysis } from '../../helpers/llms-txt.js'; +import type { CheckContext, CheckResult } from '../../types.js'; interface LinkCheckResult { url: string; @@ -13,7 +14,7 @@ interface LinkCheckResult { async function checkLlmsTxtLinksResolve(ctx: CheckContext): Promise { const existsResult = ctx.previousResults.get('llms-txt-exists'); - const discovered = (existsResult?.details?.discoveredFiles ?? 
[]) as DiscoveredFile[]; + const discovered = getLlmsTxtFilesForAnalysis(existsResult); if (discovered.length === 0) { return { diff --git a/src/checks/content-discoverability/llms-txt-size.ts b/src/checks/content-discoverability/llms-txt-size.ts index 8929149..93ac3dc 100644 --- a/src/checks/content-discoverability/llms-txt-size.ts +++ b/src/checks/content-discoverability/llms-txt-size.ts @@ -1,9 +1,10 @@ import { registerCheck } from '../registry.js'; -import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js'; +import { getLlmsTxtFilesForAnalysis } from '../../helpers/llms-txt.js'; +import type { CheckContext, CheckResult } from '../../types.js'; async function checkLlmsTxtSize(ctx: CheckContext): Promise { const existsResult = ctx.previousResults.get('llms-txt-exists'); - const discovered = (existsResult?.details?.discoveredFiles ?? []) as DiscoveredFile[]; + const discovered = getLlmsTxtFilesForAnalysis(existsResult); if (discovered.length === 0) { return { diff --git a/src/checks/content-discoverability/llms-txt-valid.ts b/src/checks/content-discoverability/llms-txt-valid.ts index d919952..d53dea7 100644 --- a/src/checks/content-discoverability/llms-txt-valid.ts +++ b/src/checks/content-discoverability/llms-txt-valid.ts @@ -1,5 +1,6 @@ import { registerCheck } from '../registry.js'; -import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js'; +import { getLlmsTxtFilesForAnalysis } from '../../helpers/llms-txt.js'; +import type { CheckContext, CheckResult } from '../../types.js'; interface ValidationResult { url: string; @@ -48,7 +49,7 @@ function validateLlmsTxt(content: string, url: string): ValidationResult { async function checkLlmsTxtValid(ctx: CheckContext): Promise { const existsResult = ctx.previousResults.get('llms-txt-exists'); - const discovered = (existsResult?.details?.discoveredFiles ?? 
[]) as DiscoveredFile[]; + const discovered = getLlmsTxtFilesForAnalysis(existsResult); if (discovered.length === 0) { return { diff --git a/src/checks/observability/cache-header-hygiene.ts b/src/checks/observability/cache-header-hygiene.ts index a8f9884..2a6f2b4 100644 --- a/src/checks/observability/cache-header-hygiene.ts +++ b/src/checks/observability/cache-header-hygiene.ts @@ -118,7 +118,9 @@ async function check(ctx: CheckContext): Promise { // Collect URLs to check: llms.txt files + sampled page URLs const urlsToCheck: string[] = []; - // llms.txt URLs + // llms.txt URLs — intentionally checks ALL discovered files (not just the + // canonical) so that multiple llms.txt locations (apex + docs) are each + // expected to have appropriate cache headers. const existsResult = ctx.previousResults.get('llms-txt-exists'); const discovered = (existsResult?.details?.discoveredFiles ?? []) as DiscoveredFile[]; for (const file of discovered) { diff --git a/src/cli/commands/check.ts b/src/cli/commands/check.ts index 713dfbc..36cd7b2 100644 --- a/src/cli/commands/check.ts +++ b/src/cli/commands/check.ts @@ -42,6 +42,10 @@ export function registerCheckCommand(program: Command): void { '--canonical-origin ', 'The production domain your content links to (for preview/staging testing)', ) + .option( + '--llms-txt-url ', + 'Explicit llms.txt URL to use as canonical (bypasses discovery heuristic)', + ) .action(async (rawUrl: string | undefined, opts: Record) => { // Load config: explicit path or auto-discover let config; @@ -199,6 +203,24 @@ export function registerCheckCommand(program: Command): void { } } + let llmsTxtUrl: string | undefined; + const rawLlmsTxtUrl = (opts.llmsTxtUrl as string | undefined) ?? 
config?.options?.llmsTxtUrl; + if (rawLlmsTxtUrl) { + try { + llmsTxtUrl = new URL(normalizeUrl(rawLlmsTxtUrl)).toString(); + } catch { + process.stderr.write(`Error: Invalid --llms-txt-url "${rawLlmsTxtUrl}".\n`); + process.exitCode = 1; + return; + } + const targetOrigin = new URL(url).origin; + if (new URL(llmsTxtUrl).origin !== targetOrigin) { + process.stderr.write( + `Warning: --llms-txt-url origin (${new URL(llmsTxtUrl).origin}) differs from target origin (${targetOrigin}). The flag will still be used as canonical.\n`, + ); + } + } + const report = await runChecks(url, { checkIds, skipCheckIds, @@ -214,6 +236,7 @@ export function registerCheckCommand(program: Command): void { ...(preferredLocale && { preferredLocale }), ...(preferredVersion && { preferredVersion }), ...(canonicalOrigin && { canonicalOrigin }), + ...(llmsTxtUrl && { llmsTxtUrl }), }); let output: string; diff --git a/src/helpers/get-page-urls.ts b/src/helpers/get-page-urls.ts index fb58206..1bdcb72 100644 --- a/src/helpers/get-page-urls.ts +++ b/src/helpers/get-page-urls.ts @@ -1,6 +1,7 @@ import { extractMarkdownLinks } from '../checks/content-discoverability/llms-txt-valid.js'; import { MAX_SITEMAP_URLS } from '../constants.js'; -import { isNonPageUrl, isMdUrl, toHtmlUrl } from './to-md-urls.js'; +import { getLlmsTxtFilesForAnalysis, selectCanonicalLlmsTxt } from './llms-txt.js'; +import { isMdUrl, toHtmlUrl } from './to-md-urls.js'; import type { CheckContext, DiscoveredFile } from '../types.js'; /** @@ -38,7 +39,7 @@ export function parseSitemapUrls(xml: string): { urls: string[]; sitemapIndexUrl export async function getUrlsFromCachedLlmsTxt(ctx: CheckContext): Promise { const existsResult = ctx.previousResults.get('llms-txt-exists'); - const discovered = (existsResult?.details?.discoveredFiles ?? 
[]) as DiscoveredFile[]; + const discovered = getLlmsTxtFilesForAnalysis(existsResult); const urls = extractLinksFromLlmsTxtFiles(discovered); return walkAggregateLinks(ctx, urls); @@ -75,43 +76,78 @@ function extractLinksFromLlmsTxtFiles(files: DiscoveredFile[]): string[] { return Array.from(urls); } +/** + * Maximum depth of nested aggregate (.txt) files to follow when expanding + * an llms.txt index. Most sites use 1-2 levels (e.g. a top-level index that + * points to per-section indexes). 5 covers deeper trees like Alchemy's + * `/docs/llms.txt → /docs/chains/llms.txt → /docs/chains/{chain}/llms.txt` + * while still terminating on pathological cycles. + */ +const MAX_AGGREGATE_DEPTH = 5; + +/** + * Hard cap on the number of aggregate .txt files we'll fetch in a single + * walk, regardless of depth. Protects against very large sites + * (Alchemy has ~80 per-chain files) ballooning into hundreds of HTTP + * requests. 200 is enough headroom for the largest realistic case while + * still being a clear safety net. + */ +const MAX_AGGREGATE_FILES = 200; + /** * Identify .txt links that are likely aggregate/index files (progressive - * disclosure pattern) and walk them one level deep to find page URLs. + * disclosure pattern) and recursively walk them to find page URLs. * * A link is considered walkable when it ends in .txt and is on the same - * origin as the site being tested. This covers both sub-product llms.txt - * files (Cloudflare) and aggregate content files (Supabase). + * origin as the site being tested. This covers: + * - sub-product llms.txt files (Cloudflare) + * - aggregate content files (Supabase) + * - multi-level nested indexes (Alchemy: top → section → sub-section) + * + * Walking is bounded by `MAX_AGGREGATE_DEPTH` and `MAX_AGGREGATE_FILES` + * so a malformed or extremely large index can't make us fetch unboundedly. + * Each aggregate is fetched at most once even if referenced multiple times. 
*/ async function walkAggregateLinks(ctx: CheckContext, urls: string[]): Promise { const pageUrls: string[] = []; - const aggregateUrls: string[] = []; + const visitedAggregates = new Set(); + const queue: Array<{ url: string; depth: number }> = []; const siteOrigin = ctx.effectiveOrigin ?? ctx.origin; - for (const url of urls) { + function isSameOrigin(parsed: URL): boolean { + return parsed.origin === ctx.origin || parsed.origin === siteOrigin; + } + + /** Sort a discovered URL into the page bucket or the aggregate-walk queue. */ + function classify(url: string, parentDepth: number): void { try { const parsed = new URL(url); if (/\.txt$/i.test(parsed.pathname)) { - // .txt files are either aggregate indexes to walk (same origin) - // or external resources to skip — never page URLs themselves - if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) { - aggregateUrls.push(url); + // Aggregates: same-origin, not yet visited, within depth budget. + // Cross-origin .txt links are external resources we don't control. + if ( + isSameOrigin(parsed) && + !visitedAggregates.has(url) && + parentDepth + 1 <= MAX_AGGREGATE_DEPTH + ) { + visitedAggregates.add(url); + queue.push({ url, depth: parentDepth + 1 }); } - } else if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) { - // Only include same-origin page URLs; cross-origin links are - // external resources the site owner doesn't control. + } else if (isSameOrigin(parsed)) { pageUrls.push(normalizePageUrl(url)); } } catch { + // Unparseable URL: keep it so the caller's later filtering can decide. pageUrls.push(normalizePageUrl(url)); } } - if (aggregateUrls.length === 0) return pageUrls; + // Seed from the canonical llms.txt's links (parent depth 0). 
+ for (const url of urls) classify(url, 0); - // Fetch aggregate files and extract their links - for (const aggUrl of aggregateUrls) { + while (queue.length > 0 && visitedAggregates.size <= MAX_AGGREGATE_FILES) { + const { url: aggUrl, depth } = queue.shift()!; try { const response = await ctx.http.fetch(aggUrl); if (!response.ok) continue; @@ -130,20 +166,7 @@ async function walkAggregateLinks(ctx: CheckContext, urls: string[]): Promise { - const candidates = new Set(); - candidates.add(`${ctx.baseUrl}/llms.txt`); - candidates.add(`${ctx.origin}/llms.txt`); - candidates.add(`${ctx.origin}/docs/llms.txt`); + const explicitUrl = ctx.options.llmsTxtUrl; + const candidates = explicitUrl + ? [explicitUrl] + : Array.from( + new Set([ + `${ctx.baseUrl}/llms.txt`, + `${ctx.origin}/llms.txt`, + `${ctx.origin}/docs/llms.txt`, + ]), + ); const discovered: DiscoveredFile[] = []; for (const url of candidates) { @@ -184,7 +216,9 @@ async function fetchLlmsTxtUrls(ctx: CheckContext): Promise { } } - const urls = extractLinksFromLlmsTxtFiles(discovered); + const canonical = selectCanonicalLlmsTxt(discovered, ctx.baseUrl); + const filesForAnalysis = canonical ? 
[canonical] : []; + const urls = extractLinksFromLlmsTxtFiles(filesForAnalysis); return walkAggregateLinks(ctx, urls); } diff --git a/src/helpers/index.ts b/src/helpers/index.ts index acdbc2d..4179f1c 100644 --- a/src/helpers/index.ts +++ b/src/helpers/index.ts @@ -8,6 +8,7 @@ export { parseSitemapDirectives, } from './get-page-urls.js'; export type { PageUrlResult, SampledPages } from './get-page-urls.js'; +export { selectCanonicalLlmsTxt, getLlmsTxtFilesForAnalysis } from './llms-txt.js'; export { toMdUrls, isNonPageUrl } from './to-md-urls.js'; export { htmlToMarkdown } from './html-to-markdown.js'; export { fetchPage } from './fetch-page.js'; diff --git a/src/helpers/llms-txt.ts b/src/helpers/llms-txt.ts new file mode 100644 index 0000000..7570551 --- /dev/null +++ b/src/helpers/llms-txt.ts @@ -0,0 +1,104 @@ +import type { CheckResult, DiscoveredFile } from '../types.js'; + +/** + * Get the directory portion of a URL's pathname (the part before the filename), + * without a trailing slash. Returns '' for root-level files. + * + * /llms.txt -> '' + * /docs/llms.txt -> '/docs' + * /docs/v1/llms.txt -> '/docs/v1' + */ +function getFileDir(fileUrl: string): string { + try { + const path = new URL(fileUrl).pathname; + const dir = path.replace(/\/[^/]*$/, ''); + return dir === '/' ? '' : dir; + } catch { + return ''; + } +} + +/** + * Returns true when the file's directory is a (non-strict) prefix of the + * baseUrl's pathname AND the origins match. Files on a different origin + * (e.g. discovered via cross-host redirect fallback) never qualify here. 
+ */ +function fileDirIsPrefixOfBase(fileUrl: string, baseUrl: string): boolean { + try { + const f = new URL(fileUrl); + const b = new URL(baseUrl); + if (f.origin !== b.origin) return false; + const fileDir = getFileDir(fileUrl); + const basePath = b.pathname.replace(/\/$/, ''); + if (fileDir === '') return true; + return fileDir === basePath || basePath.startsWith(fileDir + '/'); + } catch { + return false; + } +} + +function scoreCandidate(fileUrl: string, baseUrl: string): number { + // Files whose directory is a prefix of baseUrl rank above everything else. + // Within that group, deeper directories (more specific) rank higher. + // The +1 ensures any prefix match outranks a non-prefix match (which scores 0). + if (!fileDirIsPrefixOfBase(fileUrl, baseUrl)) return 0; + return 1 + getFileDir(fileUrl).length; +} + +/** + * Pick the canonical llms.txt from a set of discovered files. + * + * Priority: + * 1. The file whose directory is the longest prefix of the baseUrl's + * pathname (most specific to what the user passed). + * 2. Files whose directory is *not* a prefix of the baseUrl rank below + * any prefix-matching file (they cover different parts of the site). + * 3. Ties resolved by registration order — i.e. the order returned by + * the candidate discovery, which already lists baseUrl > origin > docs. 
+ * + * Examples (assuming both files exist): + * baseUrl https://example.com/docs -> /docs/llms.txt wins over /llms.txt + * baseUrl https://example.com -> /llms.txt wins over /docs/llms.txt + * baseUrl https://example.com/docs/v1 -> /docs/v1/llms.txt > /docs/llms.txt > /llms.txt + */ +export function selectCanonicalLlmsTxt( + discovered: DiscoveredFile[], + baseUrl: string, +): DiscoveredFile | undefined { + if (discovered.length === 0) return undefined; + if (discovered.length === 1) return discovered[0]; + + let best = discovered[0]; + let bestScore = scoreCandidate(best.url, baseUrl); + + for (let i = 1; i < discovered.length; i++) { + const score = scoreCandidate(discovered[i].url, baseUrl); + if (score > bestScore) { + best = discovered[i]; + bestScore = score; + } + } + + return best; +} + +/** + * Pick the discovered llms.txt file(s) that downstream checks should treat + * as the source of truth for sampling links, measuring size, validating + * structure, etc. + * + * When `llms-txt-exists` selected a canonical file (the common case), only + * that file is returned. Falls back to the full `discoveredFiles` array for + * backward compatibility with callers (e.g. unit tests) that populate + * `previousResults` directly without going through `llms-txt-exists`. + * + * Returns an empty array when no llms.txt is available. + */ +export function getLlmsTxtFilesForAnalysis( + existsResult: CheckResult | undefined, +): DiscoveredFile[] { + if (!existsResult?.details) return []; + const canonical = existsResult.details.canonicalLlmsTxt as DiscoveredFile | undefined; + if (canonical) return [canonical]; + return (existsResult.details.discoveredFiles as DiscoveredFile[] | undefined) ?? []; +} diff --git a/src/types.ts b/src/types.ts index a5e773d..ed03263 100644 --- a/src/types.ts +++ b/src/types.ts @@ -84,6 +84,16 @@ export interface CheckOptions { preferredVersion?: string; /** Canonical origin to rewrite in fetched content (for preview/staging testing). 
*/ canonicalOrigin?: string; + /** + * Explicit URL to use as the canonical llms.txt for downstream sampling and + * analysis. When set, the standard candidate-discovery heuristic is bypassed + * and only this URL is probed. + * + * Useful when a site has both an apex llms.txt (e.g. for marketing) and a + * docs-section llms.txt, and the heuristic would otherwise pick the wrong + * one. + */ + llmsTxtUrl?: string; } export interface SizeThresholds { diff --git a/test/integration/check-pipeline.test.ts b/test/integration/check-pipeline.test.ts index 12cbb01..12bd04c 100644 --- a/test/integration/check-pipeline.test.ts +++ b/test/integration/check-pipeline.test.ts @@ -1086,6 +1086,101 @@ describe('check pipeline: markdown-content-parity reads from pageCache', () => { }); }); +describe('check pipeline: canonical llms.txt selection', () => { + it('docs llms.txt drives downstream sampling when both apex and docs exist', async () => { + // Mirrors the scenario from issue #53: an apex llms.txt full of marketing + // links and a small docs llms.txt with the actual documentation links. + // When the user passes a docs URL, the docs llms.txt should be canonical + // and downstream checks should sample from its links — not the apex's. + const apexContent = `# Marketing\n\n> Apex marketing page.\n\n## Links\n\n${Array.from( + { length: 20 }, + (_, i) => `- [Marketing ${i}](http://canon-pipe.local/blog/post-${i}): Blog post ${i}`, + ).join('\n')}\n`; + const docsContent = + '# Docs\n\n> Docs index.\n\n## Links\n\n- [Guide](http://canon-pipe.local/docs/guide): Guide\n'; + + server.use( + http.get('http://canon-pipe.local/llms.txt', () => HttpResponse.text(apexContent)), + http.get('http://canon-pipe.local/docs/llms.txt', () => HttpResponse.text(docsContent)), + http.get( + 'http://canon-pipe.local/docs/guide', + () => + new HttpResponse('

Guide

', { + status: 200, + headers: { 'Content-Type': 'text/html' }, + }), + ), + http.head( + 'http://canon-pipe.local/docs/guide', + () => new HttpResponse(null, { status: 200 }), + ), + ); + mockSitemapNotFound(server, 'http://canon-pipe.local/docs'); + + const report = await runChecks('http://canon-pipe.local/docs', { + checkIds: ['llms-txt-exists', 'llms-txt-size', 'llms-txt-valid', 'llms-txt-links-resolve'], + requestDelay: 0, + }); + + const existsResult = report.results.find((r) => r.id === 'llms-txt-exists')!; + const sizeResult = report.results.find((r) => r.id === 'llms-txt-size')!; + const validResult = report.results.find((r) => r.id === 'llms-txt-valid')!; + const resolveResult = report.results.find((r) => r.id === 'llms-txt-links-resolve')!; + + // llms-txt-exists picks the docs file as canonical + expect(existsResult.status).toBe('pass'); + expect(existsResult.details?.canonicalUrl).toBe('http://canon-pipe.local/docs/llms.txt'); + + // Downstream checks should report on the docs file only, not the apex. + // size: docs file is small, so it passes — the apex (with 20 inline links) + // would not influence the result. 
+ expect(sizeResult.status).toBe('pass'); + const sizes = sizeResult.details?.sizes as Array<{ url: string; characters: number }>; + expect(sizes).toHaveLength(1); + expect(sizes[0].url).toBe('http://canon-pipe.local/docs/llms.txt'); + + // valid: validates the docs file (which has H1, blockquote, sections, links) + expect(validResult.status).toBe('pass'); + + // links-resolve: only tests the single docs link, not the 20 marketing links + expect(resolveResult.status).toBe('pass'); + expect(resolveResult.details?.totalLinks).toBe(1); + }); + + it('--llms-txt-url forces a specific file even when others would be discovered', async () => { + const apexContent = + '# Apex\n\n> Apex.\n\n## Links\n\n- [Blog](http://override-pipe.local/blog): Blog\n'; + const explicitContent = + '# Explicit\n\n> Explicit.\n\n## Links\n\n- [Guide](http://override-pipe.local/docs/guide): Guide\n'; + + server.use( + http.get('http://override-pipe.local/llms.txt', () => HttpResponse.text(apexContent)), + http.get('http://override-pipe.local/custom/llms.txt', () => + HttpResponse.text(explicitContent), + ), + http.get( + 'http://override-pipe.local/docs/llms.txt', + () => new HttpResponse(null, { status: 404 }), + ), + ); + + const report = await runChecks('http://override-pipe.local', { + checkIds: ['llms-txt-exists', 'llms-txt-valid'], + requestDelay: 0, + llmsTxtUrl: 'http://override-pipe.local/custom/llms.txt', + }); + + const existsResult = report.results.find((r) => r.id === 'llms-txt-exists')!; + expect(existsResult.details?.canonicalUrl).toBe('http://override-pipe.local/custom/llms.txt'); + + const validResult = report.results.find((r) => r.id === 'llms-txt-valid')!; + const validations = validResult.details?.validations as Array<{ url: string }>; + // Only the explicit file should be validated, not the apex + expect(validations).toHaveLength(1); + expect(validations[0].url).toBe('http://override-pipe.local/custom/llms.txt'); + }); +}); + describe('check pipeline: effectiveOrigin 
propagation', () => { it('llms-txt-exists sets effectiveOrigin which llms-txt-freshness uses', async () => { // llms.txt redirects cross-host; sitemap lives at the redirected host diff --git a/test/unit/checks/llms-txt-exists.test.ts b/test/unit/checks/llms-txt-exists.test.ts index b68b217..65890ac 100644 --- a/test/unit/checks/llms-txt-exists.test.ts +++ b/test/unit/checks/llms-txt-exists.test.ts @@ -233,4 +233,109 @@ describe('llms-txt-exists', () => { expect(report.results[0].details?.rateLimited).toBe(1); expect(report.results[0].message).toContain('rate-limited (HTTP 429)'); }); + + describe('canonical selection', () => { + const APEX_LLMS_TXT = `# Apex marketing\n\n> Apex.\n\n## Links\n\n- [Blog](http://canon.local/blog/post): Blog\n`; + const DOCS_LLMS_TXT = `# Docs\n\n> Docs index.\n\n## Links\n\n- [Guide](http://canon.local/docs/guide): Guide\n`; + + it('picks /docs/llms.txt as canonical when baseUrl is the docs path', async () => { + server.use( + http.get('http://canon.local/llms.txt', () => HttpResponse.text(APEX_LLMS_TXT)), + http.get('http://canon.local/docs/llms.txt', () => HttpResponse.text(DOCS_LLMS_TXT)), + ); + + const report = await runChecks('http://canon.local/docs', { + checkIds: ['llms-txt-exists'], + requestDelay: 0, + }); + + expect(report.results[0].status).toBe('pass'); + expect(report.results[0].details?.canonicalUrl).toBe('http://canon.local/docs/llms.txt'); + expect(report.results[0].details?.canonicalSource).toBe('heuristic'); + expect(report.results[0].message).toContain('using http://canon.local/docs/llms.txt'); + }); + + it('picks the apex llms.txt as canonical when baseUrl is the origin', async () => { + server.use( + http.get('http://canon-apex.local/llms.txt', () => HttpResponse.text(APEX_LLMS_TXT)), + http.get('http://canon-apex.local/docs/llms.txt', () => HttpResponse.text(DOCS_LLMS_TXT)), + ); + + const report = await runChecks('http://canon-apex.local', { + checkIds: ['llms-txt-exists'], + requestDelay: 0, + }); + + 
expect(report.results[0].status).toBe('pass'); + expect(report.results[0].details?.canonicalUrl).toBe('http://canon-apex.local/llms.txt'); + }); + + it('omits canonicalSource when only one file is discovered', async () => { + server.use( + http.get('http://canon-single.local/llms.txt', () => HttpResponse.text(APEX_LLMS_TXT)), + http.get( + 'http://canon-single.local/docs/llms.txt', + () => new HttpResponse(null, { status: 404 }), + ), + ); + + const report = await runChecks('http://canon-single.local', { + checkIds: ['llms-txt-exists'], + requestDelay: 0, + }); + + expect(report.results[0].status).toBe('pass'); + expect(report.results[0].details?.canonicalUrl).toBe('http://canon-single.local/llms.txt'); + expect(report.results[0].details?.canonicalSource).toBeUndefined(); + expect(report.results[0].message).toBe( + 'llms.txt found at http://canon-single.local/llms.txt', + ); + }); + }); + + describe('--llms-txt-url override', () => { + const VALID = `# Override\n\n> Override docs.\n\n## Links\n\n- [Page](http://override.local/x): X\n`; + + it('probes only the explicit URL and uses it as canonical', async () => { + server.use( + // The discovery heuristic would normally hit /llms.txt and /docs/llms.txt, + // but with the override only the explicit URL is probed. 
+ http.get('http://override.local/custom/llms.txt', () => HttpResponse.text(VALID)), + ); + + const report = await runChecks('http://override.local', { + checkIds: ['llms-txt-exists'], + requestDelay: 0, + llmsTxtUrl: 'http://override.local/custom/llms.txt', + }); + + expect(report.results[0].status).toBe('pass'); + expect(report.results[0].details?.canonicalUrl).toBe('http://override.local/custom/llms.txt'); + expect(report.results[0].details?.canonicalSource).toBe('explicit'); + expect(report.results[0].message).toContain('specified via --llms-txt-url'); + // candidateUrls should only include the explicit URL + const candidates = report.results[0].details?.candidateUrls as Array<{ url: string }>; + expect(candidates).toHaveLength(1); + expect(candidates[0].url).toBe('http://override.local/custom/llms.txt'); + }); + + it('reports an explicit-URL-aware failure when the override 404s', async () => { + server.use( + http.get( + 'http://override-missing.local/custom/llms.txt', + () => new HttpResponse(null, { status: 404 }), + ), + ); + + const report = await runChecks('http://override-missing.local', { + checkIds: ['llms-txt-exists'], + requestDelay: 0, + llmsTxtUrl: 'http://override-missing.local/custom/llms.txt', + }); + + expect(report.results[0].status).toBe('fail'); + expect(report.results[0].message).toContain('--llms-txt-url'); + expect(report.results[0].message).toContain('http://override-missing.local/custom/llms.txt'); + }); + }); }); diff --git a/test/unit/cli/check-command.test.ts b/test/unit/cli/check-command.test.ts index 1c43d7e..00b8858 100644 --- a/test/unit/cli/check-command.test.ts +++ b/test/unit/cli/check-command.test.ts @@ -576,6 +576,106 @@ describe('check command config integration', () => { stderrSpy.mockRestore(); }); + it('accepts --llms-txt-url and uses it as canonical', async () => { + const customLlmsTxt = `# Custom\n\n> Custom docs.\n\n## Links\n\n- [Page](http://cmd-llms-url.local/x): X\n`; + server.use( + // The discovery heuristic 
would normally fall back to /llms.txt, but the + // explicit URL should be the only thing probed. + http.get('http://cmd-llms-url.local/custom/llms.txt', () => HttpResponse.text(customLlmsTxt)), + ); + + const writeSpy = vi.spyOn(process.stdout, 'write').mockImplementation(() => true); + + const { run } = await import('../../../src/cli/index.js'); + await run([ + 'node', + 'afdocs', + 'check', + 'http://cmd-llms-url.local', + '--checks', + 'llms-txt-exists', + '--format', + 'json', + '--llms-txt-url', + 'http://cmd-llms-url.local/custom/llms.txt', + '--request-delay', + '0', + ]); + await new Promise((r) => setTimeout(r, 100)); + + const output = writeSpy.mock.calls.map((c) => c[0]).join(''); + const parsed = JSON.parse(output.trim()); + expect(parsed.results[0].status).toBe('pass'); + expect(parsed.results[0].details.canonicalUrl).toBe( + 'http://cmd-llms-url.local/custom/llms.txt', + ); + expect(parsed.results[0].details.canonicalSource).toBe('explicit'); + + writeSpy.mockRestore(); + }); + + it('rejects invalid --llms-txt-url', async () => { + const stderrSpy = vi.spyOn(process.stderr, 'write').mockImplementation(() => true); + + const { run } = await import('../../../src/cli/index.js'); + await run([ + 'node', + 'afdocs', + 'check', + 'http://cmd-llms-url-bad.local', + '--llms-txt-url', + ':::not-a-url:::', + '--request-delay', + '0', + ]); + await new Promise((r) => setTimeout(r, 100)); + + const output = stderrSpy.mock.calls.map((c) => c[0]).join(''); + expect(output).toContain('Invalid --llms-txt-url'); + expect(process.exitCode).toBe(1); + + stderrSpy.mockRestore(); + }); + + it('warns when --llms-txt-url origin differs from target origin', async () => { + const customLlmsTxt = `# Other\n\n> Other.\n\n## Links\n\n- [P](http://other.local/x): X\n`; + server.use( + http.get('http://other.local/custom/llms.txt', () => HttpResponse.text(customLlmsTxt)), + ); + + const stdoutSpy = vi.spyOn(process.stdout, 'write').mockImplementation(() => true); + const 
stderrSpy = vi.spyOn(process.stderr, 'write').mockImplementation(() => true); + + const { run } = await import('../../../src/cli/index.js'); + await run([ + 'node', + 'afdocs', + 'check', + 'http://cmd-llms-url-cross.local', + '--checks', + 'llms-txt-exists', + '--format', + 'json', + '--llms-txt-url', + 'http://other.local/custom/llms.txt', + '--request-delay', + '0', + ]); + await new Promise((r) => setTimeout(r, 100)); + + const stderr = stderrSpy.mock.calls.map((c) => c[0]).join(''); + expect(stderr).toContain('--llms-txt-url origin'); + expect(stderr).toContain('differs from target origin'); + + // Check still runs and uses the explicit URL + const stdout = stdoutSpy.mock.calls.map((c) => c[0]).join(''); + const parsed = JSON.parse(stdout.trim()); + expect(parsed.results[0].details.canonicalUrl).toBe('http://other.local/custom/llms.txt'); + + stdoutSpy.mockRestore(); + stderrSpy.mockRestore(); + }); + it('infers base URL from config pages when url field is omitted', async () => { server.use( http.get('http://cfg-infer.local/llms.txt', () => HttpResponse.text(VALID_LLMS_TXT)), diff --git a/test/unit/helpers/get-page-urls.test.ts b/test/unit/helpers/get-page-urls.test.ts index 3726bae..7703c47 100644 --- a/test/unit/helpers/get-page-urls.test.ts +++ b/test/unit/helpers/get-page-urls.test.ts @@ -1297,6 +1297,75 @@ describe('getPageUrls', () => { expect(result.urls).toEqual(['http://walk-empty.local/docs/page']); }); + it('walks nested aggregate .txt files recursively (Alchemy pattern)', async () => { + // Three-level nested structure: top index → section index → leaf pages. 
+ // Mirrors how Alchemy's docs llms.txt is organized: + // /docs/llms.txt → /docs/chains/llms.txt → /docs/chains/ethereum/llms.txt + const rootContent = `# Docs\n- [Chains](http://nested-walk.local/chains/llms.txt)\n`; + const chainsContent = `# Chains\n- [Ethereum](http://nested-walk.local/chains/ethereum/llms.txt)\n- [Solana](http://nested-walk.local/chains/solana/llms.txt)\n`; + const ethContent = `# Ethereum\n- [eth_call](http://nested-walk.local/chains/ethereum/eth-call.md): Call\n- [eth_chainId](http://nested-walk.local/chains/ethereum/eth-chain-id.md): Chain ID\n`; + const solContent = `# Solana\n- [getBalance](http://nested-walk.local/chains/solana/get-balance.md): Balance\n`; + + server.use( + http.get('http://nested-walk.local/chains/llms.txt', () => HttpResponse.text(chainsContent)), + http.get('http://nested-walk.local/chains/ethereum/llms.txt', () => + HttpResponse.text(ethContent), + ), + http.get('http://nested-walk.local/chains/solana/llms.txt', () => + HttpResponse.text(solContent), + ), + ); + + const ctx = makeCtx('http://nested-walk.local', rootContent); + const result = await getPageUrls(ctx); + expect(result.urls).toContain('http://nested-walk.local/chains/ethereum/eth-call'); + expect(result.urls).toContain('http://nested-walk.local/chains/ethereum/eth-chain-id'); + expect(result.urls).toContain('http://nested-walk.local/chains/solana/get-balance'); + expect(result.urls).toHaveLength(3); + }); + + it('does not refetch the same aggregate when referenced from multiple parents', async () => { + let sharedFetches = 0; + const rootContent = `# Docs\n- [SectionA](http://nested-dup.local/a/llms.txt)\n- [SectionB](http://nested-dup.local/b/llms.txt)\n`; + const aContent = `# A\n- [Shared](http://nested-dup.local/shared/llms.txt)\n- [PageA](http://nested-dup.local/a/page.md)\n`; + const bContent = `# B\n- [Shared](http://nested-dup.local/shared/llms.txt)\n- [PageB](http://nested-dup.local/b/page.md)\n`; + const sharedContent = `# Shared\n- 
[SharedPage](http://nested-dup.local/shared/page.md)\n`; + + server.use( + http.get('http://nested-dup.local/a/llms.txt', () => HttpResponse.text(aContent)), + http.get('http://nested-dup.local/b/llms.txt', () => HttpResponse.text(bContent)), + http.get('http://nested-dup.local/shared/llms.txt', () => { + sharedFetches++; + return HttpResponse.text(sharedContent); + }), + ); + + const ctx = makeCtx('http://nested-dup.local', rootContent); + const result = await getPageUrls(ctx); + + // Shared aggregate should only be fetched once even though both A and B reference it. + expect(sharedFetches).toBe(1); + expect(result.urls).toContain('http://nested-dup.local/a/page'); + expect(result.urls).toContain('http://nested-dup.local/b/page'); + expect(result.urls).toContain('http://nested-dup.local/shared/page'); + }); + + it('terminates on cycles in the aggregate graph', async () => { + const rootContent = `# Docs\n- [A](http://nested-cycle.local/a/llms.txt)\n`; + // a → b → a → … would loop forever without cycle detection. 
+ const aContent = `# A\n- [B](http://nested-cycle.local/b/llms.txt)\n- [Page](http://nested-cycle.local/a/page.md)\n`; + const bContent = `# B\n- [A](http://nested-cycle.local/a/llms.txt)\n`; + + server.use( + http.get('http://nested-cycle.local/a/llms.txt', () => HttpResponse.text(aContent)), + http.get('http://nested-cycle.local/b/llms.txt', () => HttpResponse.text(bContent)), + ); + + const ctx = makeCtx('http://nested-cycle.local', rootContent); + const result = await getPageUrls(ctx); + expect(result.urls).toContain('http://nested-cycle.local/a/page'); + }); + // ── .md URL normalization ── it('normalizes .md URLs from llms.txt to HTML equivalents', async () => { diff --git a/test/unit/helpers/llms-txt.test.ts b/test/unit/helpers/llms-txt.test.ts new file mode 100644 index 0000000..a697fa9 --- /dev/null +++ b/test/unit/helpers/llms-txt.test.ts @@ -0,0 +1,120 @@ +import { describe, it, expect } from 'vitest'; +import { + selectCanonicalLlmsTxt, + getLlmsTxtFilesForAnalysis, +} from '../../../src/helpers/llms-txt.js'; +import type { CheckResult, DiscoveredFile } from '../../../src/types.js'; + +function file(url: string, content = '# stub'): DiscoveredFile { + return { url, content, status: 200, redirected: false }; +} + +describe('selectCanonicalLlmsTxt', () => { + it('returns undefined for empty input', () => { + expect(selectCanonicalLlmsTxt([], 'https://example.com')).toBeUndefined(); + }); + + it('returns the only file when one is provided', () => { + const f = file('https://example.com/llms.txt'); + expect(selectCanonicalLlmsTxt([f], 'https://example.com')).toBe(f); + }); + + it('prefers /docs/llms.txt over apex when baseUrl is /docs', () => { + const apex = file('https://example.com/llms.txt', '# Apex'); + const docs = file('https://example.com/docs/llms.txt', '# Docs'); + const picked = selectCanonicalLlmsTxt([apex, docs], 'https://example.com/docs'); + expect(picked).toBe(docs); + }); + + it('prefers apex over /docs/llms.txt when baseUrl is the 
origin', () => { + const apex = file('https://example.com/llms.txt', '# Apex'); + const docs = file('https://example.com/docs/llms.txt', '# Docs'); + const picked = selectCanonicalLlmsTxt([apex, docs], 'https://example.com'); + expect(picked).toBe(apex); + }); + + it('prefers the deepest matching prefix when several files cover baseUrl', () => { + const apex = file('https://example.com/llms.txt'); + const docs = file('https://example.com/docs/llms.txt'); + const v1 = file('https://example.com/docs/v1/llms.txt'); + const picked = selectCanonicalLlmsTxt([apex, docs, v1], 'https://example.com/docs/v1'); + expect(picked).toBe(v1); + }); + + it('falls back to /docs/llms.txt when /docs/v1/llms.txt is missing', () => { + const apex = file('https://example.com/llms.txt'); + const docs = file('https://example.com/docs/llms.txt'); + const picked = selectCanonicalLlmsTxt([apex, docs], 'https://example.com/docs/v1'); + expect(picked).toBe(docs); + }); + + it('ignores files on a different origin (treats them as non-prefix matches)', () => { + const sameOrigin = file('https://example.com/llms.txt'); + const otherOrigin = file('https://other.com/docs/llms.txt'); + const picked = selectCanonicalLlmsTxt([otherOrigin, sameOrigin], 'https://example.com/docs'); + expect(picked).toBe(sameOrigin); + }); + + it('handles trailing slashes on baseUrl gracefully', () => { + const apex = file('https://example.com/llms.txt'); + const docs = file('https://example.com/docs/llms.txt'); + const picked = selectCanonicalLlmsTxt([apex, docs], 'https://example.com/docs/'); + expect(picked).toBe(docs); + }); + + it('does not pick /docs/llms.txt when baseUrl is /api (different subtree)', () => { + const apex = file('https://example.com/llms.txt'); + const docs = file('https://example.com/docs/llms.txt'); + const picked = selectCanonicalLlmsTxt([apex, docs], 'https://example.com/api'); + expect(picked).toBe(apex); + }); + + it('falls back to non-prefix file when nothing matches', () => { + const apiFile 
= file('https://example.com/api/llms.txt'); + const picked = selectCanonicalLlmsTxt([apiFile], 'https://example.com/docs'); + expect(picked).toBe(apiFile); + }); +}); + +describe('getLlmsTxtFilesForAnalysis', () => { + function makeResult(details: Record<string, unknown>): CheckResult { + return { + id: 'llms-txt-exists', + category: 'content-discoverability', + status: 'pass', + message: 'ok', + details, + }; + } + + it('returns empty array when result is undefined', () => { + expect(getLlmsTxtFilesForAnalysis(undefined)).toEqual([]); + }); + + it('returns empty array when result has no details', () => { + const res: CheckResult = { + id: 'llms-txt-exists', + category: 'content-discoverability', + status: 'fail', + message: 'no', + }; + expect(getLlmsTxtFilesForAnalysis(res)).toEqual([]); + }); + + it('returns canonical when present', () => { + const canonical = file('https://example.com/docs/llms.txt'); + const other = file('https://example.com/llms.txt'); + const res = makeResult({ + canonicalLlmsTxt: canonical, + discoveredFiles: [other, canonical], + }); + expect(getLlmsTxtFilesForAnalysis(res)).toEqual([canonical]); + }); + + it('falls back to discoveredFiles when no canonical (legacy callers)', () => { + const a = file('https://example.com/llms.txt'); + const b = file('https://example.com/docs/llms.txt'); + const res = makeResult({ discoveredFiles: [a, b] }); + expect(getLlmsTxtFilesForAnalysis(res)).toEqual([a, b]); + }); +});