From 1b512478b9e245d6d7bd2d239b4c83e06bf3ae14 Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Fri, 27 Mar 2026 09:19:09 -0700 Subject: [PATCH 1/2] feat(docs): enhance search with TF-IDF relevance ranking (#40) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add relevance ranking toggle to existing Pagefind search (Ctrl+K) - When enabled: re-ranks results using TF-IDF cosine similarity with title/heading boost - Toggle persists in localStorage, off by default - Build-time index: chunks ~108 markdown files for relevance scoring - 9 search quality tests (schema, coverage, relevance, data quality) - Zero new dependencies — pure JS scoring enhancement Relates to #40 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/.gitignore | 1 + docs/package.json | 3 +- docs/scripts/build-search-index.mjs | 153 +++++++++++++++++++ docs/src/components/Search.astro | 178 +++++++++++++++++++++- docs/src/styles/global.css | 2 +- test/docs-search.test.ts | 225 ++++++++++++++++++++++++++++ 6 files changed, 552 insertions(+), 10 deletions(-) create mode 100644 docs/scripts/build-search-index.mjs create mode 100644 test/docs-search.test.ts diff --git a/docs/.gitignore b/docs/.gitignore index ddce69b68..f44a4d0ed 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,3 +1,4 @@ node_modules/ dist/ .astro/ +public/search-index.json diff --git a/docs/package.json b/docs/package.json index 488a4890d..dc663ec5a 100644 --- a/docs/package.json +++ b/docs/package.json @@ -5,7 +5,8 @@ "private": true, "scripts": { "dev": "astro dev", - "build": "astro build && npx pagefind --site dist", + "build:search": "node scripts/build-search-index.mjs", + "build": "node scripts/build-search-index.mjs && astro build && npx pagefind --site dist", "preview": "astro preview", "astro": "astro", "test": "node --test tests/build-output.test.mjs && npx playwright test", diff --git 
a/docs/scripts/build-search-index.mjs b/docs/scripts/build-search-index.mjs
new file mode 100644
index 000000000..00e082849
--- /dev/null
+++ b/docs/scripts/build-search-index.mjs
@@ -0,0 +1,241 @@
+#!/usr/bin/env node
+/**
+ * build-search-index.mjs
+ * Reads all .md files from docs/src/content/docs/, chunks by ## headings,
+ * and outputs a static search-index.json for client-side TF-IDF search.
+ */
+
+import { readdir, readFile, writeFile, mkdir } from 'node:fs/promises';
+import { join, relative, dirname, sep } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const DOCS_ROOT = join(__dirname, '..', 'src', 'content', 'docs');
+const OUTPUT_DIR = join(__dirname, '..', 'public');
+const OUTPUT_FILE = join(OUTPUT_DIR, 'search-index.json');
+
+// Section display names derived from directory
+const SECTION_NAMES = {
+  'get-started': 'Get Started',
+  guide: 'Guide',
+  features: 'Features',
+  reference: 'Reference',
+  scenarios: 'Scenarios',
+  concepts: 'Concepts',
+  cookbook: 'Cookbook',
+};
+
+// Opening or closing line of a fenced code block (``` or ~~~).
+const FENCE_RE = /^\s{0,3}(`{3,}|~{3,})/;
+
+/**
+ * Recursively collects the paths of all `.md` files under a directory.
+ * @param {string} dir - Directory to walk.
+ * @returns {Promise<string[]>} Paths of every `.md` file found.
+ */
+async function collectMdFiles(dir) {
+  const entries = await readdir(dir, { withFileTypes: true });
+  const files = [];
+  for (const entry of entries) {
+    const full = join(dir, entry.name);
+    if (entry.isDirectory()) {
+      files.push(...(await collectMdFiles(full)));
+    } else if (entry.name.endsWith('.md')) {
+      files.push(full);
+    }
+  }
+  return files;
+}
+
+/**
+ * Removes a leading `---` frontmatter block, if present.
+ * Tolerates a UTF-8 BOM and a closing `---` that ends the file.
+ * @param {string} content - Raw file content.
+ * @returns {string} Content without the frontmatter block.
+ */
+function stripFrontmatter(content) {
+  const match = content.match(/^\uFEFF?---\r?\n[\s\S]*?\r?\n---(?:\r?\n|$)/);
+  return match ? content.slice(match[0].length) : content;
+}
+
+/**
+ * Splits markdown into lines and flags those inside fenced code blocks
+ * (fence lines included), so `#` lines in code are not read as headings.
+ * @param {string} markdown - Markdown text without frontmatter.
+ * @returns {{ line: string, fenced: boolean }[]} One entry per line.
+ */
+function markFencedLines(markdown) {
+  let openFence = null;
+  return markdown.split(/\r?\n/).map((line) => {
+    const fence = line.match(FENCE_RE)?.[1];
+    if (openFence === null) {
+      if (fence) openFence = fence;
+      return { line, fenced: Boolean(fence) };
+    }
+    // A block closes on a bare fence of the same character, at least as long.
+    const closes =
+      fence !== undefined &&
+      fence[0] === openFence[0] &&
+      fence.length >= openFence.length &&
+      line.trim() === fence;
+    if (closes) openFence = null;
+    return { line, fenced: true };
+  });
+}
+
+/**
+ * Returns the page title: the first H1 outside frontmatter and fenced code,
+ * else the frontmatter `title:` value, else 'Untitled'.
+ * @param {string} content - Raw file content, frontmatter included.
+ * @returns {string} The resolved title.
+ */
+function extractTitle(content) {
+  const body = stripFrontmatter(content);
+  for (const { line, fenced } of markFencedLines(body)) {
+    // YAML comments and shell comments in code samples also start with `# `.
+    const h1 = fenced ? null : line.match(/^#\s+(\S.*)$/);
+    if (h1) return h1[1].trim();
+  }
+  const frontmatter = content.slice(0, content.length - body.length);
+  const fmTitle = frontmatter.match(/^title:[ \t]*(.+)$/m)?.[1].trim();
+  return fmTitle?.replace(/^(['"])(.*)\1$/, '$2') || 'Untitled';
+}
+
+/**
+ * Converts a file path under DOCS_ROOT into a forward-slash slug without
+ * the `.md` extension; a trailing `/index` segment is dropped.
+ * @param {string} filePath - Path of a markdown file.
+ * @returns {string} The slug, e.g. `guide/install`.
+ */
+function deriveSlug(filePath) {
+  let rel = relative(DOCS_ROOT, filePath)
+    .replace(/\\/g, '/')
+    .replace(/\.md$/, '');
+  // NOTE(review): a root-level index.md keeps the slug 'index' — confirm.
+  if (rel.endsWith('/index')) rel = rel.replace(/\/index$/, '');
+  return rel;
+}
+
+/**
+ * Maps a slug's first path segment to its section display name, falling
+ * back to the capitalized segment for directories not in SECTION_NAMES.
+ * @param {string} slug - Slug as produced by deriveSlug.
+ * @returns {string} Section display name.
+ */
+function deriveSection(slug) {
+  const first = slug.split('/')[0];
+  // Own-property check: 'constructor' etc. must not hit Object.prototype.
+  if (Object.hasOwn(SECTION_NAMES, first)) return SECTION_NAMES[first];
+  return first.charAt(0).toUpperCase() + first.slice(1);
+}
+
+/**
+ * Reduces markdown to plain text for indexing. Code spans and fenced code
+ * are dropped; other markup is removed while its text is kept.
+ * @param {string} text - Markdown fragment.
+ * @returns {string} Plain text with blank lines collapsed.
+ */
+function stripMarkdown(text) {
+  return text
+    .replace(/!\[.*?\]\(.*?\)/g, '') // images
+    .replace(/\[([^\]]*)\]\(.*?\)/g, '$1') // links → text
+    .replace(/(`{1,3})[\s\S]*?\1/g, '') // inline/fenced code
+    .replace(/^#{1,6}[ \t]+/gm, '') // heading markers
+    .replace(/^>\s?/gm, '') // blockquotes
+    .replace(/[*~]{1,3}/g, '') // bold/italic/strikethrough
+    // `_` only at word edges, so snake_case identifiers stay intact
+    .replace(/(?<![A-Za-z0-9])_{1,3}|_{1,3}(?![A-Za-z0-9])/g, '')
+    .replace(/^[-*+]\s/gm, '') // unordered list markers
+    .replace(/^\d+\.\s/gm, '') // ordered list markers
+    .replace(/\|/g, ' ') // table pipes
+    .replace(/^-{3,}$/gm, '') // horizontal rules
+    .replace(/<[^>]+>/g, '') // HTML tags
+    .replace(/\n{2,}/g, '\n') // collapse blank lines
+    .trim();
+}
+
+/**
+ * Splits a page into search chunks, one per `##`/`###` section. Text before
+ * the first such heading is filed under the page title.
+ * @param {string} content - Raw file content, frontmatter included.
+ * @param {string} pageTitle - Title stored on every chunk of the page.
+ * @param {string} slug - Page slug stored on every chunk.
+ * @returns {{ title: string, slug: string, section: string, heading: string, text: string }[]}
+ *   Chunks in document order; sections under 20 characters are skipped.
+ */
+function chunkByHeadings(content, pageTitle, slug) {
+  const body = stripFrontmatter(content);
+  const section = deriveSection(slug);
+  const chunks = [];
+  let currentHeading = pageTitle;
+  let buffer = [];
+
+  function flush() {
+    const raw = buffer.join('\n').trim();
+    if (!raw) return;
+    const text = stripMarkdown(raw);
+    if (text.length < 20) return; // skip tiny chunks
+    chunks.push({
+      title: pageTitle,
+      slug,
+      section,
+      heading: currentHeading,
+      text,
+    });
+  }
+
+  for (const { line, fenced } of markFencedLines(body)) {
+    // A `## ...` line inside a code fence is sample text, not a heading.
+    const headingMatch = fenced ? null : line.match(/^#{2,3}\s+(.+)/);
+    if (headingMatch) {
+      flush();
+      currentHeading = headingMatch[1].trim();
+      buffer = [];
+    } else {
+      buffer.push(line);
+    }
+  }
+  flush();
+
+  // If no chunks were produced, add the whole page as one chunk
+  if (chunks.length === 0) {
+    const text = stripMarkdown(body);
+    if (text.length >= 20) {
+      chunks.push({ title: pageTitle, slug, section, heading: pageTitle, text });
+    }
+  }
+
+  return chunks;
+}
+
+/**
+ * Builds the index: collects docs, chunks each file, writes the JSON output.
+ * @returns {Promise<void>}
+ */
+async function main() {
+  console.log('Building search index...');
+  // readdir order is filesystem-dependent; sort so builds are reproducible.
+  const files = (await collectMdFiles(DOCS_ROOT)).sort();
+  console.log(`Found ${files.length} markdown files`);
+
+  const perFile = await Promise.all(
+    files.map(async (file) => {
+      const content = await readFile(file, 'utf-8');
+      return chunkByHeadings(content, extractTitle(content), deriveSlug(file));
+    }),
+  );
+  const allChunks = perFile.flat();
+
+  await mkdir(OUTPUT_DIR, { recursive: true });
+  const json = JSON.stringify(allChunks);
+  await writeFile(OUTPUT_FILE, json, 'utf-8');
+
+  const sizeKB = (Buffer.byteLength(json) / 1024).toFixed(1);
+  console.log(`✓ ${allChunks.length} chunks from ${files.length} files`);
+  console.log(`✓ Output: search-index.json (${sizeKB} KB)`);
+}
+
+main().catch((err) => {
+  console.error('Build search index failed:', err);
+  process.exit(1);
+});
diff --git a/docs/src/components/Search.astro b/docs/src/components/Search.astro
index cac07914b..2b8791388 100644
--- a/docs/src/components/Search.astro
+++ b/docs/src/components/Search.astro
@@ -36,6 +36,17 @@ const base = import.meta.env.BASE_URL; /> Esc + +
+ +
@@ -50,7 +61,121 @@ const base = import.meta.env.BASE_URL;