From b680f661095d374478832d7cf3a0a6bddc6d7841 Mon Sep 17 00:00:00 2001 From: dacharyc Date: Sun, 3 May 2026 22:33:10 -0400 Subject: [PATCH] fix(parity): DOM-aware HTML extraction and heading-line protection Replaces the flat-text + regex pipeline in extractHtmlText with a DOM walker that re-parses
<pre> rawText to expose syntax-highlighter markup
as DOM nodes. This eliminates the inline `` / `
` / `` ambiguity that issue #90 reported: tag mentions in prose now flow through as literal text instead of being deleted by the tag-stripping regex. The HTML_TAG_NAMES set is no longer needed. Adds heading-line placeholder protection in extractMarkdownText (restored after list-marker strips) so leading "1. " in numbered headings like "### 1. How well..." is preserved instead of being stripped as a numbered-list marker (issue #91). Validated against 20 doc sites from PARITY-CHECK-NOTES.md: 3 sites improved (mongodb 8% to 2%, resend 4% to 1%, posthog warn to pass), 0 regressed. Issue repro page (audit-conclusions) goes from 6 missing segments / warn to 0 missing / pass. Closes #90 #91 --- .../observability/markdown-content-parity.ts | 256 +++++++----------- .../checks/markdown-content-parity.test.ts | 121 +++++++++ 2 files changed, 221 insertions(+), 156 deletions(-) diff --git a/src/checks/observability/markdown-content-parity.ts b/src/checks/observability/markdown-content-parity.ts index 475bab2..2be21c7 100644 --- a/src/checks/observability/markdown-content-parity.ts +++ b/src/checks/observability/markdown-content-parity.ts @@ -1,4 +1,4 @@ -import { parse } from 'node-html-parser'; +import { parse, NodeType, type HTMLElement, type Node } from 'node-html-parser'; import { registerCheck } from '../registry.js'; import { fetchPage } from '../../helpers/fetch-page.js'; import { toHtmlUrl } from '../../helpers/to-md-urls.js'; @@ -32,12 +32,9 @@ const STRIP_TAGS = [ ]; /** - * Tags that were removed at the DOM level (STRIP_TAGS). If these tag names - * appear in `.text` output, they came from entity-decoded content (e.g., - * `<nav>` → `<nav>` in prose discussing HTML elements), not from - * actual DOM elements. The text-level tag stripping regex should keep their - * content rather than deleting it, so both sides produce matching text - * after normalize() strips the angle brackets. 
+ * Tag names corresponding to STRIP_TAGS, used by the DOM walker to skip + * these elements if they reappear inside re-parsed <pre> content (e.g., + * a stray <style> block injected by a CSS-in-JS library). */ const DOM_STRIPPED_TAGS = new Set(STRIP_TAGS); @@ -91,122 +88,6 @@ interface PageParityResult { error?: string; } -/** - * Known HTML tag names used to distinguish real tags from angle-bracket - * placeholders like <YOUR_API_KEY> or <clusterName> in code examples. - * Only needs to cover tags that appear in node-html-parser's .text output - * (i.e., tags inside <pre> that survive as raw text). - */ -const HTML_TAG_NAMES = new Set([ - 'a', - 'abbr', - 'address', - 'article', - 'aside', - 'audio', - 'b', - 'bdi', - 'bdo', - 'blockquote', - 'body', - 'br', - 'button', - 'canvas', - 'caption', - 'cite', - 'code', - 'col', - 'colgroup', - 'data', - 'dd', - 'del', - 'details', - 'dfn', - 'dialog', - 'div', - 'dl', - 'dt', - 'em', - 'embed', - 'fieldset', - 'figcaption', - 'figure', - 'footer', - 'form', - 'h1', - 'h2', - 'h3', - 'h4', - 'h5', - 'h6', - 'head', - 'header', - 'hr', - 'html', - 'i', - 'iframe', - 'img', - 'input', - 'ins', - 'kbd', - 'label', - 'legend', - 'li', - 'link', - 'main', - 'map', - 'mark', - 'meta', - 'meter', - 'nav', - 'noscript', - 'object', - 'ol', - 'optgroup', - 'option', - 'output', - 'p', - 'param', - 'picture', - 'pre', - 'progress', - 'q', - 'rp', - 'rt', - 'ruby', - 's', - 'samp', - 'script', - 'section', - 'select', - 'slot', - 'small', - 'source', - 'span', - 'strong', - 'style', - 'sub', - 'summary', - 'sup', - 'table', - 'tbody', - 'td', - 'template', - 'textarea', - 'tfoot', - 'th', - 'thead', - 'time', - 'title', - 'tr', - 'track', - 'u', - 'ul', - 'var', - 'video', - 'wbr', -]); - /** Block-level HTML elements that should produce line breaks in extracted text. 
*/ const BLOCK_TAGS = new Set([ 'p', @@ -359,34 +240,67 @@ function extractHtmlText(html: string, parityExclusions?: string[]): HtmlExtract } } - // Insert newlines before block-level elements so .text produces - // separated lines instead of smashing paragraphs together - for (const tag of BLOCK_TAGS) { - for (const el of content.querySelectorAll(tag)) { - el.insertAdjacentHTML('beforebegin', '\n'); - el.insertAdjacentHTML('afterend', '\n'); - } + // Walk the DOM to produce text. Doing this ourselves (instead of relying + // on .text) lets us handle two cases that flat-text + regex stripping + // can't disambiguate: + // + // 1. node-html-parser treats <pre> content as a single raw-text node, so + // syntax-highlighter markup inside (<span class="kw">, <div class="line">, + // <code class="lang-js">) appears as literal text. We re-parse that + // rawText as HTML and walk the resulting subtree, which yields just the + // code's textContent without any markup leaking through. + // + // 2. Inline `<code>` mentions in prose (rendered as <code><code></code> + // from a `\`<code>\`` markdown span) decode to literal `<code>` text. The + // DOM walk preserves that as text; normalize() then strips the angle + // brackets so it matches the markdown side. Previously the text-level + // tag-stripping regex deleted these as if they were tags. + const text = walkContent(content); + return { text, segmentationStripped }; +} + +/** + * Walk a DOM subtree and emit text content with newlines around block + * elements. Used by extractHtmlText. + */ +function walkContent(node: HTMLElement): string { + let out = ''; + for (const child of node.childNodes) { + out += walkNode(child); } + return out; +} - // node-html-parser treats <pre> content as raw text, so <style> tags - // injected inside code blocks (e.g., Emotion CSS-in-JS / Leafygreen) - // survive DOM-level stripping. 
Remove <style>...</style> blocks first, - // inject newlines before <div tags to separate code lines (e.g., - // Expressive Code / Shiki use <div class="ec-line"> inside <pre>), - // then strip HTML tags while preserving angle-bracket placeholders - // like <YOUR_API_KEY> or <clusterName> (decoded from <...> entities). - const text = content.text - .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '') - .replace(/<!--[\s\S]*?-->/g, '') - .replace(/<div[\s>]/gi, '\n<div ') - .replace(/<\/[^>\s]+>/g, '') - .replace(/<([a-zA-Z][a-zA-Z0-9-]*)([^>]*)>/g, (_match, tag, rest) => { - const lower = tag.toLowerCase(); - if (DOM_STRIPPED_TAGS.has(lower)) return tag; - if (HTML_TAG_NAMES.has(lower)) return ''; - return tag + rest; - }); - return { text, segmentationStripped }; +function walkNode(node: Node): string { + if (node.nodeType === NodeType.TEXT_NODE) { + // text getter decodes entities (< -> <, & -> &) + return node.text; + } + if (node.nodeType !== NodeType.ELEMENT_NODE) { + // Skip comments and anything else + return ''; + } + const el = node as HTMLElement; + const tag = el.tagName?.toLowerCase(); + if (!tag) return walkContent(el); + + // Defensive: even though STRIP_TAGS removes these at DOM level above, + // re-parsed <pre> content can re-introduce script/style/etc. as elements, + // so skip them here too. + if (DOM_STRIPPED_TAGS.has(tag)) return ''; + + if (tag === 'pre') { + // node-html-parser parses <pre> content as a single raw text node, so + // any inner markup (syntax-highlighter spans/divs/code) is opaque. + // Re-parse the rawText to expose that markup as DOM nodes, then walk. + const reparsed = parse(el.rawText); + return '\n' + walkContent(reparsed) + '\n'; + } + + if (BLOCK_TAGS.has(tag)) { + return '\n' + walkContent(el) + '\n'; + } + return walkContent(el); } /** @@ -399,6 +313,12 @@ function extractHtmlText(html: string, parityExclusions?: string[]): HtmlExtract * preserves the literal text inside <pre><code> and <code> tags. 
The * placeholder approach hides code content from the stripping regexes, * then restores it after all stripping is done. + * + * Heading lines are also placeholder-protected: a heading like + * "### 1. How well..." has the "1. " stripped by the numbered-list regex + * if processed normally, even though that "1. " is part of the heading + * text on the HTML side. Protecting heading content keeps the bullet/ + * numbered-list passes from touching it. */ function extractMarkdownText(markdown: string): string { let text = markdown; @@ -457,20 +377,44 @@ function extractMarkdownText(markdown: string): string { return `\x00CODE${idx}\x00`; }); - // Step 3: Strip markdown formatting on non-code text + // Step 3: Protect heading lines from list-marker stripping. Headings + // like "### 1. How well are X supported?" survive into the HTML as + // "<h3>1. How well are X supported?</h3>", so the leading "1. " is + // part of the heading text — not a list marker. Without this, the + // numbered-list regex would strip it and the markdown side wouldn't + // contain the HTML segment. + const headings: string[] = []; + text = text.replace(/^#{1,6}\s+(.*)$/gm, (_match, content) => { + const idx = headings.length; + headings.push(content); + return `\x00HEAD${idx}\x00`; + }); + + // Step 4: Strip list markers and setext underlines while heading lines + // are still placeholder-protected. These are the passes that would + // misinterpret heading text — e.g., the numbered-list regex stripping + // "1. " from "### 1. How well..." (issue #91). 
text = text - // Remove heading markers - .replace(/^#{1,6}\s+/gm, '') // Remove setext-style heading underlines .replace(/^[=-]+$/gm, '') - // Remove link/image URLs, keep text: [text](url) → text - .replace(/!?\[([^\]]*)\]\([^)]*\)/g, '$1') // Remove reference-style link definitions .replace(/^\[.*?\]:\s+.*$/gm, '') // Remove list bullets/numbers (before emphasis, so leading * isn't // misinterpreted as an emphasis marker) .replace(/^[\s]*[-*+]\s+/gm, '') - .replace(/^[\s]*\d+\.\s+/gm, '') + .replace(/^[\s]*\d+\.\s+/gm, ''); + + // Step 5: Restore heading text. From here on, heading content is + // processed like any other body text — emphasis, links, etc. inside + // heading text gets the same treatment so it matches the HTML side + // (where <h1><em>Foo</em></h1> renders as "Foo"). + // eslint-disable-next-line no-control-regex + text = text.replace(/\x00HEAD(\d+)\x00/g, (_match, idxStr) => headings[parseInt(idxStr, 10)]); + + // Step 6: Strip remaining markdown formatting on body and heading text. + text = text + // Remove link/image URLs, keep text: [text](url) → text + .replace(/!?\[([^\]]*)\]\([^)]*\)/g, '$1') // Remove emphasis markers. * emphasis is stripped unconditionally. // _ emphasis is stripped only at word boundaries (per CommonMark, // _text_ is emphasis only when _ is not adjacent to an alphanumeric). @@ -483,7 +427,7 @@ function extractMarkdownText(markdown: string): string { // Remove horizontal rules .replace(/^[-*_]{3,}$/gm, ''); - // Step 4: Restore code content (without backticks/fence markers) + // Step 7: Restore code content (without backticks/fence markers). 
// eslint-disable-next-line no-control-regex text = text.replace(/\x00CODE(\d+)\x00/g, (_match, idxStr) => codeSpans[parseInt(idxStr, 10)]); // eslint-disable-next-line no-control-regex diff --git a/test/unit/checks/markdown-content-parity.test.ts b/test/unit/checks/markdown-content-parity.test.ts index 2a690bc..19f9c66 100644 --- a/test/unit/checks/markdown-content-parity.test.ts +++ b/test/unit/checks/markdown-content-parity.test.ts @@ -2151,6 +2151,127 @@ The greet helper is also re-exported from the package index for convenience.`; expect(pageResults[0].missingSegments).toBe(0); }); + it('preserves inline `<tag>` code spans on the HTML side (issue #90)', async () => { + // Markdown like `<code>` (a code span whose content looks like an HTML + // tag) renders to <code><code></code>. The DOM walker emits the + // entity-decoded text "<code>" literally; normalize() then strips angle + // brackets so both sides produce "code". Previously the text-level + // tag-stripping regex deleted these as if they were tags. 
+ const html = `<html><body><main> + <h1>Code Examples in Prose</h1> + <p>This type of code example should not be rendered using the HTML <code><code></code> tag in any context.</p> + <p>The <code><main></code> element is for the main content area of the page document body.</p> + <p>Use the <code><title></code> tag inside <code><head></code> to set the page title for browsers.</p> + <p>The <code><h1></code> element should appear once per page as the top-level heading content.</p> + <p>The <code><link></code> element pulls in stylesheets and other external resources for rendering.</p> + <p>Anchor elements use <code><a></code> with an href attribute pointing to the destination url.</p> + <p>The <code><nav></code> element wraps navigation menus that link to other pages on the site.</p> + <p>For inline emphasis use <code><em></code> rather than the older italic element from the past.</p> + <p>The <code><article></code> element groups self-contained pieces of content like blog posts.</p> + <p>Use <code><section></code> for thematic groupings of content within a page or document body.</p> + </main></body></html>`; + + const markdown = `# Code Examples in Prose + +This type of code example should not be rendered using the HTML \`<code>\` tag in any context. + +The \`<main>\` element is for the main content area of the page document body. + +Use the \`<title>\` tag inside \`<head>\` to set the page title for browsers. + +The \`<h1>\` element should appear once per page as the top-level heading content. + +The \`<link>\` element pulls in stylesheets and other external resources for rendering. + +Anchor elements use \`<a>\` with an href attribute pointing to the destination url. + +The \`<nav>\` element wraps navigation menus that link to other pages on the site. + +For inline emphasis use \`<em>\` rather than the older italic element from the past. + +The \`<article>\` element groups self-contained pieces of content like blog posts. 
+ +Use \`<section>\` for thematic groupings of content within a page or document body.`; + + const url = 'http://mcp-tag-spans.local/docs/code-examples'; + server.use( + http.get( + url, + () => + new HttpResponse(html, { + status: 200, + headers: { 'Content-Type': 'text/html' }, + }), + ), + ); + + const ctx = makeCtx([{ url, markdown, htmlBody: html }], 'mcp-tag-spans.local'); + const result = await check.run(ctx); + expect(result.status).toBe('pass'); + const pageResults = result.details?.pageResults as Array<{ missingSegments: number }>; + expect(pageResults[0].missingSegments).toBe(0); + }); + + it('preserves leading numbers in headings (issue #91)', async () => { + // Headings like "### 1. How well..." render to "<h3>1. How well...</h3>", + // where "1. " is part of the heading text, not a list marker. The + // markdown extractor must not run the numbered-list regex against + // heading content or the segments won't match. + const html = `<html><body><main> + <h1>Audit Conclusions</h1> + <p>This document summarizes findings across four numbered research questions about agent docs.</p> + <h3>1. How well are key programming languages supported by code examples?</h3> + <p>Most documentation covers JavaScript and Python well but lags on Go and Rust ecosystem support.</p> + <h3>2. Are authentication flows documented for both web and machine-to-machine clients?</h3> + <p>Web flows are well documented but service-to-service flows often skip the credential rotation steps.</p> + <h3>3. Is the rate-limiting behavior described in enough detail to handle gracefully?</h3> + <p>Most APIs document headers and quotas but few cover the retry-after semantics in detail.</p> + <h3>4. 
Are versioning and deprecation policies clearly communicated to integrators?</h3> + <p>Versioning is consistent across the platforms surveyed but deprecation timelines are often vague.</p> + <p>The above questions framed the audit and inform the recommendations in the next section.</p> + </main></body></html>`; + + const markdown = `# Audit Conclusions + +This document summarizes findings across four numbered research questions about agent docs. + +### 1. How well are key programming languages supported by code examples? + +Most documentation covers JavaScript and Python well but lags on Go and Rust ecosystem support. + +### 2. Are authentication flows documented for both web and machine-to-machine clients? + +Web flows are well documented but service-to-service flows often skip the credential rotation steps. + +### 3. Is the rate-limiting behavior described in enough detail to handle gracefully? + +Most APIs document headers and quotas but few cover the retry-after semantics in detail. + +### 4. Are versioning and deprecation policies clearly communicated to integrators? + +Versioning is consistent across the platforms surveyed but deprecation timelines are often vague. 
+ +The above questions framed the audit and inform the recommendations in the next section.`; + + const url = 'http://mcp-numbered-headings.local/blog/audit'; + server.use( + http.get( + url, + () => + new HttpResponse(html, { + status: 200, + headers: { 'Content-Type': 'text/html' }, + }), + ), + ); + + const ctx = makeCtx([{ url, markdown, htmlBody: html }], 'mcp-numbered-headings.local'); + const result = await check.run(ctx); + expect(result.status).toBe('pass'); + const pageResults = result.details?.pageResults as Array<{ missingSegments: number }>; + expect(pageResults[0].missingSegments).toBe(0); + }); + it('uses configurable thresholds (0/0 = informational mode)', async () => { // Setting both thresholds to 0 means the check always passes, // making it informational for sites with intentional content divergence.