From b680f661095d374478832d7cf3a0a6bddc6d7841 Mon Sep 17 00:00:00 2001
From: dacharyc <dc@dacharycarey.com>
Date: Sun, 3 May 2026 22:33:10 -0400
Subject: [PATCH] fix(parity): DOM-aware HTML extraction and heading-line
 protection

Replaces the flat-text + regex pipeline in extractHtmlText with a DOM
walker that re-parses <pre> rawText to expose syntax-highlighter markup
as DOM nodes. This eliminates the inline `<code>` / `<main>` / `<title>`
ambiguity that issue #90 reported: tag mentions in prose now flow
through as literal text instead of being deleted by the tag-stripping
regex. The HTML_TAG_NAMES set is no longer needed.

Adds heading-line placeholder protection in extractMarkdownText: heading
text is hidden before the list-marker passes and restored afterwards,
so the leading "1. " in numbered headings like "### 1. How well..." is
preserved instead of being stripped as a numbered-list marker
(issue #91).

Validated against 20 doc sites from PARITY-CHECK-NOTES.md: 3 sites
improved (mongodb 8% to 2%, resend 4% to 1%, posthog warn to pass), 0
regressed. Issue repro page (audit-conclusions) goes from 6 missing
segments / warn to 0 missing / pass.

Closes #90, closes #91
---
 .../observability/markdown-content-parity.ts  | 256 +++++++-----------
 .../checks/markdown-content-parity.test.ts    | 121 +++++++++
 2 files changed, 221 insertions(+), 156 deletions(-)
diff --git a/src/checks/observability/markdown-content-parity.ts b/src/checks/observability/markdown-content-parity.ts
index 475bab2..2be21c7 100644
--- a/src/checks/observability/markdown-content-parity.ts
+++ b/src/checks/observability/markdown-content-parity.ts
@@ -1,4 +1,4 @@
-import { parse } from 'node-html-parser';
+import { parse, NodeType, type HTMLElement, type Node } from 'node-html-parser';
 import { registerCheck } from '../registry.js';
 import { fetchPage } from '../../helpers/fetch-page.js';
 import { toHtmlUrl } from '../../helpers/to-md-urls.js';
@@ -32,12 +32,9 @@ const STRIP_TAGS = [
 ];
 
 /**
- * Tags that were removed at the DOM level (STRIP_TAGS). If these tag names
- * appear in `.text` output, they came from entity-decoded content (e.g.,
- * `&lt;nav&gt;` → `<nav>` in prose discussing HTML elements), not from
- * actual DOM elements. The text-level tag stripping regex should keep their
- * content rather than deleting it, so both sides produce matching text
- * after normalize() strips the angle brackets.
+ * Tag names corresponding to STRIP_TAGS, used by the DOM walker to skip
+ * these elements if they reappear inside re-parsed <pre> content (e.g.,
+ * a stray <style> block injected by a CSS-in-JS library).
  */
 const DOM_STRIPPED_TAGS = new Set(STRIP_TAGS);
 
@@ -91,122 +88,6 @@ interface PageParityResult {
   error?: string;
 }
 
-/**
- * Known HTML tag names used to distinguish real tags from angle-bracket
- * placeholders like <YOUR_API_KEY> or <clusterName> in code examples.
- * Only needs to cover tags that appear in node-html-parser's .text output
- * (i.e., tags inside <pre> that survive as raw text).
- */
-const HTML_TAG_NAMES = new Set([
-  'a',
-  'abbr',
-  'address',
-  'article',
-  'aside',
-  'audio',
-  'b',
-  'bdi',
-  'bdo',
-  'blockquote',
-  'body',
-  'br',
-  'button',
-  'canvas',
-  'caption',
-  'cite',
-  'code',
-  'col',
-  'colgroup',
-  'data',
-  'dd',
-  'del',
-  'details',
-  'dfn',
-  'dialog',
-  'div',
-  'dl',
-  'dt',
-  'em',
-  'embed',
-  'fieldset',
-  'figcaption',
-  'figure',
-  'footer',
-  'form',
-  'h1',
-  'h2',
-  'h3',
-  'h4',
-  'h5',
-  'h6',
-  'head',
-  'header',
-  'hr',
-  'html',
-  'i',
-  'iframe',
-  'img',
-  'input',
-  'ins',
-  'kbd',
-  'label',
-  'legend',
-  'li',
-  'link',
-  'main',
-  'map',
-  'mark',
-  'meta',
-  'meter',
-  'nav',
-  'noscript',
-  'object',
-  'ol',
-  'optgroup',
-  'option',
-  'output',
-  'p',
-  'param',
-  'picture',
-  'pre',
-  'progress',
-  'q',
-  'rp',
-  'rt',
-  'ruby',
-  's',
-  'samp',
-  'script',
-  'section',
-  'select',
-  'slot',
-  'small',
-  'source',
-  'span',
-  'strong',
-  'style',
-  'sub',
-  'summary',
-  'sup',
-  'table',
-  'tbody',
-  'td',
-  'template',
-  'textarea',
-  'tfoot',
-  'th',
-  'thead',
-  'time',
-  'title',
-  'tr',
-  'track',
-  'u',
-  'ul',
-  'var',
-  'video',
-  'wbr',
-]);
-
 /** Block-level HTML elements that should produce line breaks in extracted text. */
 const BLOCK_TAGS = new Set([
   'p',
@@ -359,34 +240,67 @@ function extractHtmlText(html: string, parityExclusions?: string[]): HtmlExtract
     }
   }
 
-  // Insert newlines before block-level elements so .text produces
-  // separated lines instead of smashing paragraphs together
-  for (const tag of BLOCK_TAGS) {
-    for (const el of content.querySelectorAll(tag)) {
-      el.insertAdjacentHTML('beforebegin', '\n');
-      el.insertAdjacentHTML('afterend', '\n');
-    }
+  // Walk the DOM to produce text. Doing this ourselves (instead of relying
+  // on .text) lets us handle two cases that flat-text + regex stripping
+  // can't disambiguate:
+  //
+  // 1. node-html-parser treats <pre> content as a single raw-text node, so
+  //    syntax-highlighter markup inside (<span class="kw">, <div class="line">,
+  //    <code class="lang-js">) appears as literal text. We re-parse that
+  //    rawText as HTML and walk the resulting subtree, which yields just the
+  //    code's textContent without any markup leaking through.
+  //
+  // 2. Inline `<code>` mentions in prose (rendered as <code>&lt;code&gt;</code>
+  //    from a `\`<code>\`` markdown span) decode to literal `<code>` text. The
+  //    DOM walk preserves that as text; normalize() then strips the angle
+  //    brackets so it matches the markdown side. Previously the text-level
+  //    tag-stripping regex deleted these as if they were tags.
+  const text = walkContent(content);
+  return { text, segmentationStripped };
+}
+
+/**
+ * Walk a DOM subtree and emit text content with newlines around block
+ * elements. Used by extractHtmlText.
+ */
+function walkContent(node: HTMLElement): string {
+  let out = '';
+  for (const child of node.childNodes) {
+    out += walkNode(child);
   }
+  return out;
+}
 
-  // node-html-parser treats <pre> content as raw text, so <style> tags
-  // injected inside code blocks (e.g., Emotion CSS-in-JS / Leafygreen)
-  // survive DOM-level stripping. Remove <style>...</style> blocks first,
-  // inject newlines before <div tags to separate code lines (e.g.,
-  // Expressive Code / Shiki use <div class="ec-line"> inside <pre>),
-  // then strip HTML tags while preserving angle-bracket placeholders
-  // like <YOUR_API_KEY> or <clusterName> (decoded from &lt;...&gt; entities).
-  const text = content.text
-    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
-    .replace(/<!--[\s\S]*?-->/g, '')
-    .replace(/<div[\s>]/gi, '\n<div ')
-    .replace(/<\/[^>\s]+>/g, '')
-    .replace(/<([a-zA-Z][a-zA-Z0-9-]*)([^>]*)>/g, (_match, tag, rest) => {
-      const lower = tag.toLowerCase();
-      if (DOM_STRIPPED_TAGS.has(lower)) return tag;
-      if (HTML_TAG_NAMES.has(lower)) return '';
-      return tag + rest;
-    });
-  return { text, segmentationStripped };
+function walkNode(node: Node): string {
+  if (node.nodeType === NodeType.TEXT_NODE) {
+    // text getter decodes entities (&lt; -> <, &amp; -> &)
+    return node.text;
+  }
+  if (node.nodeType !== NodeType.ELEMENT_NODE) {
+    // Skip comments and anything else
+    return '';
+  }
+  const el = node as HTMLElement;
+  const tag = el.tagName?.toLowerCase();
+  if (!tag) return walkContent(el);
+
+  // Defensive: even though STRIP_TAGS removes these at DOM level above,
+  // re-parsed <pre> content can re-introduce script/style/etc. as elements,
+  // so skip them here too.
+  if (DOM_STRIPPED_TAGS.has(tag)) return '';
+
+  if (tag === 'pre') {
+    // node-html-parser parses <pre> content as a single raw text node, so
+    // any inner markup (syntax-highlighter spans/divs/code) is opaque.
+    // Re-parse the rawText to expose that markup as DOM nodes, then walk.
+    const reparsed = parse(el.rawText);
+    return '\n' + walkContent(reparsed) + '\n';
+  }
+
+  if (BLOCK_TAGS.has(tag)) {
+    return '\n' + walkContent(el) + '\n';
+  }
+  return walkContent(el);
 }
 
 /**
@@ -399,6 +313,12 @@ function extractHtmlText(html: string, parityExclusions?: string[]): HtmlExtract
  * preserves the literal text inside <pre><code> and <code> tags. The
  * placeholder approach hides code content from the stripping regexes,
  * then restores it after all stripping is done.
+ *
+ * Heading lines are also placeholder-protected: a heading like
+ * "### 1. How well..." has the "1. " stripped by the numbered-list regex
+ * if processed normally, even though that "1. " is part of the heading
+ * text on the HTML side. Protecting heading content keeps the bullet/
+ * numbered-list passes from touching it.
  */
 function extractMarkdownText(markdown: string): string {
   let text = markdown;
@@ -457,20 +377,44 @@ function extractMarkdownText(markdown: string): string {
     return `\x00CODE${idx}\x00`;
   });
 
-  // Step 3: Strip markdown formatting on non-code text
+  // Step 3: Protect heading lines from list-marker stripping. Headings
+  // like "### 1. How well are X supported?" survive into the HTML as
+  // "<h3>1. How well are X supported?</h3>", so the leading "1. " is
+  // part of the heading text — not a list marker. Without this, the
+  // numbered-list regex would strip it and the markdown side wouldn't
+  // contain the HTML segment.
+  const headings: string[] = [];
+  text = text.replace(/^#{1,6}\s+(.*)$/gm, (_match, content) => {
+    const idx = headings.length;
+    headings.push(content);
+    return `\x00HEAD${idx}\x00`;
+  });
+
+  // Step 4: Strip list markers and setext underlines while heading lines
+  // are still placeholder-protected. These are the passes that would
+  // misinterpret heading text — e.g., the numbered-list regex stripping
+  // "1. " from "### 1. How well..." (issue #91).
   text = text
-    // Remove heading markers
-    .replace(/^#{1,6}\s+/gm, '')
     // Remove setext-style heading underlines
     .replace(/^[=-]+$/gm, '')
-    // Remove link/image URLs, keep text: [text](url) → text
-    .replace(/!?\[([^\]]*)\]\([^)]*\)/g, '$1')
     // Remove reference-style link definitions
     .replace(/^\[.*?\]:\s+.*$/gm, '')
     // Remove list bullets/numbers (before emphasis, so leading * isn't
     // misinterpreted as an emphasis marker)
     .replace(/^[\s]*[-*+]\s+/gm, '')
-    .replace(/^[\s]*\d+\.\s+/gm, '')
+    .replace(/^[\s]*\d+\.\s+/gm, '');
+
+  // Step 5: Restore heading text. From here on, heading content is
+  // processed like any other body text — emphasis, links, etc. inside
+  // heading text gets the same treatment so it matches the HTML side
+  // (where <h1><em>Foo</em></h1> renders as "Foo").
+  // eslint-disable-next-line no-control-regex
+  text = text.replace(/\x00HEAD(\d+)\x00/g, (_match, idxStr) => headings[parseInt(idxStr, 10)]);
+
+  // Step 6: Strip remaining markdown formatting on body and heading text.
+  text = text
+    // Remove link/image URLs, keep text: [text](url) → text
+    .replace(/!?\[([^\]]*)\]\([^)]*\)/g, '$1')
     // Remove emphasis markers. * emphasis is stripped unconditionally.
     // _ emphasis is stripped only at word boundaries (per CommonMark,
     // _text_ is emphasis only when _ is not adjacent to an alphanumeric).
@@ -483,7 +427,7 @@ function extractMarkdownText(markdown: string): string {
     // Remove horizontal rules
     .replace(/^[-*_]{3,}$/gm, '');
 
-  // Step 4: Restore code content (without backticks/fence markers)
+  // Step 7: Restore code content (without backticks/fence markers).
   // eslint-disable-next-line no-control-regex
   text = text.replace(/\x00CODE(\d+)\x00/g, (_match, idxStr) => codeSpans[parseInt(idxStr, 10)]);
   // eslint-disable-next-line no-control-regex
diff --git a/test/unit/checks/markdown-content-parity.test.ts b/test/unit/checks/markdown-content-parity.test.ts
index 2a690bc..19f9c66 100644
--- a/test/unit/checks/markdown-content-parity.test.ts
+++ b/test/unit/checks/markdown-content-parity.test.ts
@@ -2151,6 +2151,127 @@ The greet helper is also re-exported from the package index for convenience.`;
     expect(pageResults[0].missingSegments).toBe(0);
   });
 
+  it('preserves inline `<tag>` code spans on the HTML side (issue #90)', async () => {
+    // Markdown like `<code>` (a code span whose content looks like an HTML
+    // tag) renders to <code>&lt;code&gt;</code>. The DOM walker emits the
+    // entity-decoded text "<code>" literally; normalize() then strips angle
+    // brackets so both sides produce "code". Previously the text-level
+    // tag-stripping regex deleted these as if they were tags.
+    const html = `<html><body><main>
+      <h1>Code Examples in Prose</h1>
+      <p>This type of code example should not be rendered using the HTML <code>&lt;code&gt;</code> tag in any context.</p>
+      <p>The <code>&lt;main&gt;</code> element is for the main content area of the page document body.</p>
+      <p>Use the <code>&lt;title&gt;</code> tag inside <code>&lt;head&gt;</code> to set the page title for browsers.</p>
+      <p>The <code>&lt;h1&gt;</code> element should appear once per page as the top-level heading content.</p>
+      <p>The <code>&lt;link&gt;</code> element pulls in stylesheets and other external resources for rendering.</p>
+      <p>Anchor elements use <code>&lt;a&gt;</code> with an href attribute pointing to the destination url.</p>
+      <p>The <code>&lt;nav&gt;</code> element wraps navigation menus that link to other pages on the site.</p>
+      <p>For inline emphasis use <code>&lt;em&gt;</code> rather than the older italic element from the past.</p>
+      <p>The <code>&lt;article&gt;</code> element groups self-contained pieces of content like blog posts.</p>
+      <p>Use <code>&lt;section&gt;</code> for thematic groupings of content within a page or document body.</p>
+    </main></body></html>`;
+
+    const markdown = `# Code Examples in Prose
+
+This type of code example should not be rendered using the HTML \`<code>\` tag in any context.
+
+The \`<main>\` element is for the main content area of the page document body.
+
+Use the \`<title>\` tag inside \`<head>\` to set the page title for browsers.
+
+The \`<h1>\` element should appear once per page as the top-level heading content.
+
+The \`<link>\` element pulls in stylesheets and other external resources for rendering.
+
+Anchor elements use \`<a>\` with an href attribute pointing to the destination url.
+
+The \`<nav>\` element wraps navigation menus that link to other pages on the site.
+
+For inline emphasis use \`<em>\` rather than the older italic element from the past.
+
+The \`<article>\` element groups self-contained pieces of content like blog posts.
+
+Use \`<section>\` for thematic groupings of content within a page or document body.`;
+
+    const url = 'http://mcp-tag-spans.local/docs/code-examples';
+    server.use(
+      http.get(
+        url,
+        () =>
+          new HttpResponse(html, {
+            status: 200,
+            headers: { 'Content-Type': 'text/html' },
+          }),
+      ),
+    );
+
+    const ctx = makeCtx([{ url, markdown, htmlBody: html }], 'mcp-tag-spans.local');
+    const result = await check.run(ctx);
+    expect(result.status).toBe('pass');
+    const pageResults = result.details?.pageResults as Array<{ missingSegments: number }>;
+    expect(pageResults[0].missingSegments).toBe(0);
+  });
+
+  it('preserves leading numbers in headings (issue #91)', async () => {
+    // Headings like "### 1. How well..." render to "<h3>1. How well...</h3>",
+    // where "1. " is part of the heading text, not a list marker. The
+    // markdown extractor must not run the numbered-list regex against
+    // heading content or the segments won't match.
+    const html = `<html><body><main>
+      <h1>Audit Conclusions</h1>
+      <p>This document summarizes findings across four numbered research questions about agent docs.</p>
+      <h3>1. How well are key programming languages supported by code examples?</h3>
+      <p>Most documentation covers JavaScript and Python well but lags on Go and Rust ecosystem support.</p>
+      <h3>2. Are authentication flows documented for both web and machine-to-machine clients?</h3>
+      <p>Web flows are well documented but service-to-service flows often skip the credential rotation steps.</p>
+      <h3>3. Is the rate-limiting behavior described in enough detail to handle gracefully?</h3>
+      <p>Most APIs document headers and quotas but few cover the retry-after semantics in detail.</p>
+      <h3>4. Are versioning and deprecation policies clearly communicated to integrators?</h3>
+      <p>Versioning is consistent across the platforms surveyed but deprecation timelines are often vague.</p>
+      <p>The above questions framed the audit and inform the recommendations in the next section.</p>
+    </main></body></html>`;
+
+    const markdown = `# Audit Conclusions
+
+This document summarizes findings across four numbered research questions about agent docs.
+
+### 1. How well are key programming languages supported by code examples?
+
+Most documentation covers JavaScript and Python well but lags on Go and Rust ecosystem support.
+
+### 2. Are authentication flows documented for both web and machine-to-machine clients?
+
+Web flows are well documented but service-to-service flows often skip the credential rotation steps.
+
+### 3. Is the rate-limiting behavior described in enough detail to handle gracefully?
+
+Most APIs document headers and quotas but few cover the retry-after semantics in detail.
+
+### 4. Are versioning and deprecation policies clearly communicated to integrators?
+
+Versioning is consistent across the platforms surveyed but deprecation timelines are often vague.
+
+The above questions framed the audit and inform the recommendations in the next section.`;
+
+    const url = 'http://mcp-numbered-headings.local/blog/audit';
+    server.use(
+      http.get(
+        url,
+        () =>
+          new HttpResponse(html, {
+            status: 200,
+            headers: { 'Content-Type': 'text/html' },
+          }),
+      ),
+    );
+
+    const ctx = makeCtx([{ url, markdown, htmlBody: html }], 'mcp-numbered-headings.local');
+    const result = await check.run(ctx);
+    expect(result.status).toBe('pass');
+    const pageResults = result.details?.pageResults as Array<{ missingSegments: number }>;
+    expect(pageResults[0].missingSegments).toBe(0);
+  });
+
   it('uses configurable thresholds (0/0 = informational mode)', async () => {
     // Setting both thresholds to 0 means the check always passes,
     // making it informational for sites with intentional content divergence.