diff --git a/netlify/edge-functions/markdown-negotiation.ts b/netlify/edge-functions/markdown-negotiation.ts index 28f388b52..98754819e 100644 --- a/netlify/edge-functions/markdown-negotiation.ts +++ b/netlify/edge-functions/markdown-negotiation.ts @@ -6,7 +6,6 @@ const NOISE_SELECTORS = [ "style", "noscript", "template", - "svg", "header", "footer", "nav", @@ -127,8 +126,7 @@ function htmlToMarkdownFallback(html: string, baseUrl: URL): string { .replace(//g, "") .replace(/]*>[\s\S]*?<\/script>/gi, "") .replace(/]*>[\s\S]*?<\/style>/gi, "") - .replace(/]*>[\s\S]*?<\/noscript>/gi, "") - .replace(/]*>[\s\S]*?<\/svg>/gi, ""); + .replace(/]*>[\s\S]*?<\/noscript>/gi, ""); const titleMatch = sanitized.match(/]*>([\s\S]*?)<\/title>/i); const title = titleMatch ? normalizeWhitespace(decodeHtmlEntities(stripTags(titleMatch[1]))) : ""; @@ -469,11 +467,13 @@ function selectContentRoot(doc: { querySelector: (selector: string) => any; body const preferredSelectors = [ "#main article", "main article", + ".main article", "article.guide", "article", "#main", "[role='main']", "main", + ".main", "body", ]; @@ -496,6 +496,11 @@ function extractPrimaryHtmlFragment(html: string): string { return mainHtml; } + const guideArticleMatch = html.match(/]*class=("')[^"']*\bguide\b[^"']*\1[^>]*>([\s\S]*?)<\/article>/i); + if (guideArticleMatch) { + return guideArticleMatch[2]; + } + const articleMatch = html.match(/]*>([\s\S]*?)<\/article>/i); if (articleMatch) { return articleMatch[1];