// Helpers from the patch "create internal links & improved sanitizeUrl for
// non-english characters" (commit d5dd0389071fec4cbb60889c2f7a8d2a2f72811e,
// Hannes Etzelstorfer, 2024-12-08).
//
// Integration in src/astronot.js, as shown by the patch:
//   import { convertInternalLinks } from './helpers/links.mjs';
//   ...
//   let { parent: mdString } = n2m.toMarkdownString(mdblocks);
//   const estimatedReadingTime = readingTime(mdString || '').text;
//   mdString = convertInternalLinks(mdString, pages);

// ---------------------------------------------------------------------------
// src/helpers/links.mjs
// ---------------------------------------------------------------------------

// One markdown link: group 1 is the link text, group 2 is the link target.
// Capturing both in a single pattern keeps text and target paired even when
// the text itself contains parentheses or brackets.
const MARKDOWN_LINK = /\[(.*?)\]\((.*?)\)/g;

// Some converted links arrive with this literal text instead of a real title;
// in that case the target page's own title is used.
const PLACEHOLDER_TITLE = 'link_to_page';

/**
 * Derive the dashed page id (XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX) that a link
 * target refers to.
 *
 * @param {string} target - the raw link target taken from the markdown
 * @returns {string|null} the dashed id, or null when the target has neither
 *   of the two recognised shapes
 */
function toPageId(target) {
  // Inline links: a leading "/" followed by 32 characters without dashes.
  if (target.startsWith('/') && target.length === 33) {
    const raw = target.slice(1);
    return [
      raw.slice(0, 8),
      raw.slice(8, 12),
      raw.slice(12, 16),
      raw.slice(16, 20),
      raw.slice(20),
    ].join('-');
  }
  // Block links: already dashed, no leading "/".
  if (target.length === 36) {
    return target;
  }
  return null;
}

/**
 * Convert internal links in markdown to the matching page URL.
 *
 * Detects links like [link text](/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX) and
 * [link text](XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX). When the id matches a
 * page in `pages`, the link is rewritten to `[link text](/posts/<page.slug>)`.
 * External links (targets starting with "http") and links whose id matches no
 * page are left untouched.
 *
 * @param {string|null|undefined} mdString - markdown to process; empty or
 *   missing input is returned unchanged
 * @param {Array<{id: string, title: string, slug: string}>} pages - known pages
 * @returns {string|null|undefined} the markdown with internal links rewritten
 */
export function convertInternalLinks(mdString, pages) {
  // The caller may hand over an empty/undefined markdown string.
  if (!mdString) {
    return mdString;
  }

  // Index pages by id once; the first page wins on duplicate ids, which
  // matches a first-match linear search.
  const pagesById = new Map();
  for (const page of pages ?? []) {
    if (!pagesById.has(page.id)) {
      pagesById.set(page.id, page);
    }
  }

  // A replacer function is used on purpose: its return value is inserted
  // verbatim, so "$&", "$1" etc. inside a title are not expanded.
  return mdString.replace(MARKDOWN_LINK, (link, text, target) => {
    // Skip external links.
    if (target.startsWith('http')) {
      return link;
    }

    const pageId = toPageId(target);
    const page = pageId === null ? undefined : pagesById.get(pageId);
    if (!page) {
      return link;
    }

    const title = text === PLACEHOLDER_TITLE ? page.title : text;
    return `[${title}](/posts/${page.slug})`;
  });
}

// ---------------------------------------------------------------------------
// src/helpers/sanitize.mjs
// ---------------------------------------------------------------------------

// German characters that are transliterated to two letters rather than being
// reduced to their base letter.
const GERMAN_TRANSLITERATIONS = Object.freeze({
  ä: 'ae',
  ö: 'oe',
  ü: 'ue',
  ß: 'ss',
});

/**
 * Turn an arbitrary string into a lowercase, hyphen-separated URL slug.
 *
 * German umlauts and sharp s are transliterated (ä→ae, ö→oe, ü→ue, ß→ss);
 * other accented letters are reduced to their base letter; every run of
 * remaining non-alphanumeric characters becomes a single hyphen.
 *
 * @param {string} str - text to convert
 * @returns {string} the slug, without leading or trailing hyphens
 */
export function sanitizeUrl(str) {
  return str
    .normalize('NFC') // compose first so decomposed umlauts (u + U+0308) are recognised too
    .toLowerCase() // convert to lowercase
    // Must run BEFORE diacritics are stripped, otherwise "ü" has already become "u".
    .replace(/[äöüß]/g, (ch) => GERMAN_TRANSLITERATIONS[ch])
    .normalize('NFD') // decompose remaining graphemes into base characters and diacritical marks
    .replace(/[\u0300-\u036f]/g, '') // remove diacritical marks
    .replace(/[^a-z0-9]+/g, '-') // replace non-alphanumeric characters with hyphens
    .replace(/(^-|-$)+/g, ''); // remove leading/trailing hyphens
}