jsonMartin · haet · Dec 8, 2024 · coderabbitai · Dec 8, 2024 · coderabbitai
diff --git a/src/astronot.js b/src/astronot.js
@@ -6,6 +6,7 @@ import { config } from 'dotenv';
 import { parseArgs } from 'node:util';
 import { sanitizeUrl, sanitizeImageString } from './helpers/sanitize.mjs';
 import { hashString, downloadImage } from './helpers/images.mjs';
+import { convertInternalLinks } from './helpers/links.mjs';
 import { delay } from './helpers/delay.mjs';
 
 // Input Arguments
@@ -123,10 +124,12 @@ const pages = results.map((page) => {
 for (let page of pages) {
   console.info("Fetching from Notion & Converting to Markdown: ", `${page.title} [${page.id}]`);
   const mdblocks = await n2m.pageToMarkdown(page.id);
-  const { parent: mdString } = n2m.toMarkdownString(mdblocks);
+  let { parent: mdString } = n2m.toMarkdownString(mdblocks);
 
   const estimatedReadingTime = readingTime(mdString || '').text;
 
+  mdString = convertInternalLinks(mdString, pages);
-  let { parent: mdString } = n2m.toMarkdownString(mdblocks);
-
-  const estimatedReadingTime = readingTime(mdString || '').text;
-
-  mdString = convertInternalLinks(mdString, pages);
+  let { parent: mdString } = n2m.toMarkdownString(mdblocks);
+
+  try {
+    mdString = convertInternalLinks(mdString, pages);
+  } catch (error) {
+    console.error(`Error converting internal links for page ${page.id}:`, error);
+    // Continue with original mdString
+  }
+
+  const estimatedReadingTime = readingTime(mdString || '').text;
-  let { parent: mdString } = n2m.toMarkdownString(mdblocks);
-
-  const estimatedReadingTime = readingTime(mdString || '').text;
-
-  mdString = convertInternalLinks(mdString, pages);
+  let { parent: mdString } = n2m.toMarkdownString(mdblocks);
+
+  try {
+    mdString = convertInternalLinks(mdString, pages);
+  } catch (error) {
+    console.error(`Error converting internal links for page ${page.id}:`, error);
+    // Continue with original mdString
+  }
+
+  const estimatedReadingTime = readingTime(mdString || '').text;
+
   // Download Cover Image
   const coverFileName = page.cover ? await downloadImage(page.cover, { isCover: true }) : '';
   if (coverFileName) console.info("Cover image downloaded:", coverFileName)

diff --git a/src/helpers/links.mjs b/src/helpers/links.mjs
@@ -0,0 +1,51 @@
+/**
+ * Rewrite internal page links in a markdown string so that they point at /posts/<slug>.
+ *
+ * Detects links like [link text](/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX) and [link text](XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX).
+ * If the link target matches a page id in the pages array, the link is replaced with that page's URL; other links are left unchanged.
+ * @param {string} mdString - markdown text to scan for links
+ * @param {Array<object>} pages - page objects; only their id, title and slug properties are read here
+ * @returns {string} the markdown with matched internal links rewritten
+ */
+export function convertInternalLinks(mdString, pages) {
+    // find all link URLs in the markdown
+    const links = mdString.match(/\[.*?\]\(.*?\)/g);
+    if (!links) {
+        return mdString;
+    }
-    const links = mdString.match(/\[.*?\]\(.*?\)/g);
-    if (!links) {
-        return mdString;
-    }
+    if (!mdString || typeof mdString !== 'string') { // NOTE(review): `!mdString` also rejects the empty string - confirm callers never pass an empty page body
+        throw new Error('mdString must be a non-empty string');
+    }
+    if (!Array.isArray(pages)) {
+        throw new Error('pages must be an array');
+    }
+
+    // find all markdown links; this pattern requires a non-empty target inside (...)
+    const links = mdString.match(/\[([^\]]*)\]\(([^)]+)\)/g);
+    if (!links) {
+        return mdString;
+    }
-    const links = mdString.match(/\[.*?\]\(.*?\)/g);
-    if (!links) {
-        return mdString;
-    }
+    if (!mdString || typeof mdString !== 'string') { // NOTE(review): `!mdString` also rejects the empty string - confirm callers never pass an empty page body
+        throw new Error('mdString must be a non-empty string');
+    }
+    if (!Array.isArray(pages)) {
+        throw new Error('pages must be an array');
+    }
+
+    // find all markdown links; this pattern requires a non-empty target inside (...)
+    const links = mdString.match(/\[([^\]]*)\]\(([^)]+)\)/g);
+    if (!links) {
+        return mdString;
+    }
+    links.forEach(link => {
+        // find the link text (used as the page title) in the link
+        let title = link.match(/\[(.*?)\]/)[1];
+
+        // find the link target (the page id for internal links) in the link
+        const slug = link.match(/\((.*?)\)/)[1];
+        let targetPageId = null;
+
+        // skip external links
+        if (slug.startsWith("http")) {
+            return;
+        }
+
+        // inline links start with "/" and do not have "-" in the slug
+        if (slug.startsWith("/") && slug.length === 33) {
+            // remove the leading "/" from the slug and add "-" like XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX
+            targetPageId = slug.substring(1, 9) + "-" + slug.substring(9, 13) + "-" + slug.substring(13, 17) + "-" + slug.substring(17, 21) + "-" + slug.substring(21);
+        }
+        // block links are like XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX without the leading "/"
+        else if (slug.length === 36) {
+            targetPageId = slug;
+        }
-        if (slug.startsWith("/") && slug.length === 33) {
-            // remove the leading "/" from the slug and add "-" like XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX
-            targetPageId = slug.substring(1, 9) + "-" + slug.substring(9, 13) + "-" + slug.substring(13, 17) + "-" + slug.substring(17, 21) + "-" + slug.substring(21);
-        }
-        // block links are like XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX without the leading "/"
-        else if (slug.length === 36) {
-            targetPageId = slug;
-        }
+        const uuidPattern = /^(?:\/)?([0-9a-f]{8})([0-9a-f]{4})([0-9a-f]{4})([0-9a-f]{4})([0-9a-f]{12})$/i; // NOTE(review): accepts only 32 hex digits without hyphens, so the hyphenated 36-char ids handled by the removed branch no longer match - confirm
+        const match = slug.match(uuidPattern);
+        if (match) {
+            targetPageId = match.slice(1).join('-');
+        }
-        if (slug.startsWith("/") && slug.length === 33) {
-            // remove the leading "/" from the slug and add "-" like XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX
-            targetPageId = slug.substring(1, 9) + "-" + slug.substring(9, 13) + "-" + slug.substring(13, 17) + "-" + slug.substring(17, 21) + "-" + slug.substring(21);
-        }
-        // block links are like XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX without the leading "/"
-        else if (slug.length === 36) {
-            targetPageId = slug;
-        }
+        const uuidPattern = /^(?:\/)?([0-9a-f]{8})([0-9a-f]{4})([0-9a-f]{4})([0-9a-f]{4})([0-9a-f]{12})$/i; // NOTE(review): accepts only 32 hex digits without hyphens, so the hyphenated 36-char ids handled by the removed branch no longer match - confirm
+        const match = slug.match(uuidPattern);
+        if (match) {
+            targetPageId = match.slice(1).join('-');
+        }
+
+        // find the page id in the pages array
+        const page = pages.find(page => page.id === targetPageId);
+        // if the page exists, replace the link with the page url
+        if (page) {
+            // some links arrive with the placeholder text "link_to_page" (apparently a bug in the upstream conversion); use the real page title instead
+            if (title === "link_to_page") {
+                title = page.title;
+            }
+            mdString = mdString.replace(link, `[${title}](/posts/${page.slug})`); // NOTE(review): a string pattern replaces only the first occurrence of this exact link text per iteration
+        }
+    });
+    return mdString;
+}
diff --git a/src/helpers/sanitize.mjs b/src/helpers/sanitize.mjs
@@ -1,6 +1,23 @@
+/**
+ * Turn an arbitrary string into a lowercase, hyphen-separated URL slug.
+ *
+ * German umlauts and "ß" are transliterated ("ä" -> "ae", "ß" -> "ss") before
+ * the remaining diacritical marks are stripped; doing it the other way round
+ * would reduce "ä" to "a" first and the transliteration would never match.
+ *
+ * @param {string} str - text to convert
+ * @returns {string} slug containing only a-z, 0-9 and single hyphens
+ */
 export function sanitizeUrl(str) {
   return str
+    .normalize('NFC') // compose first so "a" + U+0308 becomes the single character "ä"
     .toLowerCase() // convert to lowercase
+    .replace(/ä/g, 'ae') // regex with /g: a string pattern would only replace the first match
+    .replace(/ö/g, 'oe')
+    .replace(/ü/g, 'ue')
+    .replace(/ß/g, 'ss')
+    .normalize('NFD') // decompose combined graphemes into base characters and diacritical marks
+    .replace(/[\u0300-\u036f]/g, "") // remove diacritical marks
     .replace(/[^a-z0-9]+/g, "-") // replace non-alphanumeric characters with hyphens
     .replace(/(^-|-$)+/g, ""); // remove leading/trailing hyphens
 }