From 1d45e958647e4ff0ad018ffadac701d058d12d10 Mon Sep 17 00:00:00 2001
From: 0xbbjoker <0xbbjoker@proton.me>
Date: Fri, 7 Nov 2025 16:54:03 +0100
Subject: [PATCH] feat(plugin-knowledge): migrate from pdfjs-dist to unpdf for
 universal PDF parsing

---
 package.json |  6 ++--
 src/utils.ts | 83 ++++++++++++++++++----------------------------------
 2 files changed, 32 insertions(+), 57 deletions(-)
diff --git a/package.json b/package.json
index d9ca2ae..f81f9a2 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "@elizaos/plugin-knowledge",
   "description": "Plugin for Knowledge",
-  "version": "1.5.13",
+  "version": "1.5.14",
   "type": "module",
   "main": "dist/index.js",
   "module": "dist/index.js",
@@ -30,7 +30,7 @@
     "@ai-sdk/anthropic": "^2.0.17",
     "@ai-sdk/google": "^2.0.14",
     "@ai-sdk/openai": "^2.0.32",
-    "@elizaos/core": "^1.5.10",
+    "@elizaos/core": "^1.6.4",
     "@openrouter/ai-sdk-provider": "^1.2.0",
     "@tanstack/react-query": "^5.51.1",
     "ai": "^5.0.48",
@@ -39,11 +39,11 @@
     "lucide-react": "^0.525.0",
     "mammoth": "^1.9.0",
     "multer": "^2.0.1",
-    "pdfjs-dist": "^5.2.133",
     "react": "^19.1.0",
     "react-dom": "^19.1.0",
     "react-force-graph-2d": "^1.27.1",
     "tailwind-merge": "^3.3.1",
+    "unpdf": "^1.4.0",
     "zod": "3.25.76"
   },
   "devDependencies": {
diff --git a/src/utils.ts b/src/utils.ts
index 8b9e0a3..1c490f9 100644
--- a/src/utils.ts
+++ b/src/utils.ts
@@ -1,8 +1,7 @@
 import { Buffer } from 'node:buffer';
 import * as mammoth from 'mammoth';
 import { logger } from '@elizaos/core';
-import { getDocument, PDFDocumentProxy } from 'pdfjs-dist/legacy/build/pdf.mjs';
-import type { TextItem, TextMarkedContent } from 'pdfjs-dist/types/src/display/api';
+import { extractText } from 'unpdf';
 import { createHash } from 'crypto';
 import { v5 as uuidv5 } from 'uuid';
 
@@ -112,11 +111,8 @@ export async function extractTextFromFileBuffer(
 }
 
 /**
- * Converts a PDF file buffer to text content.
- * Requires pdfjs-dist to be properly configured, especially its worker.
- */
-/**
- * Converts a PDF Buffer to text with enhanced formatting preservation.
+ * Converts a PDF Buffer to text using unpdf (universal PDF parser).
+ * Works in Node.js, Bun, Browser, Edge, and Serverless environments.
  *
  * @param {Buffer} pdfBuffer - The PDF Buffer to convert to text
  * @param {string} [filename] - Optional filename for logging purposes
@@ -127,47 +123,36 @@ export async function convertPdfToTextFromBuffer(
   filename?: string
 ): Promise<string> {
   const docName = filename || 'unnamed-document';
-  logger.debug(`[PdfService] Starting conversion for ${docName}`);
+  logger.debug(`[PdfService] Starting conversion for ${docName} using unpdf`);
 
   try {
-    const uint8Array = new Uint8Array(pdfBuffer);
-    const pdf: PDFDocumentProxy = await getDocument({ data: uint8Array }).promise;
-    const numPages = pdf.numPages;
-    const textPages: string[] = [];
-
-    for (let pageNum = 1; pageNum <= numPages; pageNum++) {
-      logger.debug(`[PdfService] Processing page ${pageNum}/${numPages}`);
-      const page = await pdf.getPage(pageNum);
-      const textContent = await page.getTextContent();
-
-      // Group text items by their y-position to maintain line structure
-      const lineMap = new Map<number, TextItem[]>();
-
-      textContent.items.filter(isTextItem).forEach((item) => {
-        // Round y-position to account for small variations in the same line
-        const yPos = Math.round(item.transform[5]);
-        if (!lineMap.has(yPos)) {
-          lineMap.set(yPos, []);
-        }
-        lineMap.get(yPos)!.push(item);
-      });
-
-      // Sort lines by y-position (top to bottom) and items within lines by x-position (left to right)
-      const sortedLines = Array.from(lineMap.entries())
-        .sort((a, b) => b[0] - a[0]) // Reverse sort for top-to-bottom
-        .map(([_, items]) =>
-          items
-            .sort((a, b) => a.transform[4] - b.transform[4])
-            .map((item) => item.str)
-            .join(' ')
-        );
-
-      textPages.push(sortedLines.join('\n'));
+    // unpdf requires a plain Uint8Array rather than a Node.js Buffer.
+    // Copy only this Buffer's byte range; the backing ArrayBuffer may be larger.
+    const uint8Array = new Uint8Array(
+      pdfBuffer.buffer.slice(pdfBuffer.byteOffset, pdfBuffer.byteOffset + pdfBuffer.byteLength)
+    );
+
+    const result = await extractText(uint8Array, {
+      mergePages: true, // Merge all pages into a single string
+    });
+
+    if (!result.text || result.text.trim().length === 0) {
+      logger.warn(`[PdfService] No text extracted from ${docName}`);
+      return '';
     }
 
-    const fullText = textPages.join('\n\n').replace(/\s+/g, ' ').trim();
-    logger.debug(`[PdfService] Conversion complete for ${docName}, length: ${fullText.length}`);
-    return fullText;
+    // Trim every line and drop empty ones (blank-line paragraph breaks are removed, not preserved)
+    const cleanedText = result.text
+      .split('\n')
+      .map((line: string) => line.trim())
+      .filter((line: string) => line.length > 0)
+      .join('\n')
+      .replace(/\n{3,}/g, '\n\n'); // Max 2 consecutive newlines
+
+    logger.debug(
+      `[PdfService] Conversion complete for ${docName}, ${result.totalPages} pages, length: ${cleanedText.length}`
+    );
+    return cleanedText;
   } catch (error: any) {
     logger.error(`[PdfService] Error converting PDF ${docName}:`, error.message);
     throw new Error(`Failed to convert PDF to text: ${error.message}`);
@@ -341,16 +326,6 @@ export function isBinaryContentType(contentType: string, filename: string): bool
   return binaryExtensions.includes(fileExt);
 }
 
-/**
- * Check if the input is a TextItem.
- *
- * @param item - The input item to check.
- * @returns A boolean indicating if the input is a TextItem.
- */
-function isTextItem(item: TextItem | TextMarkedContent): item is TextItem {
-  return 'str' in item;
-}
-
 /**
  * Normalizes an S3 URL by removing query parameters (signature, etc.)
  * This allows for consistent URL comparison regardless of presigned URL parameters