From 1d45e958647e4ff0ad018ffadac701d058d12d10 Mon Sep 17 00:00:00 2001 From: 0xbbjoker <0xbbjoker@proton.me> Date: Fri, 7 Nov 2025 16:54:03 +0100 Subject: [PATCH] feat(plugin-knowledge): migrate from pdfjs-dist to unpdf for universal PDF parsing --- package.json | 6 ++-- src/utils.ts | 83 ++++++++++++++++++---------------------------------- 2 files changed, 32 insertions(+), 57 deletions(-) diff --git a/package.json b/package.json index d9ca2ae..f81f9a2 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "@elizaos/plugin-knowledge", "description": "Plugin for Knowledge", - "version": "1.5.13", + "version": "1.5.14", "type": "module", "main": "dist/index.js", "module": "dist/index.js", @@ -30,7 +30,7 @@ "@ai-sdk/anthropic": "^2.0.17", "@ai-sdk/google": "^2.0.14", "@ai-sdk/openai": "^2.0.32", - "@elizaos/core": "^1.5.10", + "@elizaos/core": "^1.6.4", "@openrouter/ai-sdk-provider": "^1.2.0", "@tanstack/react-query": "^5.51.1", "ai": "^5.0.48", @@ -39,11 +39,11 @@ "lucide-react": "^0.525.0", "mammoth": "^1.9.0", "multer": "^2.0.1", - "pdfjs-dist": "^5.2.133", "react": "^19.1.0", "react-dom": "^19.1.0", "react-force-graph-2d": "^1.27.1", "tailwind-merge": "^3.3.1", + "unpdf": "^1.4.0", "zod": "3.25.76" }, "devDependencies": { diff --git a/src/utils.ts b/src/utils.ts index 8b9e0a3..1c490f9 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -1,8 +1,7 @@ import { Buffer } from 'node:buffer'; import * as mammoth from 'mammoth'; import { logger } from '@elizaos/core'; -import { getDocument, PDFDocumentProxy } from 'pdfjs-dist/legacy/build/pdf.mjs'; -import type { TextItem, TextMarkedContent } from 'pdfjs-dist/types/src/display/api'; +import { extractText } from 'unpdf'; import { createHash } from 'crypto'; import { v5 as uuidv5 } from 'uuid'; @@ -112,11 +111,8 @@ export async function extractTextFromFileBuffer( } /** - * Converts a PDF file buffer to text content. - * Requires pdfjs-dist to be properly configured, especially its worker. 
- */ -/** - * Converts a PDF Buffer to text with enhanced formatting preservation. + * Converts a PDF Buffer to text using unpdf (universal PDF parser). + * Works in Node.js, Bun, Browser, Edge, and Serverless environments. * * @param {Buffer} pdfBuffer - The PDF Buffer to convert to text * @param {string} [filename] - Optional filename for logging purposes @@ -127,47 +123,36 @@ export async function convertPdfToTextFromBuffer( filename?: string ): Promise { const docName = filename || 'unnamed-document'; - logger.debug(`[PdfService] Starting conversion for ${docName}`); + logger.debug(`[PdfService] Starting conversion for ${docName} using unpdf`); try { - const uint8Array = new Uint8Array(pdfBuffer); - const pdf: PDFDocumentProxy = await getDocument({ data: uint8Array }).promise; - const numPages = pdf.numPages; - const textPages: string[] = []; - - for (let pageNum = 1; pageNum <= numPages; pageNum++) { - logger.debug(`[PdfService] Processing page ${pageNum}/${numPages}`); - const page = await pdf.getPage(pageNum); - const textContent = await page.getTextContent(); - - // Group text items by their y-position to maintain line structure - const lineMap = new Map(); - - textContent.items.filter(isTextItem).forEach((item) => { - // Round y-position to account for small variations in the same line - const yPos = Math.round(item.transform[5]); - if (!lineMap.has(yPos)) { - lineMap.set(yPos, []); - } - lineMap.get(yPos)!.push(item); - }); - - // Sort lines by y-position (top to bottom) and items within lines by x-position (left to right) - const sortedLines = Array.from(lineMap.entries()) - .sort((a, b) => b[0] - a[0]) // Reverse sort for top-to-bottom - .map(([_, items]) => - items - .sort((a, b) => a.transform[4] - b.transform[4]) - .map((item) => item.str) - .join(' ') - ); - - textPages.push(sortedLines.join('\n')); + // unpdf requires Uint8Array - convert Buffer properly + // Buffer.from() returns a Buffer, but we need a pure Uint8Array + const uint8Array = new 
Uint8Array( + pdfBuffer.buffer.slice(pdfBuffer.byteOffset, pdfBuffer.byteOffset + pdfBuffer.byteLength) + ); + + const result = await extractText(uint8Array, { + mergePages: true, // Merge all pages into a single string + }); + + if (!result.text || result.text.trim().length === 0) { + logger.warn(`[PdfService] No text extracted from ${docName}`); + return ''; } - const fullText = textPages.join('\n\n').replace(/\s+/g, ' ').trim(); - logger.debug(`[PdfService] Conversion complete for ${docName}, length: ${fullText.length}`); - return fullText; + // Trim every line and drop blank lines (paragraph breaks are not preserved) + const cleanedText = result.text + .split('\n') + .map((line: string) => line.trim()) + .filter((line: string) => line.length > 0) + .join('\n') + .replace(/\n{3,}/g, '\n\n'); // Caps newline runs at 2; a no-op here since blank lines are already removed + + logger.debug( + `[PdfService] Conversion complete for ${docName}, ${result.totalPages} pages, length: ${cleanedText.length}` + ); + return cleanedText; } catch (error: any) { logger.error(`[PdfService] Error converting PDF ${docName}:`, error.message); throw new Error(`Failed to convert PDF to text: ${error.message}`); @@ -341,16 +326,6 @@ export function isBinaryContentType(contentType: string, filename: string): bool return binaryExtensions.includes(fileExt); } -/** - * Check if the input is a TextItem. - * - * @param item - The input item to check. - * @returns A boolean indicating if the input is a TextItem. - */ -function isTextItem(item: TextItem | TextMarkedContent): item is TextItem { - return 'str' in item; -} - /** * Normalizes an S3 URL by removing query parameters (signature, etc.) * This allows for consistent URL comparison regardless of presigned URL parameters