Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "@elizaos/plugin-knowledge",
"description": "Plugin for Knowledge",
"version": "1.5.13",
"version": "1.5.14",
"type": "module",
"main": "dist/index.js",
"module": "dist/index.js",
Expand Down Expand Up @@ -30,7 +30,7 @@
"@ai-sdk/anthropic": "^2.0.17",
"@ai-sdk/google": "^2.0.14",
"@ai-sdk/openai": "^2.0.32",
"@elizaos/core": "^1.5.10",
"@elizaos/core": "^1.6.4",
"@openrouter/ai-sdk-provider": "^1.2.0",
"@tanstack/react-query": "^5.51.1",
"ai": "^5.0.48",
Expand All @@ -39,11 +39,11 @@
"lucide-react": "^0.525.0",
"mammoth": "^1.9.0",
"multer": "^2.0.1",
"pdfjs-dist": "^5.2.133",
"react": "^19.1.0",
"react-dom": "^19.1.0",
"react-force-graph-2d": "^1.27.1",
"tailwind-merge": "^3.3.1",
"unpdf": "^1.4.0",
"zod": "3.25.76"
},
"devDependencies": {
Expand Down
83 changes: 29 additions & 54 deletions src/utils.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import { Buffer } from 'node:buffer';
import * as mammoth from 'mammoth';
import { logger } from '@elizaos/core';
import { getDocument, PDFDocumentProxy } from 'pdfjs-dist/legacy/build/pdf.mjs';
import type { TextItem, TextMarkedContent } from 'pdfjs-dist/types/src/display/api';
import { extractText } from 'unpdf';
import { createHash } from 'crypto';
import { v5 as uuidv5 } from 'uuid';

Expand Down Expand Up @@ -112,11 +111,8 @@ export async function extractTextFromFileBuffer(
}

/**
* Converts a PDF file buffer to text content.
* Requires pdfjs-dist to be properly configured, especially its worker.
*/
/**
* Converts a PDF Buffer to text with enhanced formatting preservation.
* Converts a PDF Buffer to text using unpdf (universal PDF parser).
* Works in Node.js, Bun, Browser, Edge, and Serverless environments.
*
* @param {Buffer} pdfBuffer - The PDF Buffer to convert to text
* @param {string} [filename] - Optional filename for logging purposes
Expand All @@ -127,47 +123,36 @@ export async function convertPdfToTextFromBuffer(
filename?: string
): Promise<string> {
const docName = filename || 'unnamed-document';
logger.debug(`[PdfService] Starting conversion for ${docName}`);
logger.debug(`[PdfService] Starting conversion for ${docName} using unpdf`);

try {
const uint8Array = new Uint8Array(pdfBuffer);
const pdf: PDFDocumentProxy = await getDocument({ data: uint8Array }).promise;
const numPages = pdf.numPages;
const textPages: string[] = [];

for (let pageNum = 1; pageNum <= numPages; pageNum++) {
logger.debug(`[PdfService] Processing page ${pageNum}/${numPages}`);
const page = await pdf.getPage(pageNum);
const textContent = await page.getTextContent();

// Group text items by their y-position to maintain line structure
const lineMap = new Map<number, TextItem[]>();

textContent.items.filter(isTextItem).forEach((item) => {
// Round y-position to account for small variations in the same line
const yPos = Math.round(item.transform[5]);
if (!lineMap.has(yPos)) {
lineMap.set(yPos, []);
}
lineMap.get(yPos)!.push(item);
});

// Sort lines by y-position (top to bottom) and items within lines by x-position (left to right)
const sortedLines = Array.from(lineMap.entries())
.sort((a, b) => b[0] - a[0]) // Reverse sort for top-to-bottom
.map(([_, items]) =>
items
.sort((a, b) => a.transform[4] - b.transform[4])
.map((item) => item.str)
.join(' ')
);

textPages.push(sortedLines.join('\n'));
// unpdf requires Uint8Array - convert Buffer properly
// Buffer.from() returns a Buffer, but we need a pure Uint8Array
const uint8Array = new Uint8Array(
pdfBuffer.buffer.slice(pdfBuffer.byteOffset, pdfBuffer.byteOffset + pdfBuffer.byteLength)
);

const result = await extractText(uint8Array, {
mergePages: true, // Merge all pages into a single string
});

if (!result.text || result.text.trim().length === 0) {
logger.warn(`[PdfService] No text extracted from ${docName}`);
return '';
}

const fullText = textPages.join('\n\n').replace(/\s+/g, ' ').trim();
logger.debug(`[PdfService] Conversion complete for ${docName}, length: ${fullText.length}`);
return fullText;
// Clean up excessive whitespace while preserving paragraph structure
const cleanedText = result.text
.split('\n')
.map((line: string) => line.trim())
.filter((line: string) => line.length > 0)
.join('\n')
.replace(/\n{3,}/g, '\n\n'); // Max 2 consecutive newlines
Comment on lines +145 to +150
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Fix text cleaning logic - it removes all paragraph breaks.

The current implementation has a logic error that defeats its stated purpose. The .filter((line: string) => line.length > 0) step removes ALL empty lines, making the subsequent regex /\n{3,}/g ineffective (it can never match after filtering). This results in losing all paragraph structure in the extracted text.

Apply this diff to preserve paragraph structure while still limiting excessive blank lines:

     // Clean up excessive whitespace while preserving paragraph structure
     const cleanedText = result.text
       .split('\n')
       .map((line: string) => line.trim())
-      .filter((line: string) => line.length > 0)
       .join('\n')
       .replace(/\n{3,}/g, '\n\n'); // Max 2 consecutive newlines

This will:

  • Trim leading/trailing whitespace from each line
  • Preserve empty lines (paragraph breaks)
  • Limit to a maximum of 1 blank line between paragraphs
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
- const cleanedText = result.text
- .split('\n')
- .map((line: string) => line.trim())
- .filter((line: string) => line.length > 0)
- .join('\n')
- .replace(/\n{3,}/g, '\n\n'); // Max 2 consecutive newlines
+ const cleanedText = result.text
+ .split('\n')
+ .map((line: string) => line.trim())
+ .join('\n')
+ .replace(/\n{3,}/g, '\n\n'); // Max 2 consecutive newlines
🤖 Prompt for AI Agents
In src/utils.ts around lines 145 to 150, the current cleaning trims each line
and then filters out empty lines, which removes all paragraph breaks. Instead,
remove the .filter(...) call so that empty lines are preserved as paragraph
separators: trim each line, then collapse runs of 3+ consecutive newlines to
exactly 2 (so at most one empty line remains between paragraphs), and finally
trim leading/trailing whitespace from the whole string. Note that this final
whole-string trim is not part of the suggested diff above and must be added
separately.


logger.debug(
`[PdfService] Conversion complete for ${docName}, ${result.totalPages} pages, length: ${cleanedText.length}`
);
return cleanedText;
} catch (error: any) {
logger.error(`[PdfService] Error converting PDF ${docName}:`, error.message);
throw new Error(`Failed to convert PDF to text: ${error.message}`);
Expand Down Expand Up @@ -341,16 +326,6 @@ export function isBinaryContentType(contentType: string, filename: string): bool
return binaryExtensions.includes(fileExt);
}

/**
 * Type guard that narrows a PDF text-content entry to a TextItem.
 *
 * An entry is treated as a TextItem when it exposes a `str` property
 * (checked with the `in` operator, so inherited properties count as well).
 *
 * @param item - Entry to inspect; either a TextItem or a TextMarkedContent.
 * @returns `true` when `item` has a `str` property, narrowing it to TextItem.
 */
function isTextItem(item: TextItem | TextMarkedContent): item is TextItem {
  const hasStringContent = 'str' in item;
  return hasStringContent;
}

/**
* Normalizes an S3 URL by removing query parameters (signature, etc.)
* This allows for consistent URL comparison regardless of presigned URL parameters
Expand Down