-
Notifications
You must be signed in to change notification settings - Fork 30
feat(rag): embedding model support & rag improvements #192
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
980af79
cba19c2
ae7b1de
6ce989a
9fab565
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,10 +1,11 @@ | ||
| from dataclasses import dataclass | ||
| from typing import Union | ||
| from rag_ingestion.processors.file_processor import DocumentType | ||
|
|
||
|
|
||
| @dataclass | ||
| class DocContent: | ||
| """Model representing the extracted content from a document file""" | ||
|
|
||
| content: Union[str, bytes] | ||
| parse_type: str | ||
| document_type: DocumentType |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,48 +1,59 @@ | ||
| import os | ||
| import tempfile | ||
| import textract | ||
| from typing import Union | ||
| from typing import Tuple | ||
| from enum import Enum | ||
| from common_module.log.logger import logger | ||
|
|
||
|
|
||
| class FileProcessor: | ||
| def process_file(self, file_content: bytes, file_type: str) -> Union[str, bytes]: | ||
| mime_type = file_type | ||
| class DocumentType(Enum): | ||
| PDF = 'pdf' | ||
| IMAGE = 'image' | ||
| TEXT = 'text' | ||
|
|
||
| if mime_type.startswith('text/plain'): | ||
| return file_content.decode('utf-8') | ||
|
|
||
| if mime_type.startswith('image/'): | ||
| return file_content | ||
| class FileProcessor: | ||
| def process_file( | ||
| self, file_content: bytes, file_type: str | ||
| ) -> Tuple[str | bytes, DocumentType]: | ||
| mime_type = file_type | ||
| document_type = self.extract_document_type(mime_type) | ||
| if document_type == DocumentType.TEXT: | ||
| return file_content.decode('utf-8'), DocumentType.TEXT | ||
|
|
||
| if mime_type.startswith('application/'): | ||
| try: | ||
| sub_type = mime_type.split('/')[1] | ||
| except IndexError: | ||
| raise ValueError( | ||
| f'Unsupported file type: Malformed MIME type "{mime_type}"' | ||
| ) | ||
| if document_type == DocumentType.IMAGE: | ||
| return file_content, DocumentType.IMAGE | ||
|
|
||
| # Set delete=False to keep the file until we manually call os.unlink | ||
| if document_type == DocumentType.PDF: | ||
| with tempfile.NamedTemporaryFile( | ||
| mode='w+b', delete=False, suffix=f'.{sub_type}' | ||
| mode='w+b', delete=False, suffix='.pdf' | ||
| ) as temp_file: | ||
| temp_file.write(file_content) | ||
| temp_file.flush() # Ensure data is written to disk before processing | ||
| temp_file.flush() | ||
| temp_file_path = temp_file.name | ||
|
|
||
| try: | ||
| # Process the file using its path | ||
| text_content = textract.process( | ||
| temp_file_path, method='pdfminer' | ||
| ).decode('utf-8') | ||
| return text_content | ||
| return text_content, DocumentType.PDF | ||
|
|
||
| except Exception as e: | ||
| # Re-raise processing errors | ||
| logger.error(f'Text extraction failed for {mime_type}: {e}') | ||
| raise RuntimeError(f'Text extraction failed for {mime_type}: {e}') | ||
|
Comment on lines
41
to
43
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Improve exception handling with chaining and specific types. The exception handling has two issues:
Apply this diff: - except Exception as e:
+ except (textract.exceptions.ShellError, UnicodeDecodeError, OSError) as e:
logger.error(f'Text extraction failed for {mime_type}: {e}')
- raise RuntimeError(f'Text extraction failed for {mime_type}: {e}')
+ raise RuntimeError(f'Text extraction failed for {mime_type}: {e}') from eNote: Verify the specific exception types raised by textract if
🧰 Tools🪛 Ruff (0.14.8)41-41: Do not catch blind exception: (BLE001) 43-43: Within an (B904) 43-43: Avoid specifying long messages outside the exception class (TRY003) |
||
|
|
||
| finally: | ||
| os.unlink(temp_file_path) | ||
|
|
||
| # Explicit raise to prevent implicit None return. | ||
| raise RuntimeError(f'Unsupported or unknown document type: {document_type}') | ||
|
|
||
| def extract_document_type(self, file_type: str) -> DocumentType: | ||
| if file_type.startswith('text/plain'): | ||
| return DocumentType.TEXT | ||
| if file_type.startswith('image/'): | ||
| return DocumentType.IMAGE | ||
| if file_type in ('application/pdf', 'application/x-pdf'): | ||
| return DocumentType.PDF | ||
| else: | ||
| raise ValueError(f'Unsupported file type: {mime_type}') | ||
| raise ValueError(f'Unsupported file type: {file_type}') | ||
|
Comment on lines
+51
to
+59
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Implement case-insensitive MIME type matching. MIME types are case-insensitive per RFC 2045, but the current implementation uses case-sensitive Apply this diff: def extract_document_type(self, file_type: str) -> DocumentType:
- if file_type.startswith('text/plain'):
+ file_type_lower = file_type.lower()
+ if file_type_lower.startswith('text/plain'):
return DocumentType.TEXT
- if file_type.startswith('image/'):
+ if file_type_lower.startswith('image/'):
return DocumentType.IMAGE
- if file_type in ('application/pdf', 'application/x-pdf'):
+ if file_type_lower in ('application/pdf', 'application/x-pdf'):
return DocumentType.PDF
else:
raise ValueError(f'Unsupported file type: {file_type}')🧰 Tools🪛 Ruff (0.14.8)59-59: Avoid specifying long messages outside the exception class (TRY003) 🤖 Prompt for AI Agents |
||
Uh oh!
There was an error while loading. Please reload this page.