@@ -71,6 +71,7 @@ const createLLMInferenceSchema = z.object({
   displayName: z.string().min(1, 'Display name is required'),
   llmModel: z.string().min(1, 'LLM model name is required'),
   type: z.enum(['openai', 'anthropic', 'gemini', 'azure_openai', 'ollama', 'vllm', 'groq']),
+  modelType: z.enum(['llm', 'embedding']),
   apiKey: z.string().optional(),
   baseUrl: z.string().optional(),
 });
@@ -103,6 +104,7 @@ const CreateLLMInferenceDialog: React.FC<CreateLLMInferenceDialogProps> = ({
       displayName: '',
       llmModel: '',
       type: 'openai',
+      modelType: 'llm',
       apiKey: '',
       baseUrl: getDefaultBaseUrl('openai'),
     },
@@ -131,6 +133,7 @@ const CreateLLMInferenceDialog: React.FC<CreateLLMInferenceDialogProps> = ({
       displayName: '',
       llmModel: '',
       type: defaultType,
+      modelType: 'llm',
       apiKey: '',
       baseUrl: getDefaultBaseUrl(defaultType),
     });
@@ -171,6 +174,7 @@ const CreateLLMInferenceDialog: React.FC<CreateLLMInferenceDialogProps> = ({
        llm_model: data.llmModel.trim(),
        api_key: data.apiKey?.trim() || undefined,
        type: data.type,
+       model_type: data.modelType,
        base_url: data.baseUrl?.trim() || undefined,
        parameters: Object.keys(cleanedParams).length > 0 ? cleanedParams : undefined,
      });
@@ -366,6 +370,31 @@ const CreateLLMInferenceDialog: React.FC<CreateLLMInferenceDialogProps> = ({
              </FormItem>
            )}
          />
+
+         <FormField
+           control={form.control}
+           name="modelType"
+           render={({ field }) => (
+             <FormItem>
+               <FormLabel>
+                 Model Type<span className="text-red-500">*</span>
+               </FormLabel>
+               <Select onValueChange={field.onChange} value={field.value}>
+                 <FormControl>
+                   <SelectTrigger>
+                     <SelectValue placeholder="Select model type" />
+                   </SelectTrigger>
+                 </FormControl>
+                 <SelectContent>
+                   <SelectItem value="llm">LLM</SelectItem>
+                   <SelectItem value="embedding">Embedding</SelectItem>
+                 </SelectContent>
+               </Select>
+               <FormDescription>Choose whether this is an LLM or embedding model</FormDescription>
+               <FormMessage />
+             </FormItem>
+           )}
+         />
        </div>

        <div className="grid grid-cols-2 gap-6">
@@ -48,6 +48,7 @@ const llmConfigFormSchema = z.object({
   llm_model: z.string().min(1, 'LLM model is required'),
   type: z.enum(['openai', 'anthropic', 'gemini', 'azure_openai', 'ollama', 'vllm', 'groq']),
   api_key: z.string().optional(),
+  model_type: z.enum(['llm', 'embedding']),
   base_url: z.string().optional(),
   parameters: z.record(z.any()).optional(),
 });
@@ -74,6 +75,7 @@ const LLMInferenceConfigDetail: React.FC = () => {
      display_name: '',
      llm_model: '',
      type: 'openai',
+     model_type: 'llm',
      api_key: '',
      base_url: '',
      parameters: {},
@@ -91,6 +93,7 @@ const LLMInferenceConfigDetail: React.FC = () => {
        display_name: config.display_name,
        llm_model: config.llm_model,
        type: config.type,
+       model_type: (config.model_type as 'llm' | 'embedding') || 'llm',
        api_key: '', // API key is never returned for security
        base_url: config.base_url || '',
        parameters: mergedParams,
@@ -134,6 +137,7 @@ const LLMInferenceConfigDetail: React.FC = () => {
      display_name: data.display_name.trim(),
      llm_model: data.llm_model.trim(),
      type: data.type,
+     model_type: data.model_type,
      parameters: Object.keys(cleanedParams).length > 0 ? cleanedParams : null,
    };

@@ -244,6 +248,7 @@ const LLMInferenceConfigDetail: React.FC = () => {
        display_name: config.display_name,
        llm_model: config.llm_model,
        type: config.type,
+       model_type: (config.model_type as 'llm' | 'embedding') || 'llm',
        api_key: '',
        base_url: config.base_url || '',
        parameters: mergedParams,
@@ -369,6 +374,28 @@ const LLMInferenceConfigDetail: React.FC = () => {
              )}
            />
          )}
+
+         <FormField
+           control={form.control}
+           name="model_type"
+           render={({ field }) => (
+             <FormItem>
+               <FormLabel>Model Type</FormLabel>
+               <Select onValueChange={field.onChange} value={field.value} disabled={!editing}>
+                 <FormControl>
+                   <SelectTrigger>
+                     <SelectValue placeholder="Select model type" />
+                   </SelectTrigger>
+                 </FormControl>
+                 <SelectContent>
+                   <SelectItem value="llm">LLM</SelectItem>
+                   <SelectItem value="embedding">Embedding</SelectItem>
+                 </SelectContent>
+               </Select>
+               <FormMessage />
+             </FormItem>
+           )}
+         />
        </div>
      </form>
    </Form>
3 changes: 3 additions & 0 deletions wavefront/client/src/types/llm-inference-config.ts
@@ -7,6 +7,7 @@ export interface LLMInferenceConfig {
   llm_model: string;
   display_name: string;
   type: InferenceEngineType;
+  model_type?: string;
   base_url?: string;
   parameters?: Record<string, any> | null;
   is_deleted: boolean;
@@ -19,6 +20,7 @@ export interface CreateLLMConfigRequest {
   display_name: string;
   api_key?: string;
   type: InferenceEngineType;
+  model_type?: string;
   base_url?: string;
   parameters?: Record<string, any> | null;
 }
@@ -28,6 +30,7 @@ export interface UpdateLLMConfigRequest {
   display_name?: string;
   api_key?: string | null;
   type?: InferenceEngineType;
+  model_type?: string;
   base_url?: string | null;
   parameters?: Record<string, any> | null;
 }
wavefront/server/background_jobs/rag_ingestion/rag_ingestion/models/doc_content.py
@@ -1,10 +1,11 @@
 from dataclasses import dataclass
 from typing import Union
+from rag_ingestion.processors.file_processor import DocumentType


 @dataclass
 class DocContent:
     """Model representing the extracted content from a document file"""

     content: Union[str, bytes]
-    parse_type: str
+    document_type: DocumentType
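
For orientation before the FileProcessor diff below, this is roughly how the reshaped pieces fit together; a minimal sketch assembled from this PR's changes (the input bytes and the branch bodies are placeholders):

from rag_ingestion.processors.file_processor import FileProcessor, DocumentType
from rag_ingestion.models.doc_content import DocContent

# Placeholder input: any PDF's bytes.
with open('example.pdf', 'rb') as f:
    file_bytes = f.read()

# process_file now returns the content together with its DocumentType,
# and DocContent carries that enum instead of the old parse_type string.
content, document_type = FileProcessor().process_file(file_bytes, 'application/pdf')
doc = DocContent(content=content, document_type=document_type)

if doc.document_type in (DocumentType.PDF, DocumentType.TEXT):
    ...  # chunk and text-embed (see KbStorageProcessor below)
elif doc.document_type == DocumentType.IMAGE:
    ...  # image-embed path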
wavefront/server/background_jobs/rag_ingestion/rag_ingestion/processors/file_processor.py
@@ -1,48 +1,59 @@
 import os
 import tempfile
 import textract
-from typing import Union
+from typing import Tuple
+from enum import Enum
 from common_module.log.logger import logger


-class FileProcessor:
-    def process_file(self, file_content: bytes, file_type: str) -> Union[str, bytes]:
-        mime_type = file_type
-
-        if mime_type.startswith('text/plain'):
-            return file_content.decode('utf-8')
-
-        if mime_type.startswith('image/'):
-            return file_content
+class DocumentType(Enum):
+    PDF = 'pdf'
+    IMAGE = 'image'
+    TEXT = 'text'
+
+
+class FileProcessor:
+    def process_file(
+        self, file_content: bytes, file_type: str
+    ) -> Tuple[str | bytes, DocumentType]:
+        mime_type = file_type
+        document_type = self.extract_document_type(mime_type)
+        if document_type == DocumentType.TEXT:
+            return file_content.decode('utf-8'), DocumentType.TEXT

-        if mime_type.startswith('application/'):
-            try:
-                sub_type = mime_type.split('/')[1]
-            except IndexError:
-                raise ValueError(
-                    f'Unsupported file type: Malformed MIME type "{mime_type}"'
-                )
+        if document_type == DocumentType.IMAGE:
+            return file_content, DocumentType.IMAGE

         # Set delete=False to keep the file until we manually call os.unlink
+        if document_type == DocumentType.PDF:
             with tempfile.NamedTemporaryFile(
-                mode='w+b', delete=False, suffix=f'.{sub_type}'
+                mode='w+b', delete=False, suffix='.pdf'
             ) as temp_file:
                 temp_file.write(file_content)
-                temp_file.flush()  # Ensure data is written to disk before processing
+                temp_file.flush()
                 temp_file_path = temp_file.name

             try:
-                # Process the file using its path
                 text_content = textract.process(
                     temp_file_path, method='pdfminer'
                 ).decode('utf-8')
-                return text_content
+                return text_content, DocumentType.PDF

             except Exception as e:
-                # Re-raise processing errors
                 logger.error(f'Text extraction failed for {mime_type}: {e}')
                 raise RuntimeError(f'Text extraction failed for {mime_type}: {e}')
Comment on lines 41 to 43

⚠️ Potential issue | 🟠 Major

Improve exception handling with chaining and specific types.

The exception handling has two issues:

  1. Catching blind Exception is too broad and may mask unexpected errors
  2. The re-raised RuntimeError lacks from e, which loses the original traceback

Apply this diff:

-            except Exception as e:
+            except (textract.exceptions.ShellError, UnicodeDecodeError, OSError) as e:
                 logger.error(f'Text extraction failed for {mime_type}: {e}')
-                raise RuntimeError(f'Text extraction failed for {mime_type}: {e}')
+                raise RuntimeError(f'Text extraction failed for {mime_type}: {e}') from e

Note: Verify the specific exception types raised by textract if textract.exceptions.ShellError is not available.

Committable suggestion skipped: line range outside the PR's diff.

🧰 Tools
🪛 Ruff (0.14.8)

41-41: Do not catch blind exception: Exception (BLE001)

43-43: Within an except clause, raise exceptions with raise ... from err or raise ... from None to distinguish them from errors in exception handling (B904)

43-43: Avoid specifying long messages outside the exception class (TRY003)
             finally:
                 os.unlink(temp_file_path)

+        # Explicit raise to prevent implicit None return.
+        raise RuntimeError(f'Unsupported or unknown document type: {document_type}')
+
+    def extract_document_type(self, file_type: str) -> DocumentType:
+        if file_type.startswith('text/plain'):
+            return DocumentType.TEXT
+        if file_type.startswith('image/'):
+            return DocumentType.IMAGE
+        if file_type in ('application/pdf', 'application/x-pdf'):
+            return DocumentType.PDF
         else:
-            raise ValueError(f'Unsupported file type: {mime_type}')
+            raise ValueError(f'Unsupported file type: {file_type}')
Comment on lines +51 to +59

⚠️ Potential issue | 🟠 Major

Implement case-insensitive MIME type matching.

MIME types are case-insensitive per RFC 2045, but the current implementation uses case-sensitive startswith() and in checks. This could reject valid uploads if MIME types arrive with different casing (e.g., Text/Plain, IMAGE/PNG, Application/PDF).

Apply this diff:

     def extract_document_type(self, file_type: str) -> DocumentType:
-        if file_type.startswith('text/plain'):
+        file_type_lower = file_type.lower()
+        if file_type_lower.startswith('text/plain'):
             return DocumentType.TEXT
-        if file_type.startswith('image/'):
+        if file_type_lower.startswith('image/'):
             return DocumentType.IMAGE
-        if file_type in ('application/pdf', 'application/x-pdf'):
+        if file_type_lower in ('application/pdf', 'application/x-pdf'):
             return DocumentType.PDF
         else:
             raise ValueError(f'Unsupported file type: {file_type}')

🧰 Tools
🪛 Ruff (0.14.8)

59-59: Avoid specifying long messages outside the exception class (TRY003)

🤖 Prompt for AI Agents
In wavefront/server/background_jobs/rag_ingestion/rag_ingestion/processors/file_processor.py around lines 51 to 59, the MIME type checks are case-sensitive and can fail for differently-cased values; normalize the incoming file_type to a canonical lower-case string (and defensively handle None or empty values) before performing startswith() or in checks, then use the same lower-case comparisons for 'text/plain', 'image/', and the PDF MIME variants so matching is case-insensitive.
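
Making that prompt concrete: a small normalization helper might look like the sketch below (hypothetical name, not part of this PR), lower-casing per RFC 2045 and defensively rejecting empty values; stripping a '; charset=...' parameter is an extra assumption beyond the bot's ask.

def normalize_mime(file_type: str | None) -> str:
    if not file_type:
        raise ValueError('Missing MIME type')
    # MIME types are case-insensitive and may carry parameters,
    # e.g. 'Text/Plain; charset=UTF-8' -> 'text/plain'.
    return file_type.split(';', 1)[0].strip().lower()

assert normalize_mime('Text/Plain; charset=UTF-8') == 'text/plain'
assert normalize_mime('IMAGE/PNG') == 'image/png'
assert normalize_mime('application/PDF') == 'application/pdf'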

@@ -1,14 +1,14 @@
 from flo_cloud.cloud_storage import CloudStorageManager
 from dataclasses import dataclass
-from typing import List, Union
+from typing import List
 from flo_utils.utils.log import logger
 from rag_ingestion.service.kb_rag_storage import KBRagStorage
 from rag_ingestion.embeddings.embed import EmbeddingFunc
 from rag_ingestion.models.doc_content import DocContent
 from rag_ingestion.stream.queue_message import QueueMessage
 from flo_cloud.kms import FloKmsService
 from flo_utils.streaming.message_processor import MessageProcessor, ProcessingResult
-from rag_ingestion.processors.file_processor import FileProcessor
+from rag_ingestion.processors.file_processor import FileProcessor, DocumentType
 from rag_ingestion.embeddings.image_embed import ImageEmbedding
 from rag_ingestion.models.knowledge_base_embeddings import KnowledgeBaseEmbeddingObject
 from rag_ingestion.models.rag_message import RagEventMessage
@@ -20,7 +20,7 @@ class KbStorageInsights:
     doc_id: str
     doc_content: DocContent
     kb_id: str
-    file_type: str
+    file_type: DocumentType


 class KbStorageProcessor(MessageProcessor):
@@ -36,7 +36,7 @@ def __init__(
         self.file_processor = FileProcessor()
         self.image_embedding = ImageEmbedding()

-    async def _extract_text_from_message(
+    async def _extract_content(
         self, message: QueueMessage, file_content: bytes
     ) -> DocContent:
         """
@@ -49,12 +49,10 @@ async def _extract_text_from_message(
         Returns:
             A DocContent object with extracted content and parse_type.
         """
-        if message.parse_type != 'kb_insertion':
-            return DocContent(content='', parse_type=message.parse_type)
-        content: Union[str, bytes] = self.file_processor.process_file(
+        (content, document_type) = self.file_processor.process_file(
             file_content, str(message.file_type)
         )
-        return DocContent(content=content, parse_type=message.parse_type)
+        return DocContent(content=content, document_type=document_type)

     def __insert_kb_from_message(
         self, insights: List[ProcessingResult[KbStorageInsights]]
@@ -74,27 +72,29 @@ def __insert_kb_from_message(
         for kb_insight in insights:
             kb_id = kb_insight.insights.kb_id
             doc_id = kb_insight.insights.doc_id
-            file_type = kb_insight.insights.file_type
+            document_type = kb_insight.insights.doc_content.document_type

             logger.info('Embeddings storing process is started')
-            if file_type and file_type.lower() in ('pdf', 'application/pdf'):
+            if (
+                document_type == DocumentType.PDF
+                or document_type == DocumentType.TEXT
+            ):
                 extracted_docs = [kb_insight.insights.doc_content.content]
                 docs: List[KnowledgeBaseEmbeddingObject] = (
                     self.kb_rag_storage.process_document(extracted_docs)
                 )
-            else:
+            elif document_type == DocumentType.IMAGE:
                 image_data = [kb_insight.insights.doc_content.content]
                 docs: List[KnowledgeBaseEmbeddingObject] = [
                     self.image_embedding.embed_image(image_data)
                     for image_data in image_data
                 ]

             embeddings.append(
                 EmbeddingsToStore(
                     kb_embeddings=docs,
                     doc_id=doc_id,
                     kb_id=kb_id,
-                    file_type=file_type,
+                    file_type=document_type,
                 )
             )

@@ -126,14 +126,14 @@ async def process(
                 if self.encryption_service
                 else file_content_encrypt
             )
-            doc_content = await self._extract_text_from_message(message, file_content)
+            doc_content = await self._extract_content(message, file_content)
             return ProcessingResult[KbStorageInsights](
                 success=True,
                 insights=KbStorageInsights(
                     doc_id=message.doc_id,
                     doc_content=doc_content,
                     kb_id=message.kb_id,
-                    file_type=message.file_type,
+                    file_type=doc_content.document_type,
                 ),
             )

wavefront/server/background_jobs/rag_ingestion/rag_ingestion/service/kb_rag_storage.py
@@ -1,29 +1,30 @@
-from typing import Any, List, Dict, Tuple, Optional
 import logging
-import numpy as np
 import tiktoken
-from datetime import datetime
 import uuid
-from rag_ingestion.embeddings.embed import EmbeddingFunc
 import httpx
 import time
-import ast
+import numpy as np
+from flo_utils.utils.log import logger
+from datetime import datetime
 from dataclasses import dataclass
 from rag_ingestion.env import FLOWARE_SERVICE_URL, APP_ENV, PASSTHROUGH_SECRET
 from rag_ingestion.constants.auth import RootfloHeaders
 from rag_ingestion.models.knowledge_base_embeddings import (
     KnowledgeBaseEmbeddingObject,
     RetrieveParams,
 )
-from flo_utils.utils.log import logger
+import ast
+from typing import Any, List, Dict, Tuple, Optional
+from rag_ingestion.embeddings.embed import EmbeddingFunc
+from rag_ingestion.processors.file_processor import DocumentType


 @dataclass
 class EmbeddingsToStore:
     kb_embeddings: List[KnowledgeBaseEmbeddingObject]
     doc_id: str
     kb_id: str
-    file_type: str
+    file_type: DocumentType


 class KBRagStorage: