Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ OLLAMA_PORT=11434
OLLAMA_MODEL=mistral
CHUNK_SIZE=3000
TOKEN_ENCODING=cl100k_base
PROMPT_FILE=prompts/default.txt
PROMPT_FILE=prompts/mistral.txt
HEALTH_CHECK_TIMEOUT=5.0
OLLAMA_TIMEOUT=30.0
OLLAMA_MEMORY=12G
OLLAMA_MEMORY=12G
DEFAULT_CHUNK_TIME=40.0
2 changes: 1 addition & 1 deletion .github/workflows/build-backend.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
- name: Install Dependencies
run: |
cd backend
uv venv
uv venv .venv
source .venv/bin/activate
uv pip install -r requirements.txt

Expand Down
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
Analyze, summarize, and explain information in PDF textbooks and whitepapers using OCR and AI analysis. Upload PDFs to get markdown summaries and analysis with real-time progress tracking.

## Features
- PDF text extraction with OCR
- PDF text extraction with PyMuPDF
- Real-time tracking of analysis progress
- Chunked file upload
- Chunked file upload to Ollama
- Automatic error recovery
- Memory-efficient processing
- Markdown formatted output
Expand Down Expand Up @@ -57,8 +57,9 @@ bun run dev

# Backend (default: http://localhost:8000)
cd backend
python -m venv venv
source venv/bin/activate
pip install uv
uv venv .venv
source .venv/bin/activate
pip install -r requirements.txt
uvicorn main:app --reload
```
Expand Down
4 changes: 3 additions & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
FROM python:3.12-slim
WORKDIR /app

ARG VERSION

LABEL version=$VERSION

# Update pip
Expand All @@ -17,4 +19,4 @@ COPY requirements.txt .
RUN uv pip install --system -r requirements.txt

COPY . .
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "$BACKEND_PORT", "--lifespan=on", "--log-level=info"]
CMD uvicorn main:app --host 0.0.0.0 --port $BACKEND_PORT --lifespan=on --log-level=info
1 change: 1 addition & 0 deletions backend/app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class Settings(BaseSettings):
PROMPT_FILE: str = os.environ.get("PROMPT_FILE", "prompts/default.txt")
HEALTH_CHECK_TIMEOUT: float = float(os.environ.get("HEALTH_CHECK_TIMEOUT", "5.0"))
OLLAMA_TIMEOUT: float = float(os.environ.get("OLLAMA_TIMEOUT", "30.0"))
DEFAULT_CHUNK_TIME: float = float(os.environ.get("DEFAULT_CHUNK_TIME", "30.0"))

class Config:
env_file = ".env"
Expand Down
185 changes: 159 additions & 26 deletions backend/app/services/ai.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from tenacity import retry, stop_after_attempt, wait_exponential
import ollama
import json
from fastapi import WebSocket
from typing import Optional
from typing import Optional, Dict, Any

from app.core.logging import logger
from app.core.config import settings
Expand All @@ -12,10 +13,11 @@ async def load_prompt(prompt_path: str = None) -> str:
"""
if prompt_path is None:
prompt_path = settings.PROMPT_FILE

try:
with open(prompt_path, 'r') as file:
return file.read().strip()
content = file.read().strip()
logger.info(f"Loaded prompt from {prompt_path} (length: {len(content)} chars)")
return content
except FileNotFoundError:
logger.warning(f"Prompt file {prompt_path} not found, using default prompt")
return """You are processing part of a document. Your task is to create a concise summary of this section while maintaining context with previous sections.
Expand All @@ -27,47 +29,178 @@ async def load_prompt(prompt_path: str = None) -> str:
4. Use clear paragraph breaks for different topics
5. Avoid repeating information that was covered in previous sections"""

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def process_chunk(chunk: str, websocket: WebSocket, chunk_index: int, total_chunks: int, previous_summary: Optional[str] = None) -> str:
def extract_tag_content(text: str, tag: str) -> str:
    """Return the stripped text between the first ``<tag>`` and its ``</tag>``.

    The tag pair is located anywhere in ``text``, so a pair wrapped in an
    outer element (for example ``<result>...</result>``) is found without
    any special handling.

    Args:
        text: Raw text to search, typically a model response.
        tag: Tag name without angle brackets, e.g. ``"summary"``.

    Returns:
        The content between the first opening tag and the first closing tag
        that follows it, with surrounding whitespace removed. Returns an
        empty string when the opening tag is absent or is never closed.
    """
    start_tag = f"<{tag}>"
    end_tag = f"</{tag}>"

    open_at = text.find(start_tag)
    if open_at == -1:
        return ""
    content_start = open_at + len(start_tag)

    # Look for the closing tag only after the opening tag. Searching from the
    # beginning of the text would match a stray closing tag that precedes the
    # opening one and yield an empty slice even though a valid pair exists.
    content_end = text.find(end_tag, content_start)
    if content_end == -1:
        return ""

    return text[content_start:content_end].strip()

def _is_atx_heading(line: str) -> bool:
    """Return True if ``line`` is an ATX heading per the CommonMark rules.

    An ATX heading is at most three spaces of indentation, then one to six
    ``#`` characters, then either end of line or a space/tab.
    """
    line = line.rstrip('\r')  # tolerate CRLF input split on '\n'
    unindented = line.lstrip(' ')
    if len(line) - len(unindented) > 3:
        return False  # four or more spaces makes an indented code block

    marker_len = len(unindented) - len(unindented.lstrip('#'))
    if not 1 <= marker_len <= 6:
        return False

    rest = unindented[marker_len:]
    # "#hashtag" or "#!/bin/sh" is plain text, not a heading.
    return not rest or rest[0] in ' \t'


def validate_markdown_structure(summary: str) -> str:
    """Ensure the summary contains at least one markdown heading.

    Args:
        summary: Markdown text to check.

    Returns:
        ``summary`` unchanged when any of its lines is an ATX heading;
        otherwise ``summary`` prefixed with a ``# Summary`` heading and a
        blank line.
    """
    if any(_is_atx_heading(line) for line in summary.split('\n')):
        return summary
    return f"# Summary\n\n{summary}"

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def process_chunk(
chunk: str,
websocket: WebSocket,
chunk_index: int,
total_chunks: int,
previous_context: Optional[Dict[str, Any]] = None
) -> Dict[str, str]:
"""
Process a document chunk and return both context and summary.

Args:
chunk: The text chunk to process
websocket: WebSocket connection for status updates
chunk_index: Current chunk index
total_chunks: Total number of chunks
previous_context: Context from previous chunk processing

# Pass more focused context about previous content
if previous_summary:
# Extract key points from previous summary to maintain flow
key_points = previous_summary.split('\n')[-5:] # Take last few key points
points_text = '\n'.join(key_points)
context = f"{context}\n\nKey points from previous section:\n{points_text}\n\nContinue the document flow, focusing on new information while maintaining narrative coherence."
Returns:
Dict containing 'context' and 'summary' keys
"""
try:
prompt_template = await load_prompt()

full_prompt = f"{prompt_template}\n\n{context}\n\nText to analyze:\n{chunk}"
# Initialize or maintain context
if previous_context is None:
previous_context = {
"metadata": {
"partial_titles": [],
"current_depth": 1,
"pending_sections": []
},
"content": {
"active_concepts": [],
"knowledge_chain": []
}
}

logger.info(f"Processing chunk {chunk_index + 1}/{total_chunks} (length: {len(chunk)} chars):\n{chunk}")
logger.info("Chunk boundary marker ----")
# Prepare input section
inputs = f"""<Inputs>
{chunk}: str # Current text section for analysis
{chunk_index == 0}: bool # Start of new document flag
{json.dumps(previous_context, indent=2)}: dict # Previous context
</Inputs>"""

full_prompt = f"{prompt_template}\n\n{inputs}"

logger.info(f"Processing chunk {chunk_index + 1}/{total_chunks} (length: {len(chunk)} chars)")

try:
response = ollama.chat(
model=settings.OLLAMA_MODEL,
messages=[{
'role': 'user',
'content': full_prompt
}],
messages=[
{
'role': 'system',
'content': '''Your response must be in this exact format:
<context>
{
"metadata": {
"partial_titles": [],
"current_depth": 1,
"pending_sections": []
},
"content": {
"active_concepts": [],
"knowledge_chain": []
}
}
</context>

<summary>
# [Section Title]

## Overview
[One paragraph overview]

## New Concepts
### [Concept Name]
- Definition: [Clear definition]
- Example: [Concrete example]
- Prerequisites: [Required concepts]

## Technical Implementation
[Implementation details with examples]

## Related Concepts
[List of related concepts with brief connections]
</summary>'''
},
{
'role': 'user',
'content': full_prompt
}
],
stream=False
)
result = response['message']['content']
logger.info(f"AI Output for chunk {chunk_index + 1}/{total_chunks} (length: {len(result)} chars):\n{result}")
return result

# Add detailed logging of AI response
logger.debug(f"Raw AI response for chunk {chunk_index + 1}:\n{result}")

# Extract both context and summary
context = extract_tag_content(result, "context")
summary = extract_tag_content(result, "summary")

# Log extracted content
logger.debug(f"Extracted context for chunk {chunk_index + 1}:\n{context}")
logger.debug(f"Extracted summary for chunk {chunk_index + 1}:\n{summary}")

if not context or not summary:
logger.warning(f"Missing context or summary in AI response for chunk {chunk_index + 1}")
logger.warning("AI response structure:\n" + "\n".join(
f"- Line {i+1}: {line[:100]}..." for i, line in enumerate(result.split('\n'))
))

# Parse context back into dictionary if present
try:
context_dict = json.loads(context) if context else previous_context
except json.JSONDecodeError:
logger.error(f"Failed to parse context JSON for chunk {chunk_index + 1}")
logger.error(f"Invalid context content:\n{context}")
context_dict = previous_context

logger.info(f"AI Output for chunk {chunk_index + 1}/{total_chunks} (context: {len(context)} chars, summary: {len(summary)} chars)")
logger.info(f"Context: {context_dict}")
logger.info(f"Summary: {summary}")

return {
"context": context_dict,
"summary": summary or result # Fallback to full response if no summary tag
}

except Exception as e:
logger.error(f"Ollama processing failed: {str(e)}")
logger.error(f"AI processing error for chunk {chunk_index + 1}: {str(e)}")
await websocket.send_json({
'error': f'AI processing failed: {str(e)}'
})
raise

except Exception as e:
logger.error(f"Chunk processing failed: {str(e)}")
await websocket.send_json({
'error': f'Chunk processing failed: {str(e)}'
})
raise
26 changes: 15 additions & 11 deletions backend/app/services/pdf.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import fitz
from fastapi import WebSocket
from typing import Dict, Any
import time

from app.core.logging import logger
from app.services.text_extraction import process_pdf_page
from app.services.ai import process_chunk
from app.utils.text import split_into_chunks
from app.utils.time import estimate_processing_time, estimate_remaining_time
from app.utils.time import estimator
from app.core.config import settings

async def process_pdf(pdf_path: str, websocket: WebSocket) -> str:
Expand Down Expand Up @@ -51,37 +53,39 @@ async def process_pdf(pdf_path: str, websocket: WebSocket) -> str:
'total_chunks': total_chunks,
'current_chunk': 0,
'progress': 0,
'estimated_time': estimate_processing_time(total_chunks)
'estimated_time': estimator.estimate_total_time(total_chunks)
})

# Process chunks with AI
summaries = []
running_summary = None
current_context: Dict[str, Any] = None
for i, chunk in enumerate(chunks):
summary = await process_chunk(
result = await process_chunk(
chunk,
websocket,
chunk_index=i,
total_chunks=total_chunks,
previous_summary=running_summary
previous_context=current_context
)
summaries.append(summary)
# Keep only the most recent summary for context
running_summary = summary
summaries.append(result['summary'])
current_context = result['context']

await websocket.send_json({
'status': 'analyzing',
'current_chunk': i + 1,
'total_chunks': total_chunks,
'progress': (i + 1) / total_chunks,
'estimated_remaining': estimate_remaining_time(i + 1, total_chunks)
'estimated_remaining': estimator.estimate_remaining_time(i + 1, total_chunks)
})

# Combine summaries with proper section numbering and formatting
sections = []
for i, summary in enumerate(summaries, 1):
section_header = f"## Section {i}"
sections.append(f"{section_header}\n\n{summary}")
if summary.startswith('#'): # If summary already has a heading
sections.append(summary)
else: # Add section number if no heading
section_header = f"## Section {i}"
sections.append(f"{section_header}\n\n{summary}")

final_summary = "# Document Summary\n\n" + "\n\n".join(sections)
logger.info(f"Final Combined Summary:\n{final_summary}")
Expand Down
Loading