poiley · poiley · Jan 4, 2025 · Jan 4, 2025 · Jan 4, 2025 · Jan 5, 2025
diff --git a/.env b/.env
@@ -4,7 +4,8 @@ OLLAMA_PORT=11434
 OLLAMA_MODEL=mistral
 CHUNK_SIZE=3000
 TOKEN_ENCODING=cl100k_base
-PROMPT_FILE=prompts/default.txt
+PROMPT_FILE=prompts/mistral.txt
 HEALTH_CHECK_TIMEOUT=5.0
 OLLAMA_TIMEOUT=30.0 
-OLLAMA_MEMORY=12G
+OLLAMA_MEMORY=12G
+DEFAULT_CHUNK_TIME=40.0
diff --git a/.github/workflows/build-backend.yml b/.github/workflows/build-backend.yml
@@ -22,7 +22,7 @@ jobs:
       - name: Install Dependencies
         run: |
           cd backend
-          uv venv
+          uv venv
           source .venv/bin/activate
           uv pip install -r requirements.txt
 

diff --git a/README.md b/README.md
@@ -3,9 +3,9 @@
 Analyze, summarize, and explain information in PDF textbooks and whitepapers using OCR and AI analysis. Upload PDFs to get markdown summaries and analysis with real-time progress tracking.
 
 ## Features
-- PDF text extraction with OCR
+- PDF text extraction with PyMuPDF
 - Real-time tracking of analysis progress
-- Chunked file upload
+- Chunked file upload to Ollama
 - Automatic error recovery
 - Memory-efficient processing
 - Markdown formatted output
@@ -57,8 +57,9 @@ bun run dev
 
 # Backend (default: http://localhost:8000)
 cd backend
-python -m venv venv
-source venv/bin/activate
+pip install uv
+uv venv
+source .venv/bin/activate
 pip install -r requirements.txt
 uvicorn main:app --reload
 ```

diff --git a/backend/Dockerfile b/backend/Dockerfile
@@ -1,6 +1,8 @@
 FROM python:3.12-slim
 WORKDIR /app
 
+ARG VERSION
+
 LABEL version=$VERSION
 
 # Update pip
@@ -17,4 +19,4 @@ COPY requirements.txt .
 RUN uv pip install --system -r requirements.txt
 
 COPY . .
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "$BACKEND_PORT", "--lifespan=on", "--log-level=info"]
+CMD uvicorn main:app --host 0.0.0.0 --port $BACKEND_PORT --lifespan=on --log-level=info
diff --git a/backend/app/core/config.py b/backend/app/core/config.py
@@ -11,6 +11,7 @@ class Settings(BaseSettings):
     PROMPT_FILE: str = os.environ.get("PROMPT_FILE", "prompts/default.txt")
     HEALTH_CHECK_TIMEOUT: float = float(os.environ.get("HEALTH_CHECK_TIMEOUT", "5.0"))
     OLLAMA_TIMEOUT: float = float(os.environ.get("OLLAMA_TIMEOUT", "30.0"))
+    DEFAULT_CHUNK_TIME: float = float(os.environ.get("DEFAULT_CHUNK_TIME", "30.0"))
 
     class Config:
         env_file = ".env"

diff --git a/backend/app/services/ai.py b/backend/app/services/ai.py
@@ -1,7 +1,8 @@
 from tenacity import retry, stop_after_attempt, wait_exponential
 import ollama
+import json
 from fastapi import WebSocket
-from typing import Optional
+from typing import Optional, Dict, Any
 
 from app.core.logging import logger
 from app.core.config import settings
@@ -12,10 +13,11 @@ async def load_prompt(prompt_path: str = None) -> str:
     """
     if prompt_path is None:
         prompt_path = settings.PROMPT_FILE
-
     try:
         with open(prompt_path, 'r') as file:
-            return file.read().strip()
+            content = file.read().strip()
+            logger.info(f"Loaded prompt from {prompt_path} (length: {len(content)} chars)")
+            return content
     except FileNotFoundError:
         logger.warning(f"Prompt file {prompt_path} not found, using default prompt")
         return """You are processing part of a document. Your task is to create a concise summary of this section while maintaining context with previous sections.
@@ -27,47 +29,178 @@ async def load_prompt(prompt_path: str = None) -> str:
 4. Use clear paragraph breaks for different topics
 5. Avoid repeating information that was covered in previous sections"""
 
-@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
-async def process_chunk(chunk: str, websocket: WebSocket, chunk_index: int, total_chunks: int, previous_summary: Optional[str] = None) -> str:
-    try:
-        prompt_template = await load_prompt('default')
-        context = f"This is chunk {chunk_index + 1} of {total_chunks}."
+def extract_tag_content(text: str, tag: str) -> str:
+    """Extract the content between the first XML-style <tag>...</tag> pair.
+
+    The closing tag is searched for only after the opening tag, so a stray
+    closing tag earlier in the text is ignored instead of producing an empty
+    slice. Nested or repeated tags are not parsed; the first pair wins.
+    No separate <result>...</result> lookup is needed: that wrapper's content
+    is a substring of text, so searching text already covers it.
+
+    Args:
+        text: Raw text to search.
+        tag: Tag name without angle brackets, e.g. "summary".
+
+    Returns:
+        The stripped content, or "" if either tag is missing.
+    """
+    start_tag, end_tag = f"<{tag}>", f"</{tag}>"
+    start = text.find(start_tag)
+    if start == -1:
+        return ""
+    start += len(start_tag)
+    end = text.find(end_tag, start)
+    if end == -1:
+        return ""
+    return text[start:end].strip()
+
+def validate_markdown_structure(summary: str) -> str:
+    """Return summary as-is if any line starts with '#', else prepend a '# Summary' heading."""
+    for text_line in summary.split('\n'):
+        if text_line.startswith('#'):
+            return summary
+    return f"# Summary\n\n{summary}"
+
+@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
+async def process_chunk(
+    chunk: str, 
+    websocket: WebSocket, 
+    chunk_index: int, 
+    total_chunks: int, 
+    previous_context: Optional[Dict[str, Any]] = None
+) -> Dict[str, Any]:
+    """
+    Process a document chunk and return both context and summary.
+
+    Args:
+        chunk: The text chunk to process
+        websocket: WebSocket used to report AI processing errors to the client
+        chunk_index: Zero-based index of the current chunk
+        total_chunks: Total number of chunks
+        previous_context: Context dict from the previous chunk; None starts from an empty template
 
-        # Pass more focused context about previous content
-        if previous_summary:
-            # Extract key points from previous summary to maintain flow
-            key_points = previous_summary.split('\n')[-5:]  # Take last few key points
-            points_text = '\n'.join(key_points)
-            context = f"{context}\n\nKey points from previous section:\n{points_text}\n\nContinue the document flow, focusing on new information while maintaining narrative coherence."
+    Returns:
+        Dict with 'context' (parsed dict, or previous_context if missing/invalid) and 'summary' (str; the raw response if no <summary> tag)
+    """
+    try:
+        prompt_template = await load_prompt()
 
-        full_prompt = f"{prompt_template}\n\n{context}\n\nText to analyze:\n{chunk}"
+        # Initialize or maintain context
+        if previous_context is None:
+            previous_context = {
+                "metadata": {
+                    "partial_titles": [],
+                    "current_depth": 1,
+                    "pending_sections": []
+                },
+                "content": {
+                    "active_concepts": [],
+                    "knowledge_chain": []
+                }
+            }
 
-        logger.info(f"Processing chunk {chunk_index + 1}/{total_chunks} (length: {len(chunk)} chars):\n{chunk}")
-        logger.info("Chunk boundary marker ----")
+        # Prepare input section
+        inputs = f"""<Inputs>
+{chunk}: str  # Current text section for analysis
+{chunk_index == 0}: bool  # Start of new document flag
+{json.dumps(previous_context, indent=2)}: dict  # Previous context
+</Inputs>"""
 
+        full_prompt = f"{prompt_template}\n\n{inputs}"
+
+        logger.info(f"Processing chunk {chunk_index + 1}/{total_chunks} (length: {len(chunk)} chars)")
+
         try:
             response = ollama.chat(
                 model=settings.OLLAMA_MODEL,
-                messages=[{
-                    'role': 'user',
-                    'content': full_prompt
-                }],
+                messages=[
+                    {
+                        'role': 'system',
+                        'content': '''Your response must be in this exact format:
+<context>
+{
+    "metadata": {
+        "partial_titles": [],
+        "current_depth": 1,
+        "pending_sections": []
+    },
+    "content": {
+        "active_concepts": [],
+        "knowledge_chain": []
+    }
+}
+</context>
+
+<summary>
+# [Section Title]
+
+## Overview
+[One paragraph overview]
+
+## New Concepts
+### [Concept Name]
+- Definition: [Clear definition]
+- Example: [Concrete example]
+- Prerequisites: [Required concepts]
+
+## Technical Implementation
+[Implementation details with examples]
+
+## Related Concepts
+[List of related concepts with brief connections]
+</summary>'''
+                    },
+                    {
+                        'role': 'user',
+                        'content': full_prompt
+                    }
+                ],
                 stream=False
             )
             result = response['message']['content']
-            logger.info(f"AI Output for chunk {chunk_index + 1}/{total_chunks} (length: {len(result)} chars):\n{result}")
-            return result
+
+            # Add detailed logging of AI response
+            logger.debug(f"Raw AI response for chunk {chunk_index + 1}:\n{result}")
+
+            # Extract both context and summary
+            context = extract_tag_content(result, "context")
+            summary = extract_tag_content(result, "summary")
+
+            # Log extracted content
+            logger.debug(f"Extracted context for chunk {chunk_index + 1}:\n{context}")
+            logger.debug(f"Extracted summary for chunk {chunk_index + 1}:\n{summary}")
+
+            if not context or not summary:
+                logger.warning(f"Missing context or summary in AI response for chunk {chunk_index + 1}")
+                logger.warning("AI response structure:\n" + "\n".join(
+                    f"- Line {i+1}: {line[:100]}..." for i, line in enumerate(result.split('\n'))
+                ))
+
+            # Parse context back into dictionary if present
+            try:
+                context_dict = json.loads(context) if context else previous_context
+            except json.JSONDecodeError:
+                logger.error(f"Failed to parse context JSON for chunk {chunk_index + 1}")
+                logger.error(f"Invalid context content:\n{context}")
+                context_dict = previous_context
+
+            logger.info(f"AI Output for chunk {chunk_index + 1}/{total_chunks} (context: {len(context)} chars, summary: {len(summary)} chars)")
+            logger.info(f"Context: {context_dict}")
+            logger.info(f"Summary: {summary}")
+
+            return {
+                "context": context_dict,
+                "summary": summary or result  # Fallback to full response if no summary tag
+            }
 
         except Exception as e:
-            logger.error(f"Ollama processing failed: {str(e)}")
+            logger.error(f"AI processing error for chunk {chunk_index + 1}: {str(e)}")
             await websocket.send_json({
                 'error': f'AI processing failed: {str(e)}'
             })
             raise
 
     except Exception as e:
         logger.error(f"Chunk processing failed: {str(e)}")
-        await websocket.send_json({
-            'error': f'Chunk processing failed: {str(e)}'
-        })
         raise 
diff --git a/backend/app/services/pdf.py b/backend/app/services/pdf.py
@@ -1,11 +1,13 @@
 import fitz
 from fastapi import WebSocket
+from typing import Dict, Any
+import time
 
 from app.core.logging import logger
 from app.services.text_extraction import process_pdf_page
 from app.services.ai import process_chunk
 from app.utils.text import split_into_chunks
-from app.utils.time import estimate_processing_time, estimate_remaining_time
+from app.utils.time import estimator
 from app.core.config import settings
 
 async def process_pdf(pdf_path: str, websocket: WebSocket) -> str:
@@ -51,37 +53,39 @@ async def process_pdf(pdf_path: str, websocket: WebSocket) -> str:
             'total_chunks': total_chunks,
             'current_chunk': 0,
             'progress': 0,
-            'estimated_time': estimate_processing_time(total_chunks)
+            'estimated_time': estimator.estimate_total_time(total_chunks)
         })
 
         # Process chunks with AI
         summaries = []
-        running_summary = None
+        current_context: Dict[str, Any] = None
         for i, chunk in enumerate(chunks):
-            summary = await process_chunk(
+            result = await process_chunk(
                 chunk, 
                 websocket, 
                 chunk_index=i,
                 total_chunks=total_chunks,
-                previous_summary=running_summary
+                previous_context=current_context
             )
-            summaries.append(summary)
-            # Keep only the most recent summary for context
-            running_summary = summary
+            summaries.append(result['summary'])
+            current_context = result['context']
 
             await websocket.send_json({
                 'status': 'analyzing',
                 'current_chunk': i + 1,
                 'total_chunks': total_chunks,
                 'progress': (i + 1) / total_chunks,
-                'estimated_remaining': estimate_remaining_time(i + 1, total_chunks)
+                'estimated_remaining': estimator.estimate_remaining_time(i + 1, total_chunks)
             })
 
         # Combine summaries with proper section numbering and formatting
         sections = []
         for i, summary in enumerate(summaries, 1):
-            section_header = f"## Section {i}"
-            sections.append(f"{section_header}\n\n{summary}")
+            if summary.startswith('#'):  # If summary already has a heading
+                sections.append(summary)
+            else:  # Add section number if no heading
+                section_header = f"## Section {i}"
+                sections.append(f"{section_header}\n\n{summary}")
 
         final_summary = "# Document Summary\n\n" + "\n\n".join(sections)
         logger.info(f"Final Combined Summary:\n{final_summary}")