diff --git a/CLAUDE.md b/CLAUDE.md index 299a106..27999ff 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -16,7 +16,7 @@ Code Graph Knowledge System is a Neo4j-based intelligent knowledge management sy - **Task Queue System** (`services/task_queue.py`, `monitoring/task_monitor.py`): Async background processing with web monitoring - **MCP Server** (`mcp_server.py`, `start_mcp.py`): Model Context Protocol integration for AI assistants using official MCP SDK - Modular architecture with handlers in `mcp_tools/` package - - 25 tools across 5 categories (Knowledge, Code Graph, Memory, Tasks, System) + - 30 tools across 6 categories (Knowledge, Code Graph, Memory, Memory Extraction v0.7, Tasks, System) - Official SDK features: session management, streaming support, multi-transport ### Multi-Provider LLM Support @@ -36,7 +36,7 @@ Configuration is handled via environment variables in `.env` file (see `env.exam # Start main application python start.py -# Start MCP server (Official MCP SDK - All 25 tools) +# Start MCP server (Official MCP SDK - All 30 tools) python start_mcp.py # Using script entry points (after uv sync) @@ -261,8 +261,9 @@ When AI agents work on a project over time, they need to remember: ### MCP Tools for AI Agents -The Memory Store provides 7 MCP tools (available in Claude Desktop, VSCode with MCP, etc.): +The Memory Store provides 12 MCP tools (available in Claude Desktop, VSCode with MCP, etc.): +**Manual Memory Management (v0.6):** 1. **add_memory**: Save new project knowledge 2. **search_memories**: Find relevant memories when starting tasks 3. **get_memory**: Retrieve specific memory by ID @@ -271,6 +272,13 @@ The Memory Store provides 7 MCP tools (available in Claude Desktop, VSCode with 6. **supersede_memory**: Create new memory that replaces old one 7. **get_project_summary**: Get overview of all project memories +**Automatic Extraction (v0.7):** +8. **extract_from_conversation**: Extract memories from AI conversations +9. 
**extract_from_git_commit**: Analyze git commits for decisions and experiences +10. **extract_from_code_comments**: Mine TODO, FIXME, NOTE markers from code +11. **suggest_memory_from_query**: Suggest memories from knowledge base Q&A +12. **batch_extract_from_repository**: Comprehensive repository analysis + ### Typical AI Agent Workflow ``` @@ -370,11 +378,89 @@ supersede_memory( - User can manually add memories via API - Full control over what gets saved -**Future (v0.7+)**: Automatic extraction -- Extract from git commits -- Mine from code comments -- Analyze conversations -- Auto-suggest important memories +**v0.7 (Current)**: Automatic extraction (IMPLEMENTED ✅) +- ✅ Extract from git commits - LLM-powered analysis of commit messages and changes +- ✅ Mine from code comments - Automatic extraction of TODO, FIXME, NOTE, DECISION markers +- ✅ Analyze conversations - Extract decisions and learnings from AI conversations +- ✅ Auto-suggest important memories - Intelligent Q&A analysis for knowledge worth saving +- ✅ Batch repository extraction - Comprehensive analysis of entire codebase + +#### v0.7 Automatic Extraction Features + +**1. Conversation Analysis** +```python +# HTTP API +POST /api/v1/memory/extract/conversation +{ + "project_id": "myapp", + "conversation": [ + {"role": "user", "content": "Should we use Redis?"}, + {"role": "assistant", "content": "Yes, Redis is best for caching..."} + ], + "auto_save": false # Set true to auto-save high-confidence (>0.7) memories +} + +# MCP Tool +extract_from_conversation(project_id, conversation, auto_save) +``` + +**2. 
Git Commit Analysis** +```python +# HTTP API +POST /api/v1/memory/extract/commit +{ + "project_id": "myapp", + "commit_sha": "abc123...", + "commit_message": "feat: add JWT authentication\n\nImplemented JWT for stateless auth", + "changed_files": ["src/auth/jwt.py", "src/middleware/auth.py"], + "auto_save": true +} + +# MCP Tool +extract_from_git_commit(project_id, commit_sha, commit_message, changed_files, auto_save) +``` + +**3. Code Comment Mining** +```python +# HTTP API +POST /api/v1/memory/extract/comments +{ + "project_id": "myapp", + "file_path": "/path/to/file.py" +} + +# MCP Tool +extract_from_code_comments(project_id, file_path) +``` + +**4. Query-based Memory Suggestions** +```python +# HTTP API +POST /api/v1/memory/suggest +{ + "project_id": "myapp", + "query": "How does authentication work?", + "answer": "The system uses JWT tokens..." +} + +# MCP Tool +suggest_memory_from_query(project_id, query, answer) +``` + +**5. Batch Repository Extraction** +```python +# HTTP API +POST /api/v1/memory/extract/batch +{ + "project_id": "myapp", + "repo_path": "/path/to/repo", + "max_commits": 50, + "file_patterns": ["*.py", "*.js"] +} + +# MCP Tool +batch_extract_from_repository(project_id, repo_path, max_commits, file_patterns) +``` ### Best Practices diff --git a/api/memory_routes.py b/api/memory_routes.py index 82540e5..0445b68 100644 --- a/api/memory_routes.py +++ b/api/memory_routes.py @@ -12,6 +12,7 @@ from typing import Optional, List, Dict, Any, Literal from services.memory_store import memory_store +from services.memory_extractor import memory_extractor from loguru import logger @@ -112,6 +113,97 @@ class Config: } +# ============================================================================ +# v0.7 Extraction Request Models +# ============================================================================ + +class ExtractFromConversationRequest(BaseModel): + """Request model for extracting memories from conversation""" + project_id: str = Field(..., 
description="Project identifier") + conversation: List[Dict[str, str]] = Field(..., description="Conversation messages") + auto_save: bool = Field(False, description="Auto-save high-confidence memories") + + class Config: + json_schema_extra = { + "example": { + "project_id": "myapp", + "conversation": [ + {"role": "user", "content": "Should we use Redis or Memcached?"}, + {"role": "assistant", "content": "Let's use Redis because it supports data persistence"} + ], + "auto_save": False + } + } + + +class ExtractFromGitCommitRequest(BaseModel): + """Request model for extracting memories from git commit""" + project_id: str = Field(..., description="Project identifier") + commit_sha: str = Field(..., description="Git commit SHA") + commit_message: str = Field(..., description="Commit message") + changed_files: List[str] = Field(..., description="List of changed files") + auto_save: bool = Field(False, description="Auto-save high-confidence memories") + + class Config: + json_schema_extra = { + "example": { + "project_id": "myapp", + "commit_sha": "abc123def456", + "commit_message": "feat: add JWT authentication\n\nImplemented JWT-based auth for stateless API", + "changed_files": ["src/auth/jwt.py", "src/middleware/auth.py"], + "auto_save": True + } + } + + +class ExtractFromCodeCommentsRequest(BaseModel): + """Request model for extracting memories from code comments""" + project_id: str = Field(..., description="Project identifier") + file_path: str = Field(..., description="Path to source file") + + class Config: + json_schema_extra = { + "example": { + "project_id": "myapp", + "file_path": "/path/to/project/src/service.py" + } + } + + +class SuggestMemoryRequest(BaseModel): + """Request model for suggesting memory from query""" + project_id: str = Field(..., description="Project identifier") + query: str = Field(..., description="User query") + answer: str = Field(..., description="LLM answer") + + class Config: + json_schema_extra = { + "example": { + 
"project_id": "myapp", + "query": "How does the authentication work?", + "answer": "The system uses JWT tokens with refresh token rotation..." + } + } + + +class BatchExtractRequest(BaseModel): + """Request model for batch extraction from repository""" + project_id: str = Field(..., description="Project identifier") + repo_path: str = Field(..., description="Path to git repository") + max_commits: int = Field(50, ge=1, le=200, description="Maximum commits to analyze") + file_patterns: Optional[List[str]] = Field(None, description="File patterns to scan") + + class Config: + json_schema_extra = { + "example": { + "project_id": "myapp", + "repo_path": "/path/to/repository", + "max_commits": 50, + "file_patterns": ["*.py", "*.js"] + } + } + + # ============================================================================ # API Endpoints # ============================================================================ @@ -351,6 +443,166 @@ async def get_project_summary(project_id: str) -> Dict[str, Any]: raise HTTPException(status_code=500, detail=str(e)) +# ============================================================================ +# v0.7 Automatic Extraction Endpoints +# ============================================================================ + +@router.post("/extract/conversation") +async def extract_from_conversation(request: ExtractFromConversationRequest) -> Dict[str, Any]: + """ + Extract memories from a conversation using LLM analysis. + + Analyzes conversation for important decisions, preferences, and experiences. + Can auto-save high-confidence memories or return suggestions for manual review. 
+ + Returns: + Extracted memories with confidence scores + """ + try: + result = await memory_extractor.extract_from_conversation( + project_id=request.project_id, + conversation=request.conversation, + auto_save=request.auto_save + ) + + if not result.get("success"): + raise HTTPException(status_code=400, detail=result.get("error", "Extraction failed")) + + return result + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error in extract_from_conversation endpoint: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/extract/commit") +async def extract_from_git_commit(request: ExtractFromGitCommitRequest) -> Dict[str, Any]: + """ + Extract memories from a git commit using LLM analysis. + + Analyzes commit message and changes to identify important decisions, + bug fixes, and architectural changes. + + Returns: + Extracted memories from the commit + """ + try: + result = await memory_extractor.extract_from_git_commit( + project_id=request.project_id, + commit_sha=request.commit_sha, + commit_message=request.commit_message, + changed_files=request.changed_files, + auto_save=request.auto_save + ) + + if not result.get("success"): + raise HTTPException(status_code=400, detail=result.get("error", "Extraction failed")) + + return result + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error in extract_from_git_commit endpoint: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/extract/comments") +async def extract_from_code_comments(request: ExtractFromCodeCommentsRequest) -> Dict[str, Any]: + """ + Extract memories from code comments in a source file. + + Identifies special markers like TODO, FIXME, NOTE, DECISION and + extracts them as structured memories. 
+ + Returns: + Extracted memories from code comments + """ + try: + result = await memory_extractor.extract_from_code_comments( + project_id=request.project_id, + file_path=request.file_path + ) + + if not result.get("success"): + raise HTTPException(status_code=400, detail=result.get("error", "Extraction failed")) + + return result + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error in extract_from_code_comments endpoint: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/suggest") +async def suggest_memory_from_query(request: SuggestMemoryRequest) -> Dict[str, Any]: + """ + Suggest creating a memory based on a knowledge query and answer. + + Uses LLM to determine if the Q&A represents important knowledge + worth saving for future sessions. + + Returns: + Memory suggestion with confidence score (not auto-saved) + """ + try: + result = await memory_extractor.suggest_memory_from_query( + project_id=request.project_id, + query=request.query, + answer=request.answer + ) + + if not result.get("success"): + raise HTTPException(status_code=400, detail=result.get("error", "Suggestion failed")) + + return result + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error in suggest_memory_from_query endpoint: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/extract/batch") +async def batch_extract_from_repository(request: BatchExtractRequest) -> Dict[str, Any]: + """ + Batch extract memories from an entire repository. + + Analyzes: + - Recent git commits + - Code comments in source files + - Documentation files (README, CHANGELOG, etc.) + + This is a comprehensive operation that may take several minutes. 
+ + Returns: + Summary of extracted memories by source type + """ + try: + result = await memory_extractor.batch_extract_from_repository( + project_id=request.project_id, + repo_path=request.repo_path, + max_commits=request.max_commits, + file_patterns=request.file_patterns + ) + + if not result.get("success"): + raise HTTPException(status_code=400, detail=result.get("error", "Batch extraction failed")) + + return result + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error in batch_extract_from_repository endpoint: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + # ============================================================================ # Health Check # ============================================================================ @@ -366,5 +618,6 @@ async def memory_health() -> Dict[str, Any]: return { "service": "memory_store", "status": "healthy" if memory_store._initialized else "not_initialized", - "initialized": memory_store._initialized + "initialized": memory_store._initialized, + "extraction_enabled": memory_extractor.extraction_enabled } diff --git a/mcp_server.py b/mcp_server.py index baa9ef7..ea4e6c1 100644 --- a/mcp_server.py +++ b/mcp_server.py @@ -41,6 +41,7 @@ # Import services from services.neo4j_knowledge_service import Neo4jKnowledgeService from services.memory_store import memory_store +from services.memory_extractor import memory_extractor from services.task_queue import task_queue, TaskStatus, submit_document_processing_task, submit_directory_processing_task from services.task_processors import processor_registry from services.graph_service import graph_service @@ -69,6 +70,13 @@ handle_delete_memory, handle_supersede_memory, handle_get_project_summary, + # v0.7 Extraction handlers + handle_extract_from_conversation, + handle_extract_from_git_commit, + handle_extract_from_code_comments, + handle_suggest_memory_from_query, + handle_batch_extract_from_repository, + # Task handlers 
handle_get_task_status, handle_watch_task, handle_watch_tasks, diff --git a/mcp_tools/__init__.py b/mcp_tools/__init__.py index b5530c3..a47defd 100644 --- a/mcp_tools/__init__.py +++ b/mcp_tools/__init__.py @@ -31,6 +31,12 @@ handle_delete_memory, handle_supersede_memory, handle_get_project_summary, + # v0.7 Automatic extraction + handle_extract_from_conversation, + handle_extract_from_git_commit, + handle_extract_from_code_comments, + handle_suggest_memory_from_query, + handle_batch_extract_from_repository, ) # Task management handlers @@ -83,6 +89,12 @@ "handle_delete_memory", "handle_supersede_memory", "handle_get_project_summary", + # v0.7 Extraction handlers + "handle_extract_from_conversation", + "handle_extract_from_git_commit", + "handle_extract_from_code_comments", + "handle_suggest_memory_from_query", + "handle_batch_extract_from_repository", # Task handlers "handle_get_task_status", "handle_watch_task", diff --git a/mcp_tools/memory_handlers.py b/mcp_tools/memory_handlers.py index 6cf1d5a..72efb7e 100644 --- a/mcp_tools/memory_handlers.py +++ b/mcp_tools/memory_handlers.py @@ -9,6 +9,13 @@ - Delete memory - Supersede memory - Get project summary + +v0.7 Automatic Extraction: +- Extract from conversation +- Extract from git commit +- Extract from code comments +- Suggest memory from query +- Batch extract from repository """ from typing import Dict, Any @@ -166,3 +173,114 @@ async def handle_get_project_summary(args: Dict, memory_store) -> Dict: summary = result.get("summary", {}) logger.info(f"Project summary: {summary.get('total_memories', 0)} memories") return result + + +# ============================================================================ +# v0.7 Automatic Extraction Handlers +# ============================================================================ + +async def handle_extract_from_conversation(args: Dict, memory_extractor) -> Dict: + """ + Extract memories from conversation using LLM analysis. 
+ + Args: + args: Arguments containing project_id, conversation, auto_save + memory_extractor: Memory extractor instance + + Returns: + Extracted memories with confidence scores + """ + result = await memory_extractor.extract_from_conversation( + project_id=args["project_id"], + conversation=args["conversation"], + auto_save=args.get("auto_save", False) + ) + if result.get("success"): + logger.info(f"Extracted {result.get('total_extracted', 0)} memories from conversation") + return result + + +async def handle_extract_from_git_commit(args: Dict, memory_extractor) -> Dict: + """ + Extract memories from git commit using LLM analysis. + + Args: + args: Arguments containing project_id, commit_sha, commit_message, changed_files, auto_save + memory_extractor: Memory extractor instance + + Returns: + Extracted memories from commit + """ + result = await memory_extractor.extract_from_git_commit( + project_id=args["project_id"], + commit_sha=args["commit_sha"], + commit_message=args["commit_message"], + changed_files=args["changed_files"], + auto_save=args.get("auto_save", False) + ) + if result.get("success"): + logger.info(f"Extracted {result.get('auto_saved_count', 0)} memories from commit {args['commit_sha'][:8]}") + return result + + +async def handle_extract_from_code_comments(args: Dict, memory_extractor) -> Dict: + """ + Extract memories from code comments in source file. + + Args: + args: Arguments containing project_id, file_path + memory_extractor: Memory extractor instance + + Returns: + Extracted memories from code comments + """ + result = await memory_extractor.extract_from_code_comments( + project_id=args["project_id"], + file_path=args["file_path"] + ) + if result.get("success"): + logger.info(f"Extracted {result.get('total_extracted', 0)} memories from {args['file_path']}") + return result + + +async def handle_suggest_memory_from_query(args: Dict, memory_extractor) -> Dict: + """ + Suggest creating memory based on knowledge query and answer. 
+ + Args: + args: Arguments containing project_id, query, answer + memory_extractor: Memory extractor instance + + Returns: + Memory suggestion with confidence (not auto-saved) + """ + result = await memory_extractor.suggest_memory_from_query( + project_id=args["project_id"], + query=args["query"], + answer=args["answer"] + ) + if result.get("success") and result.get("should_save"): + logger.info(f"Memory suggested from query: {result.get('suggested_memory', {}).get('title', 'N/A')}") + return result + + +async def handle_batch_extract_from_repository(args: Dict, memory_extractor) -> Dict: + """ + Batch extract memories from entire repository. + + Args: + args: Arguments containing project_id, repo_path, max_commits, file_patterns + memory_extractor: Memory extractor instance + + Returns: + Summary of extracted memories by source + """ + result = await memory_extractor.batch_extract_from_repository( + project_id=args["project_id"], + repo_path=args["repo_path"], + max_commits=args.get("max_commits", 50), + file_patterns=args.get("file_patterns") + ) + if result.get("success"): + logger.info(f"Batch extraction: {result.get('total_extracted', 0)} memories from {args['repo_path']}") + return result diff --git a/mcp_tools/tool_definitions.py b/mcp_tools/tool_definitions.py index 568d6fd..5f2bb8f 100644 --- a/mcp_tools/tool_definitions.py +++ b/mcp_tools/tool_definitions.py @@ -11,13 +11,14 @@ def get_tool_definitions() -> List[Tool]: """ - Get all 25 tool definitions for MCP server. + Get all 30 tool definitions for MCP server. 
Returns: List of Tool objects organized by category: - Knowledge Base (5 tools) - Code Graph (4 tools) - Memory Store (7 tools) + - Memory Extraction v0.7 (5 tools) - Task Management (6 tools) - System (3 tools) """ @@ -392,6 +393,149 @@ def get_tool_definitions() -> List[Tool]: } ), + # ===== Memory Extraction Tools (v0.7) - 5 tools ===== + Tool( + name="extract_from_conversation", + description="""Extract memories from conversation using LLM analysis (v0.7). + +Analyzes conversation messages to identify: +- Design decisions and rationale +- Problems encountered and solutions +- Preferences and conventions +- Important architectural choices + +Can auto-save high-confidence memories or return suggestions for manual review.""", + inputSchema={ + "type": "object", + "properties": { + "project_id": {"type": "string"}, + "conversation": { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": {"type": "string"}, + "content": {"type": "string"} + } + }, + "description": "List of conversation messages" + }, + "auto_save": { + "type": "boolean", + "default": False, + "description": "Auto-save high-confidence memories (>= 0.7)" + } + }, + "required": ["project_id", "conversation"] + } + ), + + Tool( + name="extract_from_git_commit", + description="""Extract memories from git commit using LLM analysis (v0.7). 
+ +Analyzes commit message and changed files to identify: +- Feature additions (decisions) +- Bug fixes (experiences) +- Refactoring (experiences/conventions) +- Breaking changes (high importance decisions)""", + inputSchema={ + "type": "object", + "properties": { + "project_id": {"type": "string"}, + "commit_sha": {"type": "string", "description": "Git commit SHA"}, + "commit_message": {"type": "string", "description": "Full commit message"}, + "changed_files": { + "type": "array", + "items": {"type": "string"}, + "description": "List of changed file paths" + }, + "auto_save": { + "type": "boolean", + "default": False, + "description": "Auto-save high-confidence memories" + } + }, + "required": ["project_id", "commit_sha", "commit_message", "changed_files"] + } + ), + + Tool( + name="extract_from_code_comments", + description="""Extract memories from code comments in source file (v0.7). + +Identifies special markers: +- TODO: → plan +- FIXME: / BUG: → experience +- NOTE: / IMPORTANT: → convention +- DECISION: → decision + +Extracts and saves as structured memories with file references.""", + inputSchema={ + "type": "object", + "properties": { + "project_id": {"type": "string"}, + "file_path": {"type": "string", "description": "Path to source file"} + }, + "required": ["project_id", "file_path"] + } + ), + + Tool( + name="suggest_memory_from_query", + description="""Suggest creating memory from knowledge base query (v0.7). + +Uses LLM to determine if Q&A represents important knowledge worth saving. +Returns suggestion with confidence score (not auto-saved). 
+ +Useful for: +- Frequently asked questions +- Important architectural information +- Non-obvious solutions or workarounds""", + inputSchema={ + "type": "object", + "properties": { + "project_id": {"type": "string"}, + "query": {"type": "string", "description": "User query"}, + "answer": {"type": "string", "description": "LLM answer"} + }, + "required": ["project_id", "query", "answer"] + } + ), + + Tool( + name="batch_extract_from_repository", + description="""Batch extract memories from entire repository (v0.7). + +Comprehensive analysis of: +- Recent git commits (configurable count) +- Code comments in source files +- Documentation files (README, CHANGELOG, etc.) + +This is a long-running operation that may take several minutes. +Returns summary of extracted memories by source type.""", + inputSchema={ + "type": "object", + "properties": { + "project_id": {"type": "string"}, + "repo_path": {"type": "string", "description": "Path to git repository"}, + "max_commits": { + "type": "integer", + "minimum": 1, + "maximum": 200, + "default": 50, + "description": "Maximum commits to analyze" + }, + "file_patterns": { + "type": "array", + "items": {"type": "string"}, + "description": "File patterns to scan (e.g., ['*.py', '*.js'])" + } + }, + "required": ["project_id", "repo_path"] + } + ), + # ===== Task Management Tools (6) ===== Tool( name="get_task_status", diff --git a/services/memory_extractor.py b/services/memory_extractor.py index a0ac0e7..1423268 100644 --- a/services/memory_extractor.py +++ b/services/memory_extractor.py @@ -1,16 +1,23 @@ """ -Memory Extractor - Automatic Memory Extraction (Future Extension) +Memory Extractor - Automatic Memory Extraction (v0.7) -This module provides interfaces for automatically extracting memories from: -- Code changes and commits +This module provides automatic extraction of project memories from: +- Git commits and diffs +- Code comments and documentation - Conversations and interactions -- Documentation and comments +- 
Knowledge base queries -Currently provides skeleton/placeholder implementations. -Full implementation planned for future versions. +Uses LLM analysis to identify and extract important project knowledge. """ -from typing import Dict, Any, List, Optional +import ast +import re +import subprocess +from datetime import datetime +from pathlib import Path +from typing import Dict, Any, List, Optional, Tuple + +from llama_index.core import Settings from loguru import logger from services.memory_store import memory_store @@ -20,16 +27,26 @@ class MemoryExtractor: """ Extract and automatically persist project memories from various sources. - Future Extensions: + Features: - LLM-based extraction from conversations - - Git commit analysis for decisions - - Code comment mining for conventions - - Issue/PR analysis for experiences + - Git commit analysis for decisions and experiences + - Code comment mining for conventions and plans + - Auto-suggest memories from knowledge queries """ + # Processing limits + MAX_COMMITS_TO_PROCESS = 20 # Maximum commits to analyze in batch processing + MAX_FILES_TO_SAMPLE = 30 # Maximum files to scan for comments + MAX_ITEMS_PER_TYPE = 3 # Top items per memory type to include + MAX_README_LINES = 20 # Maximum README lines to process for overview + MAX_STRING_EXCERPT_LENGTH = 200 # Maximum length for string excerpts in responses + MAX_CONTENT_LENGTH = 500 # Maximum length for content fields + MAX_TITLE_LENGTH = 100 # Maximum length for title fields + def __init__(self): - self.extraction_enabled = False # Feature flag for future implementation - logger.info("Memory Extractor initialized (placeholder mode)") + self.extraction_enabled = True + self.confidence_threshold = 0.7 # Threshold for auto-saving + logger.info("Memory Extractor initialized (v0.7 - full implementation)") async def extract_from_conversation( self, @@ -38,37 +55,122 @@ async def extract_from_conversation( auto_save: bool = False ) -> Dict[str, Any]: """ - Extract memories from 
a conversation between user and AI. + Extract memories from a conversation between user and AI using LLM analysis. - Future Implementation Plan: - 1. Use LLM to analyze conversation for: - - Design decisions and rationale - - Problems encountered and solutions - - Preferences and conventions mentioned - 2. Identify importance based on emphasis and repetition - 3. Extract relevant code references - 4. Optionally auto-save high-confidence extractions + Analyzes conversation for: + - Design decisions and rationale + - Problems encountered and solutions + - Preferences and conventions mentioned + - Important architectural choices Args: project_id: Project identifier conversation: List of messages [{"role": "user/assistant", "content": "..."}] - auto_save: If True, automatically save high-confidence memories + auto_save: If True, automatically save high-confidence memories (>= threshold) Returns: Dict with extracted memories and confidence scores - - Current Status: PLACEHOLDER - Returns empty list """ - logger.warning("extract_from_conversation called but not yet implemented") - - # Placeholder return structure - return { - "success": True, - "extracted_memories": [], - "auto_saved_count": 0, - "suggestions": [], - "implementation_status": "placeholder - planned for v0.7" - } + try: + logger.info(f"Extracting memories from conversation ({len(conversation)} messages)") + + # Format conversation for LLM analysis + conversation_text = self._format_conversation(conversation) + + # Create extraction prompt + extraction_prompt = f"""Analyze the following conversation between a user and an AI assistant working on a software project. + +Extract important project knowledge that should be saved as memories. For each memory, identify: +1. Type: decision, preference, experience, convention, plan, or note +2. Title: A concise summary (max 100 chars) +3. Content: Detailed description +4. Reason: Why this is important or rationale +5. 
Tags: Relevant tags (e.g., architecture, database, auth) +6. Importance: Score from 0.0 to 1.0 (critical decisions = 0.9+, preferences = 0.5-0.7) +7. Confidence: How confident you are in this extraction (0.0 to 1.0) + +Only extract significant information worth remembering for future sessions. Ignore casual chat. + +Conversation: +{conversation_text} + +Respond with a JSON array of extracted memories. Each memory should have this structure: +{{ + "type": "decision|preference|experience|convention|plan|note", + "title": "Brief title", + "content": "Detailed content", + "reason": "Why this matters", + "tags": ["tag1", "tag2"], + "importance": 0.8, + "confidence": 0.9 +}} + +If no significant memories found, return an empty array: []""" + + # Use LlamaIndex LLM to analyze + llm = Settings.llm + if not llm: + raise ValueError("LLM not initialized in Settings") + + response = await llm.acomplete(extraction_prompt) + response_text = str(response).strip() + + # Parse LLM response (extract JSON) + memories = self._parse_llm_json_response(response_text) + + # Filter by confidence and auto-save if enabled + auto_saved_count = 0 + extracted_memories = [] + suggestions = [] + + for mem in memories: + confidence = mem.get("confidence", 0.5) + mem_data = { + "type": mem.get("type", "note"), + "title": mem.get("title", "Untitled"), + "content": mem.get("content", ""), + "reason": mem.get("reason"), + "tags": mem.get("tags", []), + "importance": mem.get("importance", 0.5) + } + + if auto_save and confidence >= self.confidence_threshold: + # Auto-save high-confidence memories + result = await memory_store.add_memory( + project_id=project_id, + memory_type=mem_data["type"], + title=mem_data["title"], + content=mem_data["content"], + reason=mem_data["reason"], + tags=mem_data["tags"], + importance=mem_data["importance"], + metadata={"source": "conversation", "confidence": confidence} + ) + if result.get("success"): + auto_saved_count += 1 + extracted_memories.append({**mem_data, 
"memory_id": result["memory_id"], "auto_saved": True}) + else: + # Suggest for manual review + suggestions.append({**mem_data, "confidence": confidence}) + + logger.success(f"Extracted {len(memories)} memories ({auto_saved_count} auto-saved)") + + return { + "success": True, + "extracted_memories": extracted_memories, + "auto_saved_count": auto_saved_count, + "suggestions": suggestions, + "total_extracted": len(memories) + } + + except Exception as e: + logger.error(f"Failed to extract from conversation: {e}") + return { + "success": False, + "error": str(e), + "extracted_memories": [], + "auto_saved_count": 0 + } async def extract_from_git_commit( self, @@ -79,187 +181,697 @@ async def extract_from_git_commit( auto_save: bool = False ) -> Dict[str, Any]: """ - Extract memories from git commit information. - - Future Implementation Plan: - 1. Analyze commit message for keywords: - - "refactor" → experience - - "fix" → experience - - "feat" → decision - - "docs" → convention - 2. Extract rationale from commit body - 3. Link to changed files - 4. Identify breaking changes → high importance + Extract memories from git commit information using LLM analysis. 
+ + Analyzes commit for: + - Feature additions (decisions) + - Bug fixes (experiences) + - Refactoring (experiences/conventions) + - Breaking changes (high importance decisions) Args: project_id: Project identifier commit_sha: Git commit SHA commit_message: Commit message (title + body) changed_files: List of file paths changed - auto_save: If True, automatically save + auto_save: If True, automatically save high-confidence memories Returns: Dict with extracted memories - - Current Status: PLACEHOLDER """ - logger.warning("extract_from_git_commit called but not yet implemented") - - return { - "success": True, - "extracted_memories": [], - "implementation_status": "placeholder - planned for v0.7" - } + try: + logger.info(f"Extracting memories from commit {commit_sha[:8]}") + + # Classify commit type from message + commit_type = self._classify_commit_type(commit_message) + + # Create extraction prompt + extraction_prompt = f"""Analyze this git commit and extract important project knowledge. + +Commit SHA: {commit_sha} +Commit Type: {commit_type} +Commit Message: +{commit_message} + +Changed Files: +{chr(10).join(f'- {f}' for f in changed_files[:20])} +{"..." if len(changed_files) > 20 else ""} + +Extract memories that represent important knowledge: +- For "feat" commits: architectural decisions, new features +- For "fix" commits: problems encountered and solutions +- For "refactor" commits: code improvements and rationale +- For "docs" commits: conventions and standards +- For breaking changes: critical decisions + +Respond with a JSON array of memories (same format as before). Consider: +1. Type: Choose appropriate type based on commit nature +2. Title: Brief description of the change +3. Content: What was done and why +4. Reason: Technical rationale or problem solved +5. Tags: Extract from file paths and commit message +6. Importance: Breaking changes = 0.9+, features = 0.7+, fixes = 0.5+ +7. 
Confidence: How significant is this commit + +Return empty array [] if this is routine maintenance or trivial changes.""" + + llm = Settings.llm + if not llm: + raise ValueError("LLM not initialized") + + response = await llm.acomplete(extraction_prompt) + memories = self._parse_llm_json_response(str(response).strip()) + + # Auto-save or suggest + auto_saved_count = 0 + extracted_memories = [] + suggestions = [] + + for mem in memories: + confidence = mem.get("confidence", 0.5) + mem_data = { + "type": mem.get("type", "note"), + "title": mem.get("title", commit_message.split('\n')[0][:100]), + "content": mem.get("content", ""), + "reason": mem.get("reason"), + "tags": mem.get("tags", []) + [commit_type], + "importance": mem.get("importance", 0.5), + "metadata": { + "source": "git_commit", + "commit_sha": commit_sha, + "changed_files": changed_files, + "confidence": confidence + } + } + + if auto_save and confidence >= self.confidence_threshold: + result = await memory_store.add_memory( + project_id=project_id, + memory_type=mem_data["type"], + title=mem_data["title"], + content=mem_data["content"], + reason=mem_data["reason"], + tags=mem_data["tags"], + importance=mem_data["importance"], + metadata=mem_data["metadata"] + ) + if result.get("success"): + auto_saved_count += 1 + extracted_memories.append({**mem_data, "memory_id": result["memory_id"]}) + else: + suggestions.append({**mem_data, "confidence": confidence}) + + logger.success(f"Extracted {len(memories)} memories from commit") + + return { + "success": True, + "extracted_memories": extracted_memories, + "auto_saved_count": auto_saved_count, + "suggestions": suggestions, + "commit_type": commit_type + } + + except Exception as e: + logger.error(f"Failed to extract from commit: {e}") + return { + "success": False, + "error": str(e) + } async def extract_from_code_comments( self, project_id: str, file_path: str, - comments: List[Dict[str, Any]] + comments: Optional[List[Dict[str, Any]]] = None ) -> Dict[str, 
Any]: """ Extract memories from code comments and docstrings. - Future Implementation Plan: - 1. Identify special comment markers: - - "TODO:" → plan - - "FIXME:" → experience - - "NOTE:" → convention - - "DECISION:" → decision (custom marker) - 2. Extract context from surrounding code - 3. Link to specific line numbers + Identifies special markers: + - "TODO:" → plan + - "FIXME:" / "BUG:" → experience + - "NOTE:" / "IMPORTANT:" → convention + - "DECISION:" → decision (custom marker) Args: project_id: Project identifier file_path: Path to source file - comments: List of comments with line numbers + comments: Optional list of pre-extracted comments with line numbers. + If None, will parse the file automatically. Returns: Dict with extracted memories - - Current Status: PLACEHOLDER """ - logger.warning("extract_from_code_comments called but not yet implemented") - - return { - "success": True, - "extracted_memories": [], - "implementation_status": "placeholder - planned for v0.7" - } + try: + logger.info(f"Extracting memories from code comments in {file_path}") + + # If comments not provided, extract them + if comments is None: + comments = self._extract_comments_from_file(file_path) + + if not comments: + return { + "success": True, + "extracted_memories": [], + "message": "No comments found" + } + + # Group comments by marker type + extracted = [] + + for comment in comments: + text = comment.get("text", "") + line_num = comment.get("line", 0) + + # Check for special markers + memory_data = self._classify_comment(text, file_path, line_num) + if memory_data: + extracted.append(memory_data) + + # If we have many comments, use LLM to analyze them together + if len(extracted) > 5: + logger.info(f"Using LLM to analyze {len(extracted)} comment markers") + # Batch analyze for better context + combined = self._combine_related_comments(extracted) + extracted = combined + + # Save extracted memories + saved_memories = [] + for mem_data in extracted: + # Add file extension as 
tag if file has an extension + file_tags = mem_data.get("tags", ["code-comment"]) + file_suffix = Path(file_path).suffix + if file_suffix: + file_tags = file_tags + [file_suffix[1:]] + + result = await memory_store.add_memory( + project_id=project_id, + memory_type=mem_data["type"], + title=mem_data["title"], + content=mem_data["content"], + reason=mem_data.get("reason"), + tags=file_tags, + importance=mem_data.get("importance", 0.4), + related_refs=[f"ref://file/{file_path}#{mem_data.get('line', 0)}"], + metadata={ + "source": "code_comment", + "file_path": file_path, + "line_number": mem_data.get("line", 0) + } + ) + if result.get("success"): + saved_memories.append({**mem_data, "memory_id": result["memory_id"]}) + + logger.success(f"Extracted {len(saved_memories)} memories from code comments") + + return { + "success": True, + "extracted_memories": saved_memories, + "total_comments": len(comments), + "total_extracted": len(saved_memories) + } + + except Exception as e: + logger.error(f"Failed to extract from code comments: {e}") + return { + "success": False, + "error": str(e) + } async def suggest_memory_from_query( self, project_id: str, query: str, answer: str, - source_nodes: List[Dict[str, Any]] + source_nodes: Optional[List[Dict[str, Any]]] = None ) -> Dict[str, Any]: """ Suggest creating a memory based on a knowledge base query. - Future Implementation Plan: - 1. Detect if query reveals lack of documented knowledge - 2. If answer provides important information, suggest saving - 3. Auto-categorize based on query topic - 4. 
Extract importance from query frequency + Detects if the Q&A represents important knowledge that should be saved, + such as: + - Frequently asked questions + - Important architectural information + - Non-obvious solutions or workarounds Args: project_id: Project identifier query: User query answer: LLM answer - source_nodes: Retrieved source nodes + source_nodes: Retrieved source nodes (optional) Returns: - Dict with memory suggestion (not auto-saved) - - Current Status: PLACEHOLDER + Dict with memory suggestion (not auto-saved, requires user confirmation) """ - logger.warning("suggest_memory_from_query called but not yet implemented") - - return { - "success": True, - "should_save": False, - "suggested_memory": None, - "implementation_status": "placeholder - planned for v0.7" - } + try: + logger.info(f"Analyzing query for memory suggestion: {query[:100]}") + + # Create analysis prompt + prompt = f"""Analyze this Q&A from a code knowledge base query. + +Query: {query} + +Answer: {answer} + +Determine if this Q&A represents important project knowledge worth saving as a memory. + +Consider: +1. Is this a frequently asked or important question? +2. Does it reveal non-obvious information? +3. Is it about architecture, decisions, or important conventions? +4. Would this be valuable for future sessions? 
+ +If YES, extract a memory with: +- type: decision, preference, experience, convention, plan, or note +- title: Brief summary of the knowledge +- content: The important information from the answer +- reason: Why this is important +- tags: Relevant keywords +- importance: 0.0-1.0 (routine info = 0.3, important = 0.7+) +- should_save: true + +If NO (routine question or trivial info), respond with: +{{"should_save": false, "reason": "explanation"}} + +Respond with a single JSON object.""" + + llm = Settings.llm + if not llm: + raise ValueError("LLM not initialized") + + response = await llm.acomplete(prompt) + result = self._parse_llm_json_response(str(response).strip()) + + if isinstance(result, list) and len(result) > 0: + result = result[0] + elif not isinstance(result, dict): + result = {"should_save": False, "reason": "Could not parse LLM response"} + + should_save = result.get("should_save", False) + + if should_save: + suggested_memory = { + "type": result.get("type", "note"), + "title": result.get("title", query[:self.MAX_TITLE_LENGTH]), + "content": result.get("content", answer[:self.MAX_CONTENT_LENGTH]), + "reason": result.get("reason", "Important Q&A from knowledge query"), + "tags": result.get("tags", ["query-based"]), + "importance": result.get("importance", 0.5) + } + + logger.info(f"Suggested memory: {suggested_memory['title']}") + + return { + "success": True, + "should_save": True, + "suggested_memory": suggested_memory, + "query": query, + "answer_excerpt": answer[:self.MAX_STRING_EXCERPT_LENGTH] + } + else: + return { + "success": True, + "should_save": False, + "reason": result.get("reason", "Not significant enough to save"), + "query": query + } + + except Exception as e: + logger.error(f"Failed to suggest memory from query: {e}") + return { + "success": False, + "error": str(e), + "should_save": False + } async def batch_extract_from_repository( self, project_id: str, - repo_path: str + repo_path: str, + max_commits: int = 50, + file_patterns: 
Optional[List[str]] = None ) -> Dict[str, Any]: """ Batch extract memories from entire repository. - Future Implementation Plan: - 1. Scan git history for important commits + Process: + 1. Scan recent git history for important commits 2. Analyze README, CHANGELOG, docs - 3. Mine code comments - 4. Extract from configuration files - 5. Generate project summary memory + 3. Mine code comments from source files + 4. Generate project summary memory Args: project_id: Project identifier repo_path: Path to git repository + max_commits: Maximum number of recent commits to analyze (default 50) + file_patterns: List of file patterns to scan for comments (e.g., ["*.py", "*.js"]) Returns: Dict with batch extraction results - - Current Status: PLACEHOLDER """ - logger.warning("batch_extract_from_repository called but not yet implemented") - - return { - "success": True, - "total_extracted": 0, - "by_type": {}, - "implementation_status": "placeholder - planned for v0.7" - } - - -# ============================================================================ -# Helper Functions for Future Implementation -# ============================================================================ - -def _classify_memory_type(text: str) -> str: - """ - Classify text into memory type using heuristics. - - Future: Use LLM for better classification. 
- """ - text_lower = text.lower() - - # Simple keyword-based classification - if any(word in text_lower for word in ["decide", "chose", "selected", "architecture"]): - return "decision" - elif any(word in text_lower for word in ["prefer", "style", "convention", "always"]): - return "preference" - elif any(word in text_lower for word in ["fix", "bug", "issue", "problem", "solution"]): - return "experience" - elif any(word in text_lower for word in ["must", "should", "rule", "convention"]): - return "convention" - elif any(word in text_lower for word in ["todo", "plan", "future", "upcoming"]): - return "plan" - else: - return "note" - - -def _extract_importance(text: str, context: Dict[str, Any]) -> float: - """ - Estimate importance score from text and context. - - Future: Use LLM to assess importance. - """ - # Simple heuristic: longer text = more important - # Future: use emphasis markers, repetition, etc. - base_score = min(len(text) / 500, 0.5) # Cap at 0.5 - - # Boost if contains certain keywords - importance_keywords = ["critical", "important", "breaking", "major"] - if any(word in text.lower() for word in importance_keywords): - base_score += 0.3 - - return min(base_score, 1.0) + try: + logger.info(f"Starting batch extraction from repository: {repo_path}") + + repo_path_obj = Path(repo_path) + if not repo_path_obj.exists(): + raise ValueError(f"Repository path not found: {repo_path}") + + extracted_memories = [] + by_source = { + "git_commits": 0, + "code_comments": 0, + "documentation": 0 + } + + # 1. 
Extract from recent git commits + logger.info(f"Analyzing last {max_commits} git commits...") + commits = self._get_recent_commits(repo_path, max_commits) + + for commit in commits[:self.MAX_COMMITS_TO_PROCESS]: # Focus on most recent commits for efficiency + try: + result = await self.extract_from_git_commit( + project_id=project_id, + commit_sha=commit["sha"], + commit_message=commit["message"], + changed_files=commit["files"], + auto_save=True # Auto-save significant commits + ) + if result.get("success"): + count = result.get("auto_saved_count", 0) + by_source["git_commits"] += count + extracted_memories.extend(result.get("extracted_memories", [])) + except Exception as e: + logger.warning(f"Failed to extract from commit {commit['sha'][:8]}: {e}") + + # 2. Extract from code comments + if file_patterns is None: + file_patterns = ["*.py", "*.js", "*.ts", "*.java", "*.go", "*.rs"] + + logger.info(f"Scanning code comments in {file_patterns}...") + source_files = [] + for pattern in file_patterns: + source_files.extend(repo_path_obj.rglob(pattern)) + + # Sample files to avoid overload + sampled_files = list(source_files)[:self.MAX_FILES_TO_SAMPLE] + + for file_path in sampled_files: + try: + result = await self.extract_from_code_comments( + project_id=project_id, + file_path=str(file_path) + ) + if result.get("success"): + count = result.get("total_extracted", 0) + by_source["code_comments"] += count + extracted_memories.extend(result.get("extracted_memories", [])) + except Exception as e: + logger.warning(f"Failed to extract from {file_path.name}: {e}") + + # 3. 
Analyze documentation files + logger.info("Analyzing documentation files...") + doc_files = ["README.md", "CHANGELOG.md", "CONTRIBUTING.md", "CLAUDE.md"] + + for doc_name in doc_files: + doc_path = repo_path_obj / doc_name + if doc_path.exists(): + try: + content = doc_path.read_text(encoding="utf-8") + # Extract key information from docs + doc_memory = self._extract_from_documentation(content, doc_name) + if doc_memory: + result = await memory_store.add_memory( + project_id=project_id, + **doc_memory, + metadata={"source": "documentation", "file": doc_name} + ) + if result.get("success"): + by_source["documentation"] += 1 + extracted_memories.append(doc_memory) + except Exception as e: + logger.warning(f"Failed to extract from {doc_name}: {e}") + + total_extracted = sum(by_source.values()) + + logger.success(f"Batch extraction complete: {total_extracted} memories extracted") + + return { + "success": True, + "total_extracted": total_extracted, + "by_source": by_source, + "extracted_memories": extracted_memories, + "repository": repo_path + } + + except Exception as e: + logger.error(f"Failed batch extraction: {e}") + return { + "success": False, + "error": str(e), + "total_extracted": 0 + } + + + # ======================================================================== + # Helper Methods + # ======================================================================== + + def _format_conversation(self, conversation: List[Dict[str, str]]) -> str: + """Format conversation for LLM analysis""" + formatted = [] + for msg in conversation: + role = msg.get("role", "unknown") + content = msg.get("content", "") + formatted.append(f"{role.upper()}: {content}\n") + return "\n".join(formatted) + + def _parse_llm_json_response(self, response_text: str) -> List[Dict[str, Any]]: + """Parse JSON from LLM response, handling markdown code blocks""" + import json + + # Remove markdown code blocks if present + if "```json" in response_text: + match = re.search(r"```json\s*(.*?)\s*```", 
response_text, re.DOTALL) + if match: + response_text = match.group(1) + elif "```" in response_text: + match = re.search(r"```\s*(.*?)\s*```", response_text, re.DOTALL) + if match: + response_text = match.group(1) + + # Try to parse JSON + try: + result = json.loads(response_text) + # Ensure it's a list + if isinstance(result, dict): + return [result] + return result if isinstance(result, list) else [] + except json.JSONDecodeError as e: + logger.warning(f"Failed to parse JSON from LLM: {e}") + logger.debug(f"Response text: {response_text[:self.MAX_STRING_EXCERPT_LENGTH]}") + return [] + + def _classify_commit_type(self, commit_message: str) -> str: + """Classify commit type from conventional commit message""" + msg_lower = commit_message.lower() + first_line = commit_message.split('\n')[0].lower() + + # Conventional commits + if first_line.startswith("feat"): + return "feat" + elif first_line.startswith("fix"): + return "fix" + elif first_line.startswith("refactor"): + return "refactor" + elif first_line.startswith("docs"): + return "docs" + elif first_line.startswith("test"): + return "test" + elif first_line.startswith("chore"): + return "chore" + elif "breaking" in msg_lower or "breaking change" in msg_lower: + return "breaking" + else: + return "other" + + def _extract_comments_from_file(self, file_path: str) -> List[Dict[str, Any]]: + """Extract comments from Python source file using AST""" + comments = [] + file_path_obj = Path(file_path) + + if not file_path_obj.exists(): + return comments + + try: + content = file_path_obj.read_text(encoding="utf-8") + + # For Python files, extract comments + if file_path_obj.suffix == ".py": + for line_num, line in enumerate(content.split('\n'), 1): + line_stripped = line.strip() + if line_stripped.startswith("#"): + comments.append({ + "text": line_stripped[1:].strip(), + "line": line_num + }) + else: + # For other files, simple pattern matching + for line_num, line in enumerate(content.split('\n'), 1): + line_stripped 
= line.strip() + if "//" in line_stripped: + comment_text = line_stripped.split("//", 1)[1].strip() + comments.append({"text": comment_text, "line": line_num}) + + except Exception as e: + logger.warning(f"Failed to extract comments from {file_path}: {e}") + + return comments + + def _classify_comment(self, text: str, file_path: str, line_num: int) -> Optional[Dict[str, Any]]: + """Classify comment and extract memory data if it has special markers""" + text_upper = text.upper() + + # Check for special markers + if text_upper.startswith("TODO:") or "TODO:" in text_upper: + return { + "type": "plan", + "title": text.replace("TODO:", "").strip()[:100], + "content": text, + "importance": 0.4, + "tags": ["todo"], + "line": line_num + } + elif text_upper.startswith("FIXME:") or text_upper.startswith("BUG:"): + return { + "type": "experience", + "title": text.replace("FIXME:", "").replace("BUG:", "").strip()[:100], + "content": text, + "importance": 0.6, + "tags": ["bug", "fixme"], + "line": line_num + } + elif text_upper.startswith("NOTE:") or text_upper.startswith("IMPORTANT:"): + return { + "type": "convention", + "title": text.replace("NOTE:", "").replace("IMPORTANT:", "").strip()[:100], + "content": text, + "importance": 0.5, + "tags": ["note"], + "line": line_num + } + elif text_upper.startswith("DECISION:"): + return { + "type": "decision", + "title": text.replace("DECISION:", "").strip()[:100], + "content": text, + "importance": 0.7, + "tags": ["decision"], + "line": line_num + } + + return None + + def _combine_related_comments(self, comments: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Combine related comments to avoid duplication""" + # Simple grouping by type + grouped = {} + for comment in comments: + mem_type = comment["type"] + if mem_type not in grouped: + grouped[mem_type] = [] + grouped[mem_type].append(comment) + + # Take top items per type by importance + combined = [] + for mem_type, items in grouped.items(): + sorted_items = sorted(items, 
key=lambda x: x.get("importance", 0), reverse=True)
+            combined.extend(sorted_items[:self.MAX_ITEMS_PER_TYPE])
+
+        return combined
+
+    def _get_recent_commits(self, repo_path: str, max_commits: int) -> List[Dict[str, Any]]:
+        """Get recent commits from git repository"""
+        commits = []
+        try:
+            # Get commit log
+            result = subprocess.run(
+                ["git", "log", f"-{max_commits}", "--pretty=format:%H|%s|%b"],
+                cwd=repo_path,
+                capture_output=True,
+                text=True,
+                check=True
+            )
+
+            for line in result.stdout.split('\n'):
+                if not line.strip():
+                    continue
+
+                parts = line.split('|', 2)
+                if len(parts) < 2:
+                    continue
+
+                sha = parts[0]
+                subject = parts[1]
+                body = parts[2] if len(parts) > 2 else ""
+
+                # Get changed files for this commit
+                files_result = subprocess.run(
+                    ["git", "diff-tree", "--no-commit-id", "--name-only", "-r", sha],
+                    cwd=repo_path,
+                    capture_output=True,
+                    text=True,
+                    check=True
+                )
+                changed_files = [f.strip() for f in files_result.stdout.split('\n') if f.strip()]
+
+                commits.append({
+                    "sha": sha,
+                    "message": f"{subject}\n{body}".strip(),
+                    "files": changed_files
+                })
+
+        except subprocess.CalledProcessError as e:
+            logger.warning(f"Failed to get git commits: {e}")
+        except FileNotFoundError:
+            logger.warning("Git not found in PATH")
+
+        return commits
+
+    def _extract_from_documentation(self, content: str, filename: str) -> Optional[Dict[str, Any]]:
+        """Extract key information from documentation files"""
+        # For README files, extract project overview
+        if "README" in filename.upper():
+            # Extract first few paragraphs as project overview
+            lines = content.split('\n')
+            description = []
+            for line in lines[1:self.MAX_README_LINES + 1]:  # Skip first line (usually title)
+                if line.strip() and not line.startswith('#'):
+                    description.append(line.strip())
+                    if len(description) >= 5:
+                        break
+
+            if description:
+                return {
+                    "memory_type": "note",
+                    "title": f"Project Overview from {filename}",
+                    "content": " ".join(description)[:self.MAX_CONTENT_LENGTH],
"reason": "Core project information from README", + "tags": ["documentation", "overview"], + "importance": 0.6 + } + + # For CHANGELOG, extract recent important changes + elif "CHANGELOG" in filename.upper(): + return { + "memory_type": "note", + "title": "Project Changelog Summary", + "content": content[:self.MAX_CONTENT_LENGTH], + "reason": "Track project evolution and breaking changes", + "tags": ["documentation", "changelog"], + "importance": 0.5 + } + + return None # ============================================================================ @@ -275,23 +887,58 @@ async def auto_save_query_as_memory( """ Hook for knowledge service to auto-save important Q&A as memories. - Future: Call this from query_knowledge endpoint when query is valuable. + Can be called from query_knowledge endpoint to automatically save valuable Q&A. Args: project_id: Project identifier query: User query answer: LLM answer - threshold: Confidence threshold for auto-saving + threshold: Confidence threshold for auto-saving (default 0.8) Returns: memory_id if saved, None otherwise """ - logger.debug(f"auto_save_query_as_memory called (placeholder)") - - # Placeholder: would analyze query/answer and auto-save if important - # For now, just return None (no auto-save) - - return None + try: + # Use memory extractor to analyze the query + result = await memory_extractor.suggest_memory_from_query( + project_id=project_id, + query=query, + answer=answer + ) + + if not result.get("success"): + return None + + should_save = result.get("should_save", False) + suggested_memory = result.get("suggested_memory") + + if should_save and suggested_memory: + # Get importance from suggestion + importance = suggested_memory.get("importance", 0.5) + + # Only auto-save if importance meets threshold + if importance >= threshold: + save_result = await memory_store.add_memory( + project_id=project_id, + memory_type=suggested_memory["type"], + title=suggested_memory["title"], + content=suggested_memory["content"], 
+ reason=suggested_memory.get("reason"), + tags=suggested_memory.get("tags", []), + importance=importance, + metadata={"source": "auto_query", "query": query[:self.MAX_STRING_EXCERPT_LENGTH]} + ) + + if save_result.get("success"): + memory_id = save_result.get("memory_id") + logger.info(f"Auto-saved query as memory: {memory_id}") + return memory_id + + return None + + except Exception as e: + logger.error(f"Failed to auto-save query as memory: {e}") + return None # Global instance diff --git a/tests/test_mcp_integration.py b/tests/test_mcp_integration.py index f47b241..4297ad4 100644 --- a/tests/test_mcp_integration.py +++ b/tests/test_mcp_integration.py @@ -24,10 +24,10 @@ class TestToolDefinitions: """Test suite for tool definitions""" def test_get_tool_definitions_count(self): - """Test that all 25 tools are defined""" + """Test that all 30 tools are defined""" tools = get_tool_definitions() - assert len(tools) == 25, "Should have exactly 25 tools" + assert len(tools) == 30, "Should have exactly 30 tools" def test_get_tool_definitions_knowledge_tools(self): """Test knowledge base tool definitions""" @@ -57,7 +57,7 @@ def test_get_tool_definitions_memory_tools(self): tools = get_tool_definitions() tool_names = [t.name for t in tools] - # Memory tools + # Manual memory tools assert "add_memory" in tool_names assert "search_memories" in tool_names assert "get_memory" in tool_names @@ -65,6 +65,13 @@ def test_get_tool_definitions_memory_tools(self): assert "delete_memory" in tool_names assert "supersede_memory" in tool_names assert "get_project_summary" in tool_names + + # Automatic extraction tools (v0.7) + assert "extract_from_conversation" in tool_names + assert "extract_from_git_commit" in tool_names + assert "extract_from_code_comments" in tool_names + assert "suggest_memory_from_query" in tool_names + assert "batch_extract_from_repository" in tool_names def test_get_tool_definitions_task_tools(self): """Test task management tool definitions"""