From b8d65e3f295d642446bcea1a5ade890a46c09bed Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 5 Nov 2025 18:59:07 +0000
Subject: [PATCH 1/3] Initial plan

From 20b53fd8d386054f9daa6cc1792d2d42206a2dfb Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 5 Nov 2025 19:10:28 +0000
Subject: [PATCH 2/3] Fix code review issues: extract magic numbers to
 constants, fix variable reassignment, update test count

Co-authored-by: royisme <350731+royisme@users.noreply.github.com>
---
 services/memory_extractor.py  | 53 ++++++++++++++++++++++-------------
 tests/test_mcp_integration.py | 13 +++++++--
 2 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/services/memory_extractor.py b/services/memory_extractor.py
index c17cff1..cceb4ad 100644
--- a/services/memory_extractor.py
+++ b/services/memory_extractor.py
@@ -34,6 +34,15 @@ class MemoryExtractor:
     - Auto-suggest memories from knowledge queries
     """
 
+    # Processing limits
+    MAX_COMMITS_TO_PROCESS = 20  # Maximum commits to analyze in batch processing
+    MAX_FILES_TO_SAMPLE = 30  # Maximum files to scan for comments
+    MAX_ITEMS_PER_TYPE = 3  # Top items per memory type to include
+    MAX_README_LINES = 20  # Maximum README lines to process for overview
+    MAX_STRING_EXCERPT_LENGTH = 200  # Maximum length for string excerpts in responses
+    MAX_CONTENT_LENGTH = 500  # Maximum length for content fields
+    MAX_TITLE_LENGTH = 100  # Maximum length for title fields
+
     def __init__(self):
         self.extraction_enabled = True
         self.confidence_threshold = 0.7  # Threshold for auto-saving
@@ -349,13 +358,19 @@ async def extract_from_code_comments(
         # Save extracted memories
         saved_memories = []
         for mem_data in extracted:
+            # Add file extension as tag if file has an extension
+            file_tags = mem_data.get("tags", ["code-comment"])
+            file_suffix = Path(file_path).suffix
+            if file_suffix:
+                file_tags = file_tags + [file_suffix[1:]]
+
             result = await memory_store.add_memory(
                 project_id=project_id,
                 memory_type=mem_data["type"],
                 title=mem_data["title"],
                 content=mem_data["content"],
                 reason=mem_data.get("reason"),
-                tags=mem_data.get("tags", ["code-comment"]) + [Path(file_path).suffix[1:]],
+                tags=file_tags,
                 importance=mem_data.get("importance", 0.4),
                 related_refs=[f"ref://file/{file_path}#{mem_data.get('line', 0)}"],
                 metadata={
@@ -457,8 +472,8 @@ async def suggest_memory_from_query(
         if should_save:
             suggested_memory = {
                 "type": result.get("type", "note"),
-                "title": result.get("title", query[:100]),
-                "content": result.get("content", answer[:500]),
+                "title": result.get("title", query[:self.MAX_TITLE_LENGTH]),
+                "content": result.get("content", answer[:self.MAX_CONTENT_LENGTH]),
                 "reason": result.get("reason", "Important Q&A from knowledge query"),
                 "tags": result.get("tags", ["query-based"]),
                 "importance": result.get("importance", 0.5)
@@ -471,7 +486,7 @@ async def suggest_memory_from_query(
                 "should_save": True,
                 "suggested_memory": suggested_memory,
                 "query": query,
-                "answer_excerpt": answer[:200]
+                "answer_excerpt": answer[:self.MAX_STRING_EXCERPT_LENGTH]
             }
         else:
             return {
@@ -532,7 +547,7 @@ async def batch_extract_from_repository(
             logger.info(f"Analyzing last {max_commits} git commits...")
             commits = self._get_recent_commits(repo_path, max_commits)
 
-            for commit in commits[:20]:  # Focus on most recent 20 for efficiency
+            for commit in commits[:self.MAX_COMMITS_TO_PROCESS]:  # Focus on most recent commits for efficiency
                try:
                     result = await self.extract_from_git_commit(
                         project_id=project_id,
@@ -558,7 +573,7 @@ async def batch_extract_from_repository(
                     source_files.extend(repo_path_obj.rglob(pattern))
 
                 # Sample files to avoid overload
-                sampled_files = list(source_files)[:30]
+                sampled_files = list(source_files)[:self.MAX_FILES_TO_SAMPLE]
 
                 for file_path in sampled_files:
                     try:
@@ -636,13 +651,13 @@ def _parse_llm_json_response(self, response_text: str) -> List[Dict[str, Any]]:
 
         # Remove markdown code blocks if present
         if "```json" in response_text:
-            response_text = re.search(r"```json\s*(.*?)\s*```", response_text, re.DOTALL)
-            if response_text:
-                response_text = response_text.group(1)
+            match = re.search(r"```json\s*(.*?)\s*```", response_text, re.DOTALL)
+            if match:
+                response_text = match.group(1)
         elif "```" in response_text:
-            response_text = re.search(r"```\s*(.*?)\s*```", response_text, re.DOTALL)
-            if response_text:
-                response_text = response_text.group(1)
+            match = re.search(r"```\s*(.*?)\s*```", response_text, re.DOTALL)
+            if match:
+                response_text = match.group(1)
 
         # Try to parse JSON
         try:
@@ -653,7 +668,7 @@ def _parse_llm_json_response(self, response_text: str) -> List[Dict[str, Any]]:
             return result if isinstance(result, list) else []
         except json.JSONDecodeError as e:
             logger.warning(f"Failed to parse JSON from LLM: {e}")
-            logger.debug(f"Response text: {response_text[:200]}")
+            logger.debug(f"Response text: {response_text[:self.MAX_STRING_EXCERPT_LENGTH]}")
             return []
 
     def _classify_commit_type(self, commit_message: str) -> str:
@@ -766,11 +781,11 @@ def _combine_related_comments(self, comments: List[Dict[str, Any]]) -> List[Dict
                 grouped[mem_type] = []
             grouped[mem_type].append(comment)
 
-        # Take top 3 per type by importance
+        # Take top items per type by importance
         combined = []
         for mem_type, items in grouped.items():
             sorted_items = sorted(items, key=lambda x: x.get("importance", 0), reverse=True)
-            combined.extend(sorted_items[:3])
+            combined.extend(sorted_items[:self.MAX_ITEMS_PER_TYPE])
 
         return combined
 
@@ -829,7 +844,7 @@ def _extract_from_documentation(self, content: str, filename: str) -> Optional[D
             # Extract first few paragraphs as project overview
             lines = content.split('\n')
             description = []
-            for line in lines[1:20]:  # Skip first line (usually title)
+            for line in lines[1:self.MAX_README_LINES]:  # Skip first line (usually title)
                 if line.strip() and not line.startswith('#'):
                     description.append(line.strip())
                     if len(description) >= 5:
@@ -839,7 +854,7 @@ def _extract_from_documentation(self, content: str, filename: str) -> Optional[D
             return {
                 "memory_type": "note",
                 "title": f"Project Overview from {filename}",
-                "content": " ".join(description)[:500],
+                "content": " ".join(description)[:self.MAX_CONTENT_LENGTH],
                 "reason": "Core project information from README",
                 "tags": ["documentation", "overview"],
                 "importance": 0.6
@@ -850,7 +865,7 @@ def _extract_from_documentation(self, content: str, filename: str) -> Optional[D
             return {
                 "memory_type": "note",
                 "title": "Project Changelog Summary",
-                "content": content[:500],
+                "content": content[:self.MAX_CONTENT_LENGTH],
                 "reason": "Track project evolution and breaking changes",
                 "tags": ["documentation", "changelog"],
                 "importance": 0.5
@@ -911,7 +926,7 @@ async def auto_save_query_as_memory(
                 reason=suggested_memory.get("reason"),
                 tags=suggested_memory.get("tags", []),
                 importance=importance,
-                metadata={"source": "auto_query", "query": query[:200]}
+                metadata={"source": "auto_query", "query": query[:self.MAX_STRING_EXCERPT_LENGTH]}
             )
 
             if save_result.get("success"):
diff --git a/tests/test_mcp_integration.py b/tests/test_mcp_integration.py
index f47b241..4297ad4 100644
--- a/tests/test_mcp_integration.py
+++ b/tests/test_mcp_integration.py
@@ -24,10 +24,10 @@ class TestToolDefinitions:
     """Test suite for tool definitions"""
 
     def test_get_tool_definitions_count(self):
-        """Test that all 25 tools are defined"""
+        """Test that all 30 tools are defined"""
         tools = get_tool_definitions()
 
-        assert len(tools) == 25, "Should have exactly 25 tools"
+        assert len(tools) == 30, "Should have exactly 30 tools"
 
     def test_get_tool_definitions_knowledge_tools(self):
         """Test knowledge base tool definitions"""
@@ -57,7 +57,7 @@ def test_get_tool_definitions_memory_tools(self):
         tools = get_tool_definitions()
         tool_names = [t.name for t in tools]
 
-        # Memory tools
+        # Manual memory tools
         assert "add_memory" in tool_names
         assert "search_memories" in tool_names
         assert "get_memory" in tool_names
@@ -65,6 +65,13 @@ def test_get_tool_definitions_memory_tools(self):
         assert "delete_memory" in tool_names
         assert "supersede_memory" in tool_names
         assert "get_project_summary" in tool_names
+
+        # Automatic extraction tools (v0.7)
+        assert "extract_from_conversation" in tool_names
+        assert "extract_from_git_commit" in tool_names
+        assert "extract_from_code_comments" in tool_names
+        assert "suggest_memory_from_query" in tool_names
+        assert "batch_extract_from_repository" in tool_names
 
     def test_get_tool_definitions_task_tools(self):
         """Test task management tool definitions"""

From 306e9b571fdf3012402240ae5740bd2515a28487 Mon Sep 17 00:00:00 2001
From: Roy Zhu
Date: Thu, 6 Nov 2025 03:15:27 +0800
Subject: [PATCH 3/3] Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 services/memory_extractor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/memory_extractor.py b/services/memory_extractor.py
index cceb4ad..1423268 100644
--- a/services/memory_extractor.py
+++ b/services/memory_extractor.py
@@ -844,7 +844,7 @@ def _extract_from_documentation(self, content: str, filename: str) -> Optional[D
             # Extract first few paragraphs as project overview
             lines = content.split('\n')
             description = []
-            for line in lines[1:self.MAX_README_LINES]:  # Skip first line (usually title)
+            for line in lines[1:self.MAX_README_LINES + 1]:  # Skip first line (usually title)
                 if line.strip() and not line.startswith('#'):
                     description.append(line.strip())
                     if len(description) >= 5:
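
A note on the _parse_llm_json_response hunk in PATCH 2/3, for reviewers tracing the fix: re.search returns a re.Match on success and None on failure, so the old code's reassignment left response_text holding a non-string whenever the fence was malformed, and str-only operations downstream (json.loads, the debug slice) would then fail. A minimal standalone sketch of the corrected pattern; extract_json_block here is a hypothetical helper, not the actual method:

import re

def extract_json_block(response_text: str) -> str:
    # Old bug: "response_text = re.search(...)" rebound the string variable to
    # a re.Match object (or None when the fence never closes), breaking later
    # str-only code such as json.loads or slicing for the debug log.
    match = re.search(r"```json\s*(.*?)\s*```", response_text, re.DOTALL)
    if match:
        response_text = match.group(1)  # rebind only after a confirmed match
    return response_text

print(extract_json_block('```json\n[{"type": "note"}]\n```'))  # -> [{"type": "note"}]
print(extract_json_block('no fences here'))                    # -> input returned unchanged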
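
On PATCH 3/3: Python slices are half-open, so lines[1:self.MAX_README_LINES] inspects only MAX_README_LINES - 1 lines after skipping the title at index 0, while the corrected lines[1:self.MAX_README_LINES + 1] inspects the intended count. A quick standalone illustration; readme_lines is made-up stand-in data:

MAX_README_LINES = 20  # mirrors MemoryExtractor.MAX_README_LINES

readme_lines = [f"line {i}" for i in range(40)]  # index 0 plays the title line

# Half-open slice: the stop index is excluded, so only 19 lines are covered.
assert len(readme_lines[1:MAX_README_LINES]) == MAX_README_LINES - 1

# The fix bumps the stop index, covering the intended 20 lines after the title.
assert len(readme_lines[1:MAX_README_LINES + 1]) == MAX_README_LINES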