diff --git a/.experiments/code_review/main.py b/.experiments/code_review/main.py index ca0daa03..766a093e 100644 --- a/.experiments/code_review/main.py +++ b/.experiments/code_review/main.py @@ -87,8 +87,8 @@ def save_review(pr_number, review_desc, comments, issues, folder, combined_diff_ with open(issues_file, "w") as f: json.dump(issues, f, indent=2) - - with open(combined_diff, 'w') as f: + + with open(combined_diff, "w") as f: f.write(combined_diff_data) logger.info(f"Saved review files for PR {pr_number}") @@ -111,8 +111,12 @@ def main(pr_urls): logger.info(f"Starting to process PR {pr_number}") # Without re-evaluation - review_desc, comments, issues, combined_diff_data = process_pr(pr_url, reeval_response=False) - save_review(pr_number, review_desc, comments, issues, no_eval_folder, combined_diff_data) + review_desc, comments, issues, combined_diff_data = process_pr( + pr_url, reeval_response=False + ) + save_review( + pr_number, review_desc, comments, issues, no_eval_folder, combined_diff_data + ) # # With re-evaluation # review_desc, comments, topics = process_pr(pr_url, reeval_response=True) diff --git a/.kaizen/unit_test/kaizen/llms/test_llmprovider.py b/.kaizen/unit_test/kaizen/llms/test_llmprovider.py index 0a145d82..abbc0053 100644 --- a/.kaizen/unit_test/kaizen/llms/test_llmprovider.py +++ b/.kaizen/unit_test/kaizen/llms/test_llmprovider.py @@ -7,6 +7,7 @@ from litellm import Router import os + @pytest.fixture def mock_config_data(): return { @@ -19,49 +20,71 @@ def mock_config_data(): } } + @pytest.fixture def mock_litellm(): - with patch('kaizen.llms.provider.litellm') as mock: + with patch("kaizen.llms.provider.litellm") as mock: mock.token_counter.return_value = 100 mock.get_max_tokens.return_value = 4000 mock.cost_per_token.return_value = (0.01, 0.02) yield mock + @pytest.fixture def llm_provider(mock_config_data, mock_litellm): - with patch.object(ConfigData, 'get_config_data', return_value=mock_config_data): + with patch.object(ConfigData, 
"get_config_data", return_value=mock_config_data): return LLMProvider() + def test_initialization(llm_provider): assert llm_provider.system_prompt is not None assert llm_provider.model_config == {"model": "gpt-4o-mini"} assert llm_provider.default_temperature == 0.3 + def test_validate_config_correct_setup(llm_provider): assert llm_provider.models[0]["model_name"] == "default" + def test_validate_config_missing_language_model(): - with patch.object(ConfigData, 'get_config_data', return_value={}): - with pytest.raises(ValueError, match="Missing 'language_model' in configuration"): + with patch.object(ConfigData, "get_config_data", return_value={}): + with pytest.raises( + ValueError, match="Missing 'language_model' in configuration" + ): LLMProvider() + def test_token_limit_check_with_valid_prompt(llm_provider, mock_litellm): assert llm_provider.is_inside_token_limit("Test prompt") is True + def test_available_tokens_calculation(llm_provider, mock_litellm): assert llm_provider.available_tokens("Test message") == 3200 + def test_usage_cost_calculation(llm_provider, mock_litellm): total_usage = {"prompt_tokens": 100, "completion_tokens": 200} cost = llm_provider.get_usage_cost(total_usage) assert cost == (0.01, 0.02) + def test_setup_redis_missing_env_vars(): with patch.dict(os.environ, {}, clear=True): - with patch.object(ConfigData, 'get_config_data', return_value={"language_model": {"redis_enabled": True}}): - with pytest.raises(ValueError, match="Redis is enabled but REDIS_HOST or REDIS_PORT environment variables are missing"): + with patch.object( + ConfigData, + "get_config_data", + return_value={"language_model": {"redis_enabled": True}}, + ): + with pytest.raises( + ValueError, + match="Redis is enabled but REDIS_HOST or REDIS_PORT environment variables are missing", + ): LLMProvider() + def test_token_limit_check_boundary_condition(llm_provider, mock_litellm): mock_litellm.token_counter.return_value = 3200 - assert llm_provider.is_inside_token_limit("Boundary 
test prompt", percentage=0.8) is True \ No newline at end of file + assert ( + llm_provider.is_inside_token_limit("Boundary test prompt", percentage=0.8) + is True + ) diff --git a/.kaizen/unit_test/kaizen/llms/test_set_all_loggers_to_error.py b/.kaizen/unit_test/kaizen/llms/test_set_all_loggers_to_error.py index 073e1e76..649e1882 100644 --- a/.kaizen/unit_test/kaizen/llms/test_set_all_loggers_to_error.py +++ b/.kaizen/unit_test/kaizen/llms/test_set_all_loggers_to_error.py @@ -4,37 +4,40 @@ # Assuming the function is in the module kaizen/llms/provider.py from kaizen.llms.provider import set_all_loggers_to_ERROR + @pytest.fixture def setup_loggers(): # Setup: Create some loggers with different levels loggers = { - 'logger1': logging.getLogger('logger1'), - 'logger2': logging.getLogger('logger2'), - 'logger3': logging.getLogger('logger3') + "logger1": logging.getLogger("logger1"), + "logger2": logging.getLogger("logger2"), + "logger3": logging.getLogger("logger3"), } - loggers['logger1'].setLevel(logging.DEBUG) - loggers['logger2'].setLevel(logging.INFO) - loggers['logger3'].setLevel(logging.WARNING) - + loggers["logger1"].setLevel(logging.DEBUG) + loggers["logger2"].setLevel(logging.INFO) + loggers["logger3"].setLevel(logging.WARNING) + yield loggers - + # Teardown: Reset loggers to default level (WARNING) for logger in loggers.values(): logger.setLevel(logging.WARNING) + def test_set_all_loggers_to_ERROR(setup_loggers): # Test: Verify all existing loggers are set to ERROR level set_all_loggers_to_ERROR() - + for name, logger in setup_loggers.items(): assert logger.level == logging.ERROR, f"Logger {name} not set to ERROR level" + def test_no_loggers_present(monkeypatch): # Edge Case: Handle scenario where no loggers are present # Mock the loggerDict to simulate no loggers - monkeypatch.setattr(logging.Logger.manager, 'loggerDict', {}) - + monkeypatch.setattr(logging.Logger.manager, "loggerDict", {}) + set_all_loggers_to_ERROR() - + # Verify no errors occur and 
loggerDict is still empty - assert logging.Logger.manager.loggerDict == {}, "LoggerDict should be empty" \ No newline at end of file + assert logging.Logger.manager.loggerDict == {}, "LoggerDict should be empty" diff --git a/examples/code_fix/main.py b/examples/code_fix/main.py new file mode 100644 index 00000000..ab4ad327 --- /dev/null +++ b/examples/code_fix/main.py @@ -0,0 +1,292 @@ +from kaizen.generator.code_fixer import CodeFixer +import json + +issues = [ + { + "severity": "critical", + "category": "security", + "description": "Improper handling of HTTPException in the webhook handler.", + "location": {"line_start": 33, "line_end": 33}, + "impact": "Raising HTTPException directly returns a response but does not stop the function execution, leading to potential unintended behavior.", + "suggestion": "Use 'raise' to properly raise the HTTPException.", + "solution": "Replace 'return HTTPException(...)' with 'raise HTTPException(...)'.", + "good_for_first_time": "true", + "issue_title": "Improper Exception Handling", + "file_path": "github_app/main.py", + }, + { + "severity": "medium", + "category": "quality", + "description": "Potentially unused variable 'body' in the webhook handler.", + "location": {"line_start": 26, "line_end": 26}, + "impact": "The variable 'body' is fetched but not used, which can lead to confusion and unnecessary resource usage.", + "suggestion": "Remove the 'body' variable if it is not needed, or utilize it if required for further processing.", + "solution": "Remove line 26 if 'body' is not needed.", + "good_for_first_time": "true", + "issue_title": "Unused Variable", + "file_path": "github_app/main.py", + }, + { + "severity": "medium", + "category": "quality", + "description": "Hard-coded status code in the HTTPException.", + "location": {"line_start": 33, "line_end": 33}, + "impact": "Hard-coding status codes can make it difficult to manage and change them in the future.", + "suggestion": "Define constants for HTTP status codes to 
improve maintainability.", + "solution": "Define a constant like 'HTTP_404_NOT_FOUND = 404' and use it in the exception.", + "good_for_first_time": "false", + "issue_title": "Hard-Coded Status Code", + "file_path": "github_app/main.py", + }, + { + "severity": "medium", + "category": "performance", + "description": "Potential performance issue due to multiple checks on the configuration data.", + "location": {"line_start": 29, "line_end": 29}, + "impact": "Repeated access to configuration data can lead to performance degradation, especially if the configuration is complex.", + "suggestion": "Cache the configuration data in a variable to avoid multiple accesses.", + "solution": "Store CONFIG_DATA in a variable before the checks to improve performance.", + "good_for_first_time": "true", + "issue_title": "Repeated Configuration Access", + "file_path": "github_app/main.py", + }, + { + "severity": "high", + "category": "security", + "description": "Potential exposure of sensitive data in headers.", + "location": {"line_start": 7, "line_end": 8}, + "impact": "If the headers are logged or exposed, sensitive information such as the JWT could be compromised.", + "suggestion": "Ensure that sensitive information in headers is not logged or exposed in error messages.", + "solution": "Consider using a logging library that can mask sensitive information.", + "good_for_first_time": "false", + "issue_title": "Sensitive Data Exposure", + "file_path": "github_app/github_helper/installation.py", + }, + { + "severity": "high", + "category": "bug", + "description": "Missing body in POST request.", + "location": {"line_start": 24, "line_end": 24}, + "impact": "The request to get the installation access token may fail if the body is required but not provided.", + "suggestion": "Include the body in the POST request.", + "solution": "Change line 24 to: response = requests.post(url, headers=headers, json=body)", + "good_for_first_time": "true", + "issue_title": "Missing Request Body", + 
"file_path": "github_app/github_helper/installation.py", + }, + { + "severity": "medium", + "category": "performance", + "description": "Repeated calls to generate JWT for each request.", + "location": {"line_start": 8, "line_end": 8}, + "impact": "Generating a JWT for every request can lead to performance overhead.", + "suggestion": "Consider caching the JWT for a short duration to reduce the number of times it is generated.", + "solution": "Store the JWT in a variable and reuse it until it expires.", + "good_for_first_time": "false", + "issue_title": "Inefficient JWT Generation", + "file_path": "github_app/github_helper/installation.py", + }, + { + "severity": "medium", + "category": "quality", + "description": "Hardcoded endpoint URLs.", + "location": {"line_start": 9, "line_end": 9}, + "impact": "Hardcoding URLs can lead to maintainability issues if the endpoint changes.", + "suggestion": "Consider externalizing the endpoint configuration to a settings file or environment variable.", + "solution": "Use a configuration management approach to manage endpoint URLs.", + "good_for_first_time": "false", + "issue_title": "Hardcoded Endpoint URLs", + "file_path": "github_app/github_helper/installation.py", + }, + { + "severity": "high", + "category": "security", + "description": "Potential exposure of sensitive information in logs.", + "location": {"line_start": 35, "line_end": 39}, + "impact": "Logging sensitive information, such as the contents of the private key, can lead to security breaches if logs are exposed.", + "suggestion": "Avoid logging sensitive information. 
Remove or mask sensitive data in logs.", + "solution": "logger.info('JWT generated successfully.')", + "good_for_first_time": "true", + "issue_title": "Sensitive Information Exposure in Logs", + "file_path": "github_app/github_helper/utils.py", + }, + { + "severity": "medium", + "category": "performance", + "description": "Redundant file read operation in generate_jwt function.", + "location": {"line_start": 36, "line_end": 39}, + "impact": "Reading the file twice is inefficient and can lead to performance degradation, especially if the file is large.", + "suggestion": "Read the file content once and store it in a variable.", + "solution": "key_content = f.read() \nencoded_jwt = jwt.encode(payload, key_content, algorithm='RS256')", + "good_for_first_time": "true", + "issue_title": "Redundant File Read Operation", + "file_path": "github_app/github_helper/utils.py", + }, + { + "severity": "medium", + "category": "quality", + "description": "Hardcoded expiration time in generate_jwt function.", + "location": {"line_start": 30, "line_end": 30}, + "impact": "Hardcoding values makes the code less flexible and harder to maintain. 
It can lead to issues if the expiration time needs to change.", + "suggestion": "Define expiration time as a constant or pass it as a parameter.", + "solution": "EXPIRATION_TIME = 7 * 60 # Define as a constant", + "good_for_first_time": "true", + "issue_title": "Hardcoded Value for JWT Expiration", + "file_path": "github_app/github_helper/utils.py", + }, + { + "severity": "medium", + "category": "quality", + "description": "Lack of error handling for file operations in generate_jwt function.", + "location": {"line_start": 36, "line_end": 36}, + "impact": "If the file cannot be opened (e.g., file not found), it will raise an unhandled exception, causing the application to crash.", + "suggestion": "Implement try-except blocks to handle potential file I/O errors.", + "solution": "try:\n with open(file_path, 'r') as f:\n key_content = f.read()\nexcept IOError as e:\n logger.error(f'File error:{e}')\n return None", + "good_for_first_time": "true", + "issue_title": "Missing Error Handling for File Operations", + "file_path": "github_app/github_helper/utils.py", + }, + { + "severity": "low", + "category": "quality", + "description": "Unused import statements.", + "location": {"line_start": 1, "line_end": 7}, + "impact": "Unused imports can clutter the code and make it harder to read and maintain.", + "suggestion": "Remove any unused import statements.", + "solution": "Remove `import os`, `import jwt`, `import time`, `import requests`, `import logging`, `import hmac`, and `import hashlib` if not used.", + "good_for_first_time": "true", + "issue_title": "Unused Import Statements", + "file_path": "github_app/github_helper/utils.py", + }, + { + "severity": "high", + "category": "quality", + "description": "Duplicate keys in the dictionary.", + "location": {"line_start": 1, "line_end": 8}, + "impact": "Having duplicate keys in a dictionary can lead to unexpected behavior, as only the last occurrence of a key will be retained. 
This can cause confusion and bugs in the code.", + "suggestion": "Remove duplicate keys and ensure each key is unique.", + "solution": 'PULL_REQUEST_PERMISSION ={\n "issues_read": "read",\n "issues_write": "write",\n "checks": "read",\n "contents": "read",\n "pull_requests_read": "read",\n "pull_requests_write": "write",\n}', + "good_for_first_time": "true", + "issue_title": "Duplicate keys in dictionary", + "file_path": "github_app/github_helper/permissions.py", + }, + { + "severity": "high", + "category": "security", + "description": "Potential exposure of sensitive information through environment variables.", + "location": {"line_start": 14, "line_end": 14}, + "impact": "If the environment variable GITHUB_API_BASE_URL is not properly secured, it could expose sensitive information to unauthorized users.", + "suggestion": "Ensure that environment variables are secured and not logged. Consider using a secrets management tool.", + "solution": "Remove any logging statements that might inadvertently log sensitive information.", + "good_for_first_time": "false", + "issue_title": "Insecure handling of environment variables", + "file_path": "github_app/github_helper/pull_requests.py", + }, + { + "severity": "high", + "category": "bug", + "description": "Potential null pointer dereference when accessing payload fields.", + "location": {"line_start": 30, "line_end": 38}, + "impact": "If the payload does not contain the expected structure, it could lead to runtime exceptions.", + "suggestion": "Add checks to ensure that the required fields exist in the payload before accessing them.", + "solution": "Use a validation library or implement manual checks to ensure the presence of required keys.", + "good_for_first_time": "true", + "issue_title": "Null pointer dereference risk", + "file_path": "github_app/github_helper/pull_requests.py", + }, + { + "severity": "medium", + "category": "performance", + "description": "Repeated calls to get_installation_access_token can lead to 
performance issues.", + "location": {"line_start": 40, "line_end": 110}, + "impact": "Multiple calls to get_installation_access_token may lead to unnecessary overhead and potential rate limiting.", + "suggestion": "Store the access token in a variable after the first call and reuse it.", + "solution": "access_token = get_installation_access_token(installation_id, PULL_REQUEST_PERMISSION) # Call once and reuse", + "good_for_first_time": "true", + "issue_title": "Inefficient access token retrieval", + "file_path": "github_app/github_helper/pull_requests.py", + }, + { + "severity": "medium", + "category": "quality", + "description": "Code duplication in process_pull_request and process_pr_desc functions.", + "location": {"line_start": 29, "line_end": 104}, + "impact": "Duplication makes the code harder to maintain and increases the risk of bugs when changes are made.", + "suggestion": "Refactor the common logic into a separate function to reduce duplication.", + "solution": "Create a helper function that handles the common logic between process_pull_request and process_pr_desc.", + "good_for_first_time": "false", + "issue_title": "Code duplication", + "file_path": "github_app/github_helper/pull_requests.py", + }, + { + "severity": "low", + "category": "quality", + "description": "Lack of error handling for HTTP requests.", + "location": {"line_start": 118, "line_end": 132}, + "impact": "If the HTTP requests fail, there is no mechanism to handle the failure, which could lead to silent errors.", + "suggestion": "Implement error handling for HTTP requests to manage failures gracefully.", + "solution": "Check response.status_code and handle errors accordingly.", + "good_for_first_time": "true", + "issue_title": "Missing error handling for HTTP requests", + "file_path": "github_app/github_helper/pull_requests.py", + }, + { + "severity": "high", + "category": "security", + "description": "Environment variable access without validation", + "location": {"line_start": 5, 
"line_end": 5}, + "impact": "Accessing environment variables without validation can lead to application crashes or unexpected behavior if the variable is not set or contains malicious content.", + "suggestion": "Validate the presence of the environment variable before using it.", + "solution": "GITHUB_API_BASE_URL = os.environ.get('GITHUB_API_BASE_URL') or raise ValueError('GITHUB_API_BASE_URL not set')", + "good_for_first_time": "true", + "issue_title": "Unvalidated environment variable access", + "file_path": "github_app/github_helper/endpoints.py", + }, + { + "severity": "medium", + "category": "quality", + "description": "Hardcoded URL structure in endpoint definitions", + "location": {"line_start": 7, "line_end": 9}, + "impact": "Hardcoding parts of the URL can lead to maintenance issues if the API structure changes. It may also lead to inconsistencies if the base URL is modified but not reflected in all endpoints.", + "suggestion": "Use a function to construct URLs to ensure consistency and maintainability.", + "solution": "def create_endpoint(path): return urljoin(GITHUB_API_BASE_URL, path)\nGITHUB_ENDPOINTS ={\n 'get_installations': create_endpoint('installations'),\n 'get_installation_access_token': create_endpoint('/app/installations/{installation_id}/access_tokens'),\n}", + "good_for_first_time": "false", + "issue_title": "Hardcoded URL structure", + "file_path": "github_app/github_helper/endpoints.py", + }, + { + "severity": "medium", + "category": "quality", + "description": "Potential issue with URL joining due to leading slashes", + "location": {"line_start": 8, "line_end": 9}, + "impact": "Using leading slashes in URL paths can lead to incorrect URL formation, especially if the base URL ends with a slash. 
This can result in 404 errors or unintended API calls.", + "suggestion": "Ensure that the paths used in urljoin do not have leading slashes or handle them appropriately.", + "solution": "Remove leading slashes from the endpoint paths.", + "good_for_first_time": "true", + "issue_title": "Leading slashes in URL paths", + "file_path": "github_app/github_helper/endpoints.py", + }, +] + + +def group_by_files(issues): + grouped = {} + for issue in issues: + file = issue["file_path"] + if file not in grouped: + grouped[file] = [] + grouped[file].append(issue) + + return grouped + + +grouped_issues = group_by_files(issues) + +for k, v in grouped_issues.items(): + print(f"File: {k}") + with open(k, "r") as f: + file_content = f.read() + fixer = CodeFixer() + res = fixer.fix_code(original_code=file_content, issues=v) + print(json.dumps(res.fixed_code, indent=2)) + break diff --git a/kaizen/actors/unit_test_runner.py b/kaizen/actors/unit_test_runner.py index b51000f7..518b9d3a 100644 --- a/kaizen/actors/unit_test_runner.py +++ b/kaizen/actors/unit_test_runner.py @@ -29,15 +29,32 @@ def __init__(self, test_directory="./.kaizen/unit_test/"): def find_project_root(self, file_path): current_dir = os.path.dirname(os.path.abspath(file_path)) while current_dir != "/": - if any(os.path.exists(os.path.join(current_dir, marker)) for marker in - ["package.json", "Cargo.toml", "pytest.ini", "setup.py", "setup.cfg", "tox.ini", "pyproject.toml"]): + if any( + os.path.exists(os.path.join(current_dir, marker)) + for marker in [ + "package.json", + "Cargo.toml", + "pytest.ini", + "setup.py", + "setup.cfg", + "tox.ini", + "pyproject.toml", + ] + ): return current_dir current_dir = os.path.dirname(current_dir) return None def run_command(self, command, cwd=None): try: - result = subprocess.run(command, capture_output=True, text=True, cwd=cwd, shell=False, timeout=300) + result = subprocess.run( + command, + capture_output=True, + text=True, + cwd=cwd, + shell=False, + timeout=300, + ) return 
result.returncode, result.stdout, result.stderr except subprocess.TimeoutExpired: return 124, "", f"Command timed out after 300 seconds" @@ -53,35 +70,43 @@ def discover_and_run_tests(self, test_file=None): return self._run_tests_in_parallel(test_files) def _discover_test_files(self, test_file): - pattern = os.path.join(self.test_directory, "**", f"test_*.{'py' if test_file else '*'}") - return [f for f in glob.glob(pattern, recursive=True) if f.split('.')[-1] in self.supported_extensions] + pattern = os.path.join( + self.test_directory, "**", f"test_*.{'py' if test_file else '*'}" + ) + return [ + f + for f in glob.glob(pattern, recursive=True) + if f.split(".")[-1] in self.supported_extensions + ] def _run_tests_in_parallel(self, test_files): results = {} with ThreadPoolExecutor() as executor: - future_to_file = {executor.submit(self._run_test, file): file for file in test_files} + future_to_file = { + executor.submit(self._run_test, file): file for file in test_files + } for future in as_completed(future_to_file): file = future_to_file[future] results[file] = future.result() return results def _run_test(self, file_path): - extension = file_path.split('.')[-1] + extension = file_path.split(".")[-1] return self.supported_extensions[extension](file_path) def run_python_tests(self, file_path): relative_path = os.path.relpath(file_path, self.project_root) - + captured_output = io.StringIO() sys.stdout = captured_output sys.stderr = captured_output - + pytest_args = [relative_path, "-v"] result = pytest.main(pytest_args) - + sys.stdout = sys.__stdout__ sys.stderr = sys.__stderr__ - + output = captured_output.getvalue() return self._parse_pytest_output(output, result) @@ -92,7 +117,9 @@ def _parse_pytest_output(self, output, result): tests_run = len(passed_tests) + len(failed_tests) + len(error_tests) failure_details = {} - for match in re.findall(r"FAILED (.*?) - Failed:(.*?)(?:\n|$)", output, re.MULTILINE): + for match in re.findall( + r"FAILED (.*?) 
- Failed:(.*?)(?:\n|$)", output, re.MULTILINE + ): test_path, reason = match file_name = test_path.split("::")[0] test_name = test_path.split("::")[-1] @@ -112,7 +139,9 @@ def _parse_pytest_output(self, output, result): def run_javascript_tests(self, file_path): relative_path = os.path.relpath(file_path, self.project_root) - code, stdout, stderr = self.run_command(["npx", "jest", relative_path], cwd=self.project_root) + code, stdout, stderr = self.run_command( + ["npx", "jest", relative_path], cwd=self.project_root + ) return self._parse_jest_output(stdout, stderr, code) def _parse_jest_output(self, stdout, stderr, code): @@ -142,7 +171,9 @@ def run_react_ts_tests(self, file_path): def run_rust_tests(self, file_path): # relative_path = os.path.relpath(file_path, self.project_root) - code, stdout, stderr = self.run_command(["cargo", "test", "--", "--nocapture"], cwd=self.project_root) + code, stdout, stderr = self.run_command( + ["cargo", "test", "--", "--nocapture"], cwd=self.project_root + ) return self._parse_rust_output(stdout, stderr, code) def _parse_rust_output(self, stdout, stderr, code): @@ -150,7 +181,9 @@ def _parse_rust_output(self, stdout, stderr, code): failures = stdout.count("FAILED") errors = stderr.count("error:") - failure_details = re.findall(r"---- .*? ----\n.*?\n\nthread.*?panicked.*?\n(.*?)\n\n", stdout, re.DOTALL) + failure_details = re.findall( + r"---- .*? 
----\n.*?\n\nthread.*?panicked.*?\n(.*?)\n\n", stdout, re.DOTALL + ) error_details = re.findall(r"error:.*?\n(.*?)\n\n", stderr, re.DOTALL) return { diff --git a/kaizen/generator/code_fixer.py b/kaizen/generator/code_fixer.py new file mode 100644 index 00000000..19fe8bac --- /dev/null +++ b/kaizen/generator/code_fixer.py @@ -0,0 +1,51 @@ +from typing import List, Dict, Optional +import logging +import json +from dataclasses import dataclass +from kaizen.llms.provider import LLMProvider +from kaizen.llms.prompts.code_fix_prompts import ( + CODE_FIX_SYSTEM_PROMPT, + CODE_FIX_PROMPT, +) + + +@dataclass +class CodeFixerOutput: + fixed_code: dict + total_usage: dict + + +class CodeFixer: + def __init__(self): + self.logger = logging.getLogger(__name__) + self.provider = LLMProvider() + self.provider.system_prompt = ( + CODE_FIX_SYSTEM_PROMPT # You'll need to define this + ) + self.total_usage = { + "prompt_tokens": 0, + "completion_tokens": 0, + "total_tokens": 0, + } + + def fix_code( + self, original_code: str, issues: List[Dict], user: Optional[str] = None + ) -> CodeFixerOutput: + self.logger.info("Starting code fixing process") + + fix_prompt = CODE_FIX_PROMPT.format( + file_content=original_code, issue_json=json.dumps(issues, indent=2) + ) + + if not self.provider.is_inside_token_limit(PROMPT=fix_prompt): + self.logger.warning( + f"Fix prompt for issue exceeds token limit. Skipping." 
+ ) + raise Exception("File Size too big!") + + resp, usage = self.provider.chat_completion_with_json( + fix_prompt, user=user, model="best" + ) + self.total_usage = self.provider.update_usage(self.total_usage, usage) + + return CodeFixerOutput(fixed_code=resp, total_usage=self.total_usage) diff --git a/kaizen/generator/pr_description.py b/kaizen/generator/pr_description.py index 5bf0c4f1..24fe3bfd 100644 --- a/kaizen/generator/pr_description.py +++ b/kaizen/generator/pr_description.py @@ -129,10 +129,16 @@ def _process_files_generator( filename.split(".")[-1] not in parser.EXCLUDED_FILETYPES and patch_details is not None ): - - diff_parts.append(f"\n---->\nFile Name: {filename}\nPatch Details: \n{patch_details}") - if available_tokens - self.provider.get_token_count("".join(diff_parts)) > 0: + diff_parts.append( + f"\n---->\nFile Name: {filename}\nPatch Details: \n{patch_details}" + ) + + if ( + available_tokens + - self.provider.get_token_count("".join(diff_parts)) + > 0 + ): combined_diff_data = "".join(diff_parts) continue @@ -142,7 +148,9 @@ def _process_files_generator( pull_request_desc, user, ) - diff_parts = [f"\n---->\nFile Name: {filename}\nPatch Details: {patch_details}"] + diff_parts = [ + f"\n---->\nFile Name: {filename}\nPatch Details: {patch_details}" + ] if diff_parts: yield self._process_file_chunk( diff --git a/kaizen/generator/unit_test.py b/kaizen/generator/unit_test.py index 51e81fac..88626bc4 100644 --- a/kaizen/generator/unit_test.py +++ b/kaizen/generator/unit_test.py @@ -117,7 +117,7 @@ def generate_tests_from_dir( usage=self.total_usage, model_name=self.provider.model, cost={"prompt_cost": prompt_cost, "completion_cost": completion_cost}, - scenarios=self.test_scenarios + scenarios=self.test_scenarios, ) def generate_tests( diff --git a/kaizen/helpers/parser.py b/kaizen/helpers/parser.py index 5f7c6dd9..cc993570 100644 --- a/kaizen/helpers/parser.py +++ b/kaizen/helpers/parser.py @@ -4,24 +4,54 @@ EXCLUDED_FILETYPES = [ # Compiled output 
- "class", "o", "obj", "exe", "dll", "pyc", "pyo", + "class", + "o", + "obj", + "exe", + "dll", + "pyc", + "pyo", # Package manager files "lock", # Covers package-lock.json, yarn.lock, Gemfile.lock, composer.lock # IDE configurations - "idea", "vscode", "project", "classpath", + "idea", + "vscode", + "project", + "classpath", # Binary and large files - "zip", "tar", "gz", "rar", "pdf", "doc", "docx", "xls", "xlsx", - "jpg", "jpeg", "png", "gif", "bmp", "ico", "mp3", "mp4", "avi", "mov", + "zip", + "tar", + "gz", + "rar", + "pdf", + "doc", + "docx", + "xls", + "xlsx", + "jpg", + "jpeg", + "png", + "gif", + "bmp", + "ico", + "mp3", + "mp4", + "avi", + "mov", # Log files "log", # Database files - "db", "sqlite", + "db", + "sqlite", # Temporary files - "tmp", "temp", + "tmp", + "temp", # OS-specific files - "DS_Store", "Thumbs.db", + "DS_Store", + "Thumbs.db", # Configuration files - "gitignore", "dockerignore", + "gitignore", + "dockerignore", # Add any other specific extensions you want to exclude ] @@ -42,14 +72,14 @@ def should_ignore_file(filepath): """ Check if a file should be ignored based on its path, name, or extension. 
- + :param filepath: The full path of the file to check :return: True if the file should be ignored, False otherwise """ # Get the file name and extension filename = os.path.basename(filepath) _, extension = os.path.splitext(filename) - extension = extension.lstrip('.') # Remove the leading dot + extension = extension.lstrip(".") # Remove the leading dot # Check if the file is in an excluded folder for folder in EXCLUDED_FOLDERS: @@ -61,7 +91,14 @@ def should_ignore_file(filepath): return True # Check for specific filenames - if filename in ["package-lock.json", "yarn.lock", "Gemfile.lock", "composer.lock", ".DS_Store", "Thumbs.db"]: + if filename in [ + "package-lock.json", + "yarn.lock", + "Gemfile.lock", + "composer.lock", + ".DS_Store", + "Thumbs.db", + ]: return True return False diff --git a/kaizen/llms/prompts/code_fix_prompts.py b/kaizen/llms/prompts/code_fix_prompts.py new file mode 100644 index 00000000..24e93307 --- /dev/null +++ b/kaizen/llms/prompts/code_fix_prompts.py @@ -0,0 +1,77 @@ +CODE_FIX_SYSTEM_PROMPT = """ +You are an expert software engineer and code reviewer with extensive experience across multiple programming languages and best practices. Your task is to analyze code, identify issues, and provide fixes while adhering to the following guidelines: + +1. Accuracy: Ensure that your fixes address the identified issues correctly and do not introduce new problems. +2. Best Practices: Apply industry-standard best practices and coding conventions appropriate for the given programming language. +3. Security: Pay special attention to security-related issues and provide robust, secure solutions. +4. Readability: Improve code readability and maintainability where possible, without drastically changing the overall structure. +5. Performance: Consider performance implications of your fixes, optimizing where appropriate. +6. Explanation: Provide clear, concise explanations for your changes to help developers understand the rationale behind each fix. +7. 
Confidence: Assess the necessity and impact of each change, providing a confidence score to indicate whether a change should be applied or could potentially be ignored. + +You will be provided with file contents and issue information. Your responses should be thorough, precise, and actionable, allowing developers to easily understand and implement your suggested fixes. +""" + + +CODE_FIX_PROMPT = """ +You are an AI assistant tasked with fixing code issues. You will be provided with the full content of a file and a JSON object containing information about the issues in that file. Your task is to provide the fixed code, along with additional information. + +File Content: +{file_content} + +Issue Information: +{issue_json} + +Please provide the following information for each issue: + +1. Original Code: The specific lines of code where the issue occurs. +2. Fixed Code: The updated code that addresses the issue. +3. Lines Changed: The line numbers where changes were made. +4. Confidence Score: A score out of 10 indicating how confident you are that this change should be performed (10) or can be ignored (1). + +Format your response in JSON format as shown below: +{{ + "fixes": [{{ + "original_code": "The specific lines of code where the issue occurs", + "fixed_code": "The updated code that addresses the issue", + "start_line": "Starting line number of the change", + "end_line": "Ending line number of the change", + "confidence_score": "Score out of 10 indicating confidence in the fix", + "needs_fix": "yes|no", + "reason_for_fix": "Explanation of the issue and why the fix is necessary", + "needs_additional_changes": "yes|no", + "additional_change_summary": "Summary of any additional changes needed", + "needs_additional_changes": "yes|no", + "additional_change_summary": "Summarized changes needed to fix this issue", + "fix_affect_other_part_of_code": "yes|no" + }}] +}} + +Guidelines: +1. 
"needs_additional_changes" should be yes only when fixing the current code alone doesn't address the issue and we need to change things at multiple places. +2. Always try to find a way to fix without any additional changes if possible. +3. Only include fixes for issues which make sense; feel free to ignore issues which don't make sense + +""" + +APPLY_FIXES_PROMPT = """ +You are an AI assistant tasked with applying multiple fixes to a file. You have been provided with the original file content and a list of fixes to apply. Your task is to apply all the fixes and return the fully updated file content. + +Original File Content: +{original_file_content} + +Fixes to Apply: +{fixes} + +Please apply all the fixes to the original file content and return the fully updated file. Make sure to maintain the original structure and formatting of the file where possible, only changing the specific lines that need to be updated. + +Return the updated file content in the following format: + +Updated File Content: +```language +UPDATED_FILE_CONTENT +``` + +If there are any conflicts or issues in applying the fixes, please note them after the updated file content. 
+ +""" diff --git a/kaizen/llms/prompts/code_scan_prompts.py b/kaizen/llms/prompts/code_scan_prompts.py index 0d4369f9..17109843 100644 --- a/kaizen/llms/prompts/code_scan_prompts.py +++ b/kaizen/llms/prompts/code_scan_prompts.py @@ -57,7 +57,7 @@ {{ "issues": [ {{ "severity": "critical|high|medium|low", - "category": "security|performance|quality|bug", + "category": "general|performance|security|refactoring|best_practices|duplication|maintainability|scalability|error_handling|resource_management|concurrency|dependencies|compatibility|accessibility|localization|efficiency|readability|naming", "description": "Brief description of the issue", "location": {{ "line_start": 1, @@ -100,7 +100,7 @@ "issues": [ {{ "severity": "critical|high|medium|low", - "category": "security|performance|quality|bug", + "category": "general|performance|security|refactoring|best_practices|duplication|maintainability|scalability|error_handling|resource_management|concurrency|dependencies|compatibility|accessibility|localization|efficiency|readability|naming", "description": "Brief description of the issue", "location": {{ "line_start": 1, diff --git a/kaizen/llms/provider.py b/kaizen/llms/provider.py index 08268d9e..0045e886 100644 --- a/kaizen/llms/provider.py +++ b/kaizen/llms/provider.py @@ -33,6 +33,7 @@ def set_all_loggers_to_ERROR(): logging.basicConfig( level=LOGLEVEL, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) +logger = logging.getLogger(__name__) class LLMProvider: @@ -204,6 +205,7 @@ def chat_completion_with_json( custom_model=custom_model, messages=messages, ) + # logger.info(f"completiong response: {response}") response = extract_json(response) return response, usage diff --git a/kaizen/reviewer/code_review.py b/kaizen/reviewer/code_review.py index 2db16c5b..2138fd9f 100644 --- a/kaizen/reviewer/code_review.py +++ b/kaizen/reviewer/code_review.py @@ -182,7 +182,7 @@ def review_pull_request( issues=reviews, code_quality=code_quality, cost={"prompt_cost": 
prompt_cost, "completion_cost": completion_cost}, - file_count=self.files_processed + file_count=self.files_processed, ) def _process_full_diff( @@ -246,14 +246,17 @@ def _process_files_generator( patch_details = file.get("patch") filename = file.get("filename", "").replace(" ", "") - if ( - not parser.should_ignore_file(filename) - and patch_details is not None - ): + if not parser.should_ignore_file(filename) and patch_details is not None: self.files_processed += 1 - diff_parts.append(f"\n---->\nFile Name: {filename}\nPatch Details:\n{parser.patch_to_combined_chunks(patch_details, self.ignore_deletions)}") + diff_parts.append( + f"\n---->\nFile Name: {filename}\nPatch Details:\n{parser.patch_to_combined_chunks(patch_details, self.ignore_deletions)}" + ) - if available_tokens - self.provider.get_token_count("".join(diff_parts)) > 0: + if ( + available_tokens + - self.provider.get_token_count("".join(diff_parts)) + > 0 + ): combined_diff_data = "".join(diff_parts) continue @@ -265,7 +268,9 @@ def _process_files_generator( reeval_response, custom_context, ) - diff_parts = [f"\n---->\nFile Name: {filename}\nPatch Details: {parser.patch_to_combined_chunks(patch_details, self.ignore_deletions)}"] + diff_parts = [ + f"\n---->\nFile Name: {filename}\nPatch Details: {parser.patch_to_combined_chunks(patch_details, self.ignore_deletions)}" + ] if diff_parts: yield self._process_file_chunk( diff --git a/pyproject.toml b/pyproject.toml index dd1ac497..4e9a1394 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "kaizen-cloudcode" -version = "0.4.17" +version = "0.4.18" description = "An intelligent coding companion that accelerates your development workflow by providing efficient assistance, enabling you to craft high-quality code more rapidly." authors = ["Saurav Panda "] license = "Apache2.0"