From c9a10fb1836c1db3a2512411efe405712cb36bed Mon Sep 17 00:00:00 2001 From: Bowen Li Date: Sun, 21 May 2023 20:42:44 +0800 Subject: [PATCH 01/10] Add sde team programming task --- agentverse/environments/__init__.py | 1 + .../environments/rules/order/__init__.py | 1 + .../environments/rules/order/sde_team.py | 30 +++ .../environments/rules/selector/__init__.py | 1 + .../environments/rules/selector/code_api.py | 103 +++++++ .../environments/rules/selector/sde_team.py | 72 +++++ .../environments/rules/updater/__init__.py | 1 + .../environments/rules/updater/sde_team.py | 48 ++++ .../environments/rules/visibility/__init__.py | 1 + .../environments/rules/visibility/sde_team.py | 24 ++ agentverse/environments/sde_team.py | 127 +++++++++ agentverse/memory/__init__.py | 1 + agentverse/memory/sde_team.py | 32 +++ agentverse/tasks/__init__.py | 1 + .../tasks/sde_team_3players_nolc/config.yaml | 252 ++++++++++++++++++ .../sde_team_3players_nolc/output_parser.py | 18 ++ main.py | 2 +- 17 files changed, 714 insertions(+), 1 deletion(-) create mode 100644 agentverse/environments/rules/order/sde_team.py create mode 100644 agentverse/environments/rules/selector/code_api.py create mode 100644 agentverse/environments/rules/selector/sde_team.py create mode 100644 agentverse/environments/rules/updater/sde_team.py create mode 100644 agentverse/environments/rules/visibility/sde_team.py create mode 100644 agentverse/environments/sde_team.py create mode 100644 agentverse/memory/sde_team.py create mode 100644 agentverse/tasks/sde_team_3players_nolc/config.yaml create mode 100644 agentverse/tasks/sde_team_3players_nolc/output_parser.py diff --git a/agentverse/environments/__init__.py b/agentverse/environments/__init__.py index 7617d9476..8b5e1517c 100644 --- a/agentverse/environments/__init__.py +++ b/agentverse/environments/__init__.py @@ -6,3 +6,4 @@ from .base import BaseEnvironment from .basic import BasicEnvironment from .pokemon import PokemonEnvironment +from .sde_team import 
SdeTeamEnvironment diff --git a/agentverse/environments/rules/order/__init__.py b/agentverse/environments/rules/order/__init__.py index 4ac22ec7b..14b95c9fb 100644 --- a/agentverse/environments/rules/order/__init__.py +++ b/agentverse/environments/rules/order/__init__.py @@ -7,3 +7,4 @@ from .concurrent import ConcurrentOrder from .classroom import ClassroomOrder from .prisoner import PrisonerOrder +from .sde_team import SdeTeamOrder diff --git a/agentverse/environments/rules/order/sde_team.py b/agentverse/environments/rules/order/sde_team.py new file mode 100644 index 000000000..ac0d54267 --- /dev/null +++ b/agentverse/environments/rules/order/sde_team.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +import logging +import re +import random +from typing import TYPE_CHECKING, Any, List, Optional + +from . import order_registry as OrderRegistry +from .base import BaseOrder + +if TYPE_CHECKING: + from agentverse.environments import BaseEnvironment + + +@OrderRegistry.register("sde_team") +class SdeTeamOrder(BaseOrder): + """The order for a code problem solving + """ + next_agent_idx: int = 2 + + def get_next_agent_idx(self, environment: BaseEnvironment) -> List[int]: + if self.next_agent_idx == 2: + self.next_agent_idx = 0 + return [2] * 5 # TODO set the number in yaml + elif self.next_agent_idx == 0: + self.next_agent_idx = 1 + return [0] + elif self.next_agent_idx == 1: + self.next_agent_idx = 0 + return [1] \ No newline at end of file diff --git a/agentverse/environments/rules/selector/__init__.py b/agentverse/environments/rules/selector/__init__.py index da11bda59..56b520dbe 100644 --- a/agentverse/environments/rules/selector/__init__.py +++ b/agentverse/environments/rules/selector/__init__.py @@ -5,3 +5,4 @@ from .base import BaseSelector from .basic import BasicSelector from .classroom import ClassroomSelector +from .sde_team import SdeTeamSelector diff --git a/agentverse/environments/rules/selector/code_api.py 
b/agentverse/environments/rules/selector/code_api.py new file mode 100644 index 000000000..38245bf7f --- /dev/null +++ b/agentverse/environments/rules/selector/code_api.py @@ -0,0 +1,103 @@ +import io +import sys +import ast +import json +import astunparse +import concurrent.futures +from typing import List + + +def get_call_str(assert_statement: str) -> str: + call_str = ast.parse(assert_statement).body[0].test.left # type: ignore + return astunparse.unparse(call_str).strip() + +def get_output(func: str, assert_statement: str) -> str: + try: + func_call = get_call_str(assert_statement) + exec(func, globals()) + output = eval(func_call) + return output + except Exception as e: + return str(type(e).__name__) + +def worker(code, globals=None, locals=None): + old_stdout = sys.stdout + redirected_output = sys.stdout = io.StringIO() + if locals is None: + locals = {} + try: + # TODO: exec(code, globals, locals) could be buggy + # In cases where both import statement and function exits in the code, if the locals are given, + # the code will not find the imported package. + # For example, + # code = "import math\ndef f(x):\n\treturn math.pow(x, 2)\nassert f(2) == 4" + # It will return "NameError: name 'math' is not defined" + exec(code, locals, locals) + stdout = redirected_output.getvalue() + return stdout, globals, locals + except Exception as e: + return f"Error: {e}", globals, locals + finally: + sys.stdout = old_stdout # restore the original stdout + +def execute_code(code: str) -> str: + """Execute a snippet of python code and return the output or the error message. + """ + timeout = 5 + try: + with concurrent.futures.ThreadPoolExecutor() as executor: + future = executor.submit(worker, code) + result, _, _ = future.result(timeout) + return result + except concurrent.futures.TimeoutError: + return "Timeout" + +def execute_unit_tests(func_impl: str, tests: str) -> str: + """Run a python function on a bunch of unit tests tests and return detailed feedback. 
+ """ + # tests = eval(tests) + # assert type(tests) == list + + # Combine function code and assert statement + func_test_list = [f'{func_impl}\n{test}' for test in tests] + + # Run the tests and collect the results + success_tests = [] + failed_tests = [] + is_passing = True + num_tests = len(func_test_list) + for i in range(num_tests): + output = execute_code(func_test_list[i]) + if output == "Timeout": + failed_tests += [f"{tests[i]} # output: Timeout"] + is_passing = False + elif output.startswith("Error: "): + print(output) + func_output = get_output(func_impl, tests[i]) + failed_tests += [f"{tests[i]} # output: {func_output}"] + is_passing = False + else: + success_tests += [tests[i]] + + feedback = "Tested passed:" + for test in success_tests: + feedback += f"\n{test}" + feedback += "\n\nTests failed:" + for test in failed_tests: + feedback += f"\n{test}" + + return json.dumps({"is_passing": is_passing, + "feedback": feedback}) + +if __name__ == "__main__": + code = """ +from typing import List + +def two_sum(nums: List[int], target: int) -> List[int]: + for i in range(len(nums)): + for j in range(len(nums)): + if nums[i] + nums[j] == target: + return [i, j] +""" + tests = ["assert two_sum([2, 7, 11, 15], 9) == [0, 1]", "assert two_sum([3, 2, 4], 6) == [1, 2]", "assert two_sum([3, 5, 2, 7], 9) == [2, 3]"] + print(execute_unit_tests(code, tests)) \ No newline at end of file diff --git a/agentverse/environments/rules/selector/sde_team.py b/agentverse/environments/rules/selector/sde_team.py new file mode 100644 index 000000000..7a4b571ad --- /dev/null +++ b/agentverse/environments/rules/selector/sde_team.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, List + +from agentverse.message import Message + +from . 
import selector_registry as SelectorRegistry +from .base import BaseSelector + +import json +import re + +if TYPE_CHECKING: + from agentverse.environments import BaseEnvironment + +def extract(content: str, keyword: str): + result = "" + flag = False + for line in content.split('\n'): + if line.strip().startswith(keyword): + flag = True + continue + if flag: + result += line + result += "\n" + return result + + +@SelectorRegistry.register("sde_team") +class SdeTeamSelector(BaseSelector): + def select_message(self, environment: BaseEnvironment, messages: List[Message]) -> List[Message]: + last_sender = environment.last_messages[0].sender + selected = messages + + if last_sender == "unit_test_generator": + unit_tests = set() + for message in selected: + unit_test = extract(message.content, ":") + if unit_test not in unit_tests: + unit_tests.add(extract(message.content, ":")) + unit_tests = list(unit_tests) + environment.rule_params["unit_tests"] = str(unit_tests) + new_message = Message( + content="", + sender="unit_test_generator", + receiver=[], + ) # TODO: set the content of the message + selected = [new_message] + + elif last_sender == "code_writer": + cur_code = extract(selected[0].content, ":") + environment.rule_params["code"] = cur_code + + from .code_api import execute_unit_tests + feedback = execute_unit_tests(environment.rule_params["code"], eval(environment.rule_params["unit_tests"])) + + environment.rule_params["feedback"] = feedback + selected[0].content = f":\n\n{cur_code}\n\n:\n{feedback}" + f_dict = json.loads(feedback) + if f_dict["is_passing"]: + environment.rule_params["end_flag"] = True + + elif last_sender == "code_reviewer": + code_review = selected[0].content + cur_code = environment.rule_params["code"] + selected[0].content = f":\n\n{cur_code}\n\n{code_review}" + feedback = environment.rule_params["feedback"] + f_dict = json.loads(feedback) + if f_dict["is_passing"]: + environment.rule_params["end_flag"] = True + + return selected \ No 
newline at end of file diff --git a/agentverse/environments/rules/updater/__init__.py b/agentverse/environments/rules/updater/__init__.py index 81c30a37f..b61e66f1f 100644 --- a/agentverse/environments/rules/updater/__init__.py +++ b/agentverse/environments/rules/updater/__init__.py @@ -5,3 +5,4 @@ from .base import BaseUpdater from .basic import BasicUpdater from .classroom import ClassroomUpdater +from .sde_team import SdeTeamUpdater diff --git a/agentverse/environments/rules/updater/sde_team.py b/agentverse/environments/rules/updater/sde_team.py new file mode 100644 index 000000000..049bae413 --- /dev/null +++ b/agentverse/environments/rules/updater/sde_team.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, List, Tuple + +from . import updater_registry as UpdaterRegistry +from .base import BaseUpdater +from agentverse.message import Message + +if TYPE_CHECKING: + from agentverse.environments import BaseEnvironment + from agentverse.agents import BaseAgent + + +@UpdaterRegistry.register("sde_team") +class SdeTeamUpdater(BaseUpdater): + """ + The basic version of updater. + The messages will be seen by all the receiver specified in the message. 
+ """ + + def update_memory(self, environment: BaseEnvironment): + added = False + for message in environment.last_messages: + if message.content == "": + continue + added |= self.add_message_to_all_agents(environment.agents, message) + + def add_message_to_all_agents( + self, agents: List[BaseAgent], message: Message + ) -> bool: + if "all" in message.receiver: + # If receiver is all, then add the message to all agents + for agent in agents: + agent.add_message_to_memory([message]) + return True + else: + # If receiver is not all, then add the message to the specified agents + receiver_set = message.receiver + for agent in agents: + if agent.name in receiver_set: + agent.add_message_to_memory([message]) + receiver_set.remove(agent.name) + if len(receiver_set) > 0: + missing_receiver = ", ".join(list(receiver_set)) + raise ValueError( + "Receiver {} not found. Message discarded".format(missing_receiver) + ) + return True diff --git a/agentverse/environments/rules/visibility/__init__.py b/agentverse/environments/rules/visibility/__init__.py index c5fba1455..3ce2ba3c1 100644 --- a/agentverse/environments/rules/visibility/__init__.py +++ b/agentverse/environments/rules/visibility/__init__.py @@ -9,3 +9,4 @@ from .classroom import ClassroomVisibility from .oneself import OneselfVisibility from .prisoner import PrisonerVisibility +from .sde_team import SdeTeamVisibility diff --git a/agentverse/environments/rules/visibility/sde_team.py b/agentverse/environments/rules/visibility/sde_team.py new file mode 100644 index 000000000..be56a2fad --- /dev/null +++ b/agentverse/environments/rules/visibility/sde_team.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +import random +from typing import TYPE_CHECKING, Any, List, Union + +from . 
import visibility_registry as VisibilityRegistry +from .base import BaseVisibility + +if TYPE_CHECKING: + from agentverse.environments import BaseEnvironment + + +@VisibilityRegistry.register("sde_team") +class SdeTeamVisibility(BaseVisibility): + """ + Visibility function for code problem. No need to change visibility. + + """ + + def update_visible_agents(self, environment: BaseEnvironment): + return + + def reset(self): + return \ No newline at end of file diff --git a/agentverse/environments/sde_team.py b/agentverse/environments/sde_team.py new file mode 100644 index 000000000..cb1735dca --- /dev/null +++ b/agentverse/environments/sde_team.py @@ -0,0 +1,127 @@ +import asyncio +import logging +from typing import Any, Dict, List +from icecream import ic +import json + +from agentverse.agents.conversation_agent import BaseAgent +from agentverse.environments.rules.base import Rule +from agentverse.message import Message + +from . import env_registry as EnvironmentRegistry +from .base import BaseEnvironment +from agentverse.initialization import load_tools + + +@EnvironmentRegistry.register("sde_team") +class SdeTeamEnvironment(BaseEnvironment): + """ + A basic environment implementing the logic of conversation to craft code. 
+ + Args: + agents: List of agents + rule: Rule for the environment + max_turns: Maximum number of turns + cnt_turn: Current turn number + last_messages: Messages from last turn + rule_params: Variables set by the rule + """ + + agents: List[BaseAgent] + rule: Rule + max_turns: int = 10 + cnt_turn: int = 0 + last_messages: List[Message] = [] + rule_params: Dict = {} + task_name: str = "test" + + def __init__(self, rule, **kwargs): + rule_config = rule + order_config = rule_config.get("order", {"type": "sde_team"}) + visibility_config = rule_config.get("visibility", {"type": "base"}) + selector_config = rule_config.get("selector", {"type": "sde_team"}) + updater_config = rule_config.get("updater", {"type": "sde_team"}) + describer_config = rule_config.get("describer", {"type": "base"}) + rule = Rule( + order_config, + visibility_config, + selector_config, + updater_config, + describer_config, + ) + super().__init__(rule=rule, **kwargs) + self.rule_params["first_round"] = True + self.rule_params["end_flag"] = False + + # # Test code + # self.rule_params["name_to_tools"] = {tool.name: tool for tool in load_tools([{"tool_name": "code_interpreter", "tool_url" : "http://127.0.0.1:8079/tools/code_interpreter/"}])} + # tool = self.rule_params["name_to_tools"]["execute_unit_tests"] + # print(type(tool)) + + # d = { + # "func_impl": "def f(x):\n\treturn x + 1", + # "tests": ["assert f(1) == 2"] + # } + # # input_str = json.dumps(d) + # json.loads(input_str) + # tool.run(input_str, verbose=True) + # exit() + + async def step(self) -> List[Message]: + """Run one step of the environment""" + + # Get the next agent index + agent_ids = self.rule.get_next_agent_idx(self) # order + + # Generate current environment description + # env_descriptions = self.rule.get_env_description(self) # describer + + # # Generate the next message + # messages = await asyncio.gather( + # *[self.agents[i].astep(env_descriptions[i]) for i in agent_ids] + # ) # call chatgpt api + + messages = await 
asyncio.gather( + *[self.agents[i].astep("") for i in agent_ids] + ) + + # Track the messages to get the role of the sender + self.last_messages = messages + + # Some rules will select certain messages from all the messages + selected_messages = self.rule.select_message(self, messages) # selector + self.last_messages = selected_messages + self.print_messages(selected_messages) + + # Update the memory of the agents + self.rule.update_memory(self) # updater: update memory + + # Update the set of visible agents for each agent + self.rule.update_visible_agents(self) # change receiver + + self.cnt_turn += 1 + + return selected_messages + + def print_messages(self, messages: List[Message]) -> None: + for message in messages: + if message is not None: + logging.info(f"{message.sender}: {message.content}") + + def reset(self) -> None: + """Reset the environment""" + self.cnt_turn = 0 + self.rule.reset() + for agent in self.agents: + agent.reset() + + def is_done(self) -> bool: + """Check if the environment is done""" + if self.cnt_turn >= self.max_turns or self.rule_params["end_flag"]: + # with open("record_human_eval.txt", "a") as f: + # wd = dict() + # wd['task_id'] = self.task_name + # wd['code'] = self.rule_params['code'] + # f.write(json.dumps(wd)) + return True + return False diff --git a/agentverse/memory/__init__.py b/agentverse/memory/__init__.py index 2ced03877..9d053b86e 100644 --- a/agentverse/memory/__init__.py +++ b/agentverse/memory/__init__.py @@ -5,3 +5,4 @@ from .base import BaseMemory from .chat_history import ChatHistoryMemory from .summary import SummaryMemory +from .sde_team import SdeTeamMemory diff --git a/agentverse/memory/sde_team.py b/agentverse/memory/sde_team.py new file mode 100644 index 000000000..435d297a4 --- /dev/null +++ b/agentverse/memory/sde_team.py @@ -0,0 +1,32 @@ +import re +from string import Template +from typing import List + +from pydantic import Field, validator + +from agentverse.initialization import load_llm +from 
agentverse.llms.base import BaseLLM +from agentverse.message import Message + +from . import memory_registry +from .base import BaseMemory + + +@memory_registry.register("sde_team") +class SdeTeamMemory(BaseMemory): + """SdeTeamMemory is a memory for SdeTeamEnvironment. + It is a simple memory that only stores the most recent info in the buffer. + TODO: add summarized history + """ + buffer: str = Field(default="") + + def add_message(self, messages: List[Message]) -> None: + new_lines = "\n".join([message.content for message in messages]) + # self.update_buffer(new_lines) + self.buffer = new_lines + + def to_string(self, *args, **kwargs) -> str: + return self.buffer + + def reset(self) -> None: + self.buffer = "" diff --git a/agentverse/tasks/__init__.py b/agentverse/tasks/__init__.py index 200679c96..29ac6ef97 100644 --- a/agentverse/tasks/__init__.py +++ b/agentverse/tasks/__init__.py @@ -21,3 +21,4 @@ NlpClassroom3PlayersWithtoolNolcParser, ) from .pokemon.output_parser import PokemonParser +from .sde_team_3players_nolc.output_parser import SdeTeamParser diff --git a/agentverse/tasks/sde_team_3players_nolc/config.yaml b/agentverse/tasks/sde_team_3players_nolc/config.yaml new file mode 100644 index 000000000..fe4ff4369 --- /dev/null +++ b/agentverse/tasks/sde_team_3players_nolc/config.yaml @@ -0,0 +1,252 @@ +prompts: + prompt: &prompt |- + You are working in a programming team to solve a python code implementation problem. + + ${role_description} + + ${chat_history} + + code_writer_role_prompt: &code_writer_role_prompt | + You are a professional coding assistant. You will be given a function signature and docstring (in ). You should copy the import statement, the function signature first and then finish the function body. Before writing any code, you should first think about the task and write down your thoughts in . Then you can write your code in . + + Your submitted code (in ) will be tested on a series of unit tests. 
You will be given the feedback (in ) of the test, together with the review of your code (in ) of a professional code reviewer. You can use these feedbacks to refine your code. The refinement will take multiple iterations. You can use the to record your thoughts during the refinement. + + Here is the steps of the code writing and refinement process: + 1 - generate + 2 - generate + + The following is two examples on code writing and refinement. + + [Example on code writing]: + + : + def is_palindrome(x: int) -> bool: + """ Given an integer x, return True if x is a palindrome, and False otherwise. + An integer is a palindrome when it reads the same forward and backward. + >>> is_palindrome(121) + True + >>> is_palindrome(10) + False + """ + + : + I need to convert the integer to a string and then check if the string is a palindrome. + + : + def is_palindrome(x: int) -> bool: + s = str(x) + return s == s[::-1] + + [Example on refinement]: + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + """ Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. + You may assume that each input would have exactly one solution, and you may not use the same element twice. + The answer should be in an ascending order. + >>> two_sum([2,7,11,15], 9) + [0, 1] + >>> two_sum([3,2,4], 6) + [1, 2] + """ + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + for i in range(len(nums)): + for j in range(len(nums)): + if nums[i] + nums[j] == target: + return [i, j] + + : + {"is_passing": false, "feedback": "Tested passed:\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\n\nTests failed:\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0, 0]"} + + : + The previous code is not correct. It failed all the tests. The second for loop should start from i + 1. + + : + The code reviewer is right. 
I should start the second for loop from i + 1. + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + for i in range(len(nums)): + for j in range(i + 1, len(nums)): + if nums[i] + nums[j] == target: + return [i, j] + + [Start of new task]: + + Now please solve the following problem. + + : + def longestPalindrome(self, s: str) -> str: + """ Given a string s, return the longest palindromic substring in s. + >>> longestPalindrome("babcsd") + "bab" + >>> longestPalindrome("cbxxbd") + "bxxb" + """ + + + code_reviewer_role_prompt: &code_reviewer_role_prompt | + You are a professional code reviewer. You will be given a function signature and docstring in . A code writer has submitted his completion in . + The code has been executed on a series of unit tests. The execution feedback is provided in . Your job is to write a code review in to help the code writer improve his code. + + The following is an examples. + + [Example]: + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + """ Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. + You may assume that each input would have exactly one solution, and you may not use the same element twice. + The answer should be in an ascending order. + >>> two_sum([2,7,11,15], 9) + [0, 1] + >>> two_sum([3,2,4], 6) + [1, 2] + """ + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + for i in range(len(nums)): + for j in range(len(nums)): + if nums[i] + nums[j] == target: + return [i, j] + + : + {"is_passing": false, "feedback": "Tested passed:\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\n\nTests failed:\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0, 0]"} + + : + The previous code is not correct. It failed all the tests. The second for loop should start from i + 1. 
+ + [Start of new task]: + + Now please review the following submitted code. + + : + def longestPalindrome(self, s: str) -> str: + """ Given a string s, return the longest palindromic substring in s. + >>> longestPalindrome("babcsd") + "bab" + >>> longestPalindrome("cbxxbd") + "bxxb" + """ + + + unit_test_generator_role_prompt: &unit_test_generator_role_prompt | + You are a code tester that produces unit test case. You will be given a function signature and docstring in . Your job is to reason and write ONE unit test case in to test the code. + + Here is the steps to write a unit test case. + 1 - compose a valid input in + 2 - given the , generate + 3 - generate the correct expected output + 4 - generate the assertion statement + + The following is an examples. + + [Example]: + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + """ Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. + You may assume that each input would have exactly one solution, and you may not use the same element twice. + The answer should be in an ascending order. + >>> two_sum([2,7,11,15], 9) + [0, 1] + >>> two_sum([3,2,4], 6) + [1, 2] + """ + + : + [5, 7, 1, 9, 3], 14 + + : + The output should be a list of two indices that sum up to 14. 5 (index 0) + 9 (index 3) = 14. So the output should be [0, 3]. + + : + [0, 3] + + : + assert two_sum([5, 7, 1, 9, 3], 14) == [0, 3] + + [Start of new task]: + + Now please write a unit test case for the following problem. There are serveral guys doing the same task, you should try to compose a unique test case. + + : + def longestPalindrome(self, s: str) -> str: + """ Given a string s, return the longest palindromic substring in s. 
+ >>> longestPalindrome("babcsd") + "bab" + >>> longestPalindrome("cbxxbd") + "bxxb" + """ + +# tools: &tools +# - tool_name: "execute_unit_tests" +# tool_url: "http://127.0.0.1:8079/tools/execute_unit_tests/" + +environment: + env_type: sde_team + max_turns: 5 + task_name: HumanEval/0 + rule: + order: + type: sde_team + visibility: + type: all + selector: + type: sde_team + updater: + type: sde_team + describer: + type: basic + +agents: + - agent_type: conversation + name: code_writer + role_description: *code_writer_role_prompt + memory: + memory_type: sde_team + prompt_template: *prompt + verbose: true + receiver: [code_writer, code_reviewer] + llm: + llm_type: gpt-3.5-turbo + temperature: 0.3 + max_tokens: 1024 + + - agent_type: conversation + name: code_reviewer + role_description: *code_reviewer_role_prompt + memory: + memory_type: sde_team + prompt_template: *prompt + verbose: true + receiver: [code_writer] + llm: + llm_type: gpt-3.5-turbo + temperature: 0.3 + max_tokens: 1024 + + - agent_type: conversation + name: unit_test_generator + role_description: *unit_test_generator_role_prompt + prompt_template: *prompt + verbose: true + llm: + llm_type: gpt-3.5-turbo + temperature: 1.0 + max_tokens: 1024 \ No newline at end of file diff --git a/agentverse/tasks/sde_team_3players_nolc/output_parser.py b/agentverse/tasks/sde_team_3players_nolc/output_parser.py new file mode 100644 index 000000000..9bb40fe86 --- /dev/null +++ b/agentverse/tasks/sde_team_3players_nolc/output_parser.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +import re +from typing import Union + +from langchain.agents import AgentOutputParser + +# from langchain.schema import AgentAction, AgentFinish + +from agentverse.parser import OutputParserError, output_parser_registry, OutputParser +from agentverse.llms.base import LLMResult +from agentverse.utils import AgentAction, AgentFinish + + +@output_parser_registry.register("sde_team_3players_nolc") +class SdeTeamParser(OutputParser): + 
def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: + return AgentFinish({"output": output.content}, output.content) diff --git a/main.py b/main.py index 8464b0413..ce2f12d8d 100644 --- a/main.py +++ b/main.py @@ -3,7 +3,7 @@ parser = ArgumentParser() # parser.add_argument("--task", type=str, default="nlp_classroom_9players") -parser.add_argument("--task", type=str, default="nlp_classroom_3players_nolc") +parser.add_argument("--task", type=str, default="sde_team_3players_nolcda") args = parser.parse_args() agentverse = AgentVerse.from_task(args.task) From 074ab28bb07fe01f0c52031889b556d8f9034b43 Mon Sep 17 00:00:00 2001 From: Bowen Li Date: Mon, 22 May 2023 16:37:49 +0800 Subject: [PATCH 02/10] Update local code api --- agentverse/environments/rules/selector/code_api.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/agentverse/environments/rules/selector/code_api.py b/agentverse/environments/rules/selector/code_api.py index 38245bf7f..b75e07f6f 100644 --- a/agentverse/environments/rules/selector/code_api.py +++ b/agentverse/environments/rules/selector/code_api.py @@ -4,7 +4,6 @@ import json import astunparse import concurrent.futures -from typing import List def get_call_str(assert_statement: str) -> str: @@ -18,7 +17,7 @@ def get_output(func: str, assert_statement: str) -> str: output = eval(func_call) return output except Exception as e: - return str(type(e).__name__) + return str(e) def worker(code, globals=None, locals=None): old_stdout = sys.stdout @@ -72,7 +71,7 @@ def execute_unit_tests(func_impl: str, tests: str) -> str: failed_tests += [f"{tests[i]} # output: Timeout"] is_passing = False elif output.startswith("Error: "): - print(output) + # print(output) func_output = get_output(func_impl, tests[i]) failed_tests += [f"{tests[i]} # output: {func_output}"] is_passing = False From 747beac4569257638d2acd0b51e39d0d7bad801d Mon Sep 17 00:00:00 2001 From: Bowen Li Date: Mon, 22 May 2023 17:12:10 +0800 Subject: [PATCH 
03/10] Add a readme for task sde_team_3players_nolc --- agentverse/tasks/sde_team_3players_nolc/readme.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 agentverse/tasks/sde_team_3players_nolc/readme.md diff --git a/agentverse/tasks/sde_team_3players_nolc/readme.md b/agentverse/tasks/sde_team_3players_nolc/readme.md new file mode 100644 index 000000000..dbfd5ebdf --- /dev/null +++ b/agentverse/tasks/sde_team_3players_nolc/readme.md @@ -0,0 +1,12 @@ +# A SDE team with three players + +### Roles + +- *code writer*: write Python code to fill in a function body given function signature and doc string. The generated code will be automatically tested on a series of unit test cases. This role is required to refine the previously submitted code if it hasn't passed all the tests. Test feedback and a code review will be given in this case. +- *code reviewer*: write detailed code review of the code submitted by the code writer considering the feedback of the unit test. +- *unit test generator*: generate a series of unit test cases for the coding problem. + +### Note + +The generated unit tests are not always perfect. They may not be correct and diversified. We will tune our method to encourage more diversity. We also plan to incorporate execution tools to raise the correctness of the generated cases. 
+ From 45488326f25cb478e7415a7f74b511da4be8f96a Mon Sep 17 00:00:00 2001 From: tzw2698 Date: Mon, 22 May 2023 19:06:23 +0800 Subject: [PATCH 04/10] Add 2players --- agentverse/environments/__init__.py | 1 + .../environments/rules/order/__init__.py | 1 + .../rules/order/sde_team_given_tests.py | 27 ++ .../environments/rules/selector/__init__.py | 1 + .../rules/selector/sde_team_given_tests.py | 57 ++++ .../environments/sde_team_given_tests.py | 114 ++++++++ agentverse/llms/openai.py | 1 + agentverse/tasks/__init__.py | 1 + .../sde_team_2players_nolc/build_config.py | 20 ++ .../sde_team_2players_nolc/code_problem.json | 8 + .../tasks/sde_team_2players_nolc/config.yaml | 197 ++++++++++++++ .../sde_team_2players_nolc/output_parser.py | 18 ++ .../partial_config.yaml | 168 ++++++++++++ .../tasks/sde_team_2players_nolc/readme.md | 253 ++++++++++++++++++ .../sde_team_3players_nolc/output_parser.py | 2 +- imgs/prison/police.png | Bin main.py | 10 - 17 files changed, 868 insertions(+), 11 deletions(-) create mode 100644 agentverse/environments/rules/order/sde_team_given_tests.py create mode 100644 agentverse/environments/rules/selector/sde_team_given_tests.py create mode 100644 agentverse/environments/sde_team_given_tests.py create mode 100644 agentverse/tasks/sde_team_2players_nolc/build_config.py create mode 100644 agentverse/tasks/sde_team_2players_nolc/code_problem.json create mode 100644 agentverse/tasks/sde_team_2players_nolc/config.yaml create mode 100644 agentverse/tasks/sde_team_2players_nolc/output_parser.py create mode 100644 agentverse/tasks/sde_team_2players_nolc/partial_config.yaml create mode 100644 agentverse/tasks/sde_team_2players_nolc/readme.md mode change 100755 => 100644 imgs/prison/police.png delete mode 100644 main.py diff --git a/agentverse/environments/__init__.py b/agentverse/environments/__init__.py index 8b5e1517c..594e9a024 100644 --- a/agentverse/environments/__init__.py +++ b/agentverse/environments/__init__.py @@ -7,3 +7,4 @@ from .basic 
import BasicEnvironment from .pokemon import PokemonEnvironment from .sde_team import SdeTeamEnvironment +from .sde_team_given_tests import SdeTeamGivenTestsEnvironment diff --git a/agentverse/environments/rules/order/__init__.py b/agentverse/environments/rules/order/__init__.py index 14b95c9fb..1c6ea9f0e 100644 --- a/agentverse/environments/rules/order/__init__.py +++ b/agentverse/environments/rules/order/__init__.py @@ -8,3 +8,4 @@ from .classroom import ClassroomOrder from .prisoner import PrisonerOrder from .sde_team import SdeTeamOrder +from .sde_team_given_tests import SdeTeamGivenTestsOrder diff --git a/agentverse/environments/rules/order/sde_team_given_tests.py b/agentverse/environments/rules/order/sde_team_given_tests.py new file mode 100644 index 000000000..02944f0f3 --- /dev/null +++ b/agentverse/environments/rules/order/sde_team_given_tests.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +import logging +import re +import random +from typing import TYPE_CHECKING, Any, List, Optional + +from . 
import order_registry as OrderRegistry +from .base import BaseOrder + +if TYPE_CHECKING: + from agentverse.environments import BaseEnvironment + + +@OrderRegistry.register("sde_team_given_tests") +class SdeTeamGivenTestsOrder(BaseOrder): + """The order for a code problem solving given unit tests + """ + next_agent_idx: int = 0 + + def get_next_agent_idx(self, environment: BaseEnvironment) -> List[int]: + if self.next_agent_idx == 0: + self.next_agent_idx = 1 + return [0] + elif self.next_agent_idx == 1: + self.next_agent_idx = 0 + return [1] \ No newline at end of file diff --git a/agentverse/environments/rules/selector/__init__.py b/agentverse/environments/rules/selector/__init__.py index 56b520dbe..98a7716cd 100644 --- a/agentverse/environments/rules/selector/__init__.py +++ b/agentverse/environments/rules/selector/__init__.py @@ -6,3 +6,4 @@ from .basic import BasicSelector from .classroom import ClassroomSelector from .sde_team import SdeTeamSelector +from .sde_team_given_tests import SdeTeamGivenTestsSelector diff --git a/agentverse/environments/rules/selector/sde_team_given_tests.py b/agentverse/environments/rules/selector/sde_team_given_tests.py new file mode 100644 index 000000000..8683e67fb --- /dev/null +++ b/agentverse/environments/rules/selector/sde_team_given_tests.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, List + +from agentverse.message import Message + +from . 
import selector_registry as SelectorRegistry +from .base import BaseSelector + +import json +import re + +if TYPE_CHECKING: + from agentverse.environments import BaseEnvironment + +def extract(content: str, keyword: str): + result = "" + flag = False + for line in content.split('\n'): + if line.strip().startswith(keyword): + flag = True + continue + if flag: + result += line + result += "\n" + return result + + +@SelectorRegistry.register("sde_team_given_tests") +class SdeTeamGivenTestsSelector(BaseSelector): + def select_message(self, environment: BaseEnvironment, messages: List[Message]) -> List[Message]: + last_sender = environment.last_messages[0].sender + selected = messages + + if last_sender == "code_writer": + cur_code = extract(selected[0].content, ":") + environment.rule_params["code"] = cur_code + + from .code_api import execute_unit_tests + feedback = execute_unit_tests(environment.rule_params["code"], eval(environment.unit_tests)) + + environment.rule_params["feedback"] = feedback + selected[0].content = f":\n\n{cur_code}\n\n:\n{feedback}" + f_dict = json.loads(feedback) + if f_dict["is_passing"]: + environment.rule_params["end_flag"] = True + + elif last_sender == "code_reviewer": + code_review = selected[0].content + cur_code = environment.rule_params["code"] + selected[0].content = f":\n\n{cur_code}\n\n{code_review}" + feedback = environment.rule_params["feedback"] + f_dict = json.loads(feedback) + if f_dict["is_passing"]: + environment.rule_params["end_flag"] = True + + return selected \ No newline at end of file diff --git a/agentverse/environments/sde_team_given_tests.py b/agentverse/environments/sde_team_given_tests.py new file mode 100644 index 000000000..5d113f8a6 --- /dev/null +++ b/agentverse/environments/sde_team_given_tests.py @@ -0,0 +1,114 @@ +import asyncio +import logging +from typing import Any, Dict, List +from icecream import ic +import json + +from agentverse.agents.conversation_agent import BaseAgent +from 
agentverse.environments.rules.base import Rule +from agentverse.message import Message + +from . import env_registry as EnvironmentRegistry +from .base import BaseEnvironment +from agentverse.initialization import load_tools + + +@EnvironmentRegistry.register("sde_team_given_tests") +class SdeTeamGivenTestsEnvironment(BaseEnvironment): + """ + A basic environment implementing the logic of conversation to craft code. + + Args: + agents: List of agents + rule: Rule for the environment + max_turns: Maximum number of turns + cnt_turn: Current turn number + last_messages: Messages from last turn + rule_params: Variables set by the rule + """ + + agents: List[BaseAgent] + rule: Rule + max_turns: int = 10 + cnt_turn: int = 0 + last_messages: List[Message] = [] + rule_params: Dict = {} + task_name: str = "test" + unit_tests: str = "" + + def __init__(self, rule, **kwargs): + rule_config = rule + order_config = rule_config.get("order", {"type": "sde_team_given_tests"}) + visibility_config = rule_config.get("visibility", {"type": "base"}) + selector_config = rule_config.get("selector", {"type": "sde_team_given_tests"}) + updater_config = rule_config.get("updater", {"type": "sde_team"}) + describer_config = rule_config.get("describer", {"type": "base"}) + rule = Rule( + order_config, + visibility_config, + selector_config, + updater_config, + describer_config, + ) + super().__init__(rule=rule, **kwargs) + self.rule_params["first_round"] = True + self.rule_params["end_flag"] = False + + async def step(self) -> List[Message]: + """Run one step of the environment""" + + # Get the next agent index + agent_ids = self.rule.get_next_agent_idx(self) # order + + # Generate current environment description + # env_descriptions = self.rule.get_env_description(self) # describer + + # # Generate the next message + # messages = await asyncio.gather( + # *[self.agents[i].astep(env_descriptions[i]) for i in agent_ids] + # ) # call chatgpt api + + messages = await asyncio.gather( + 
*[self.agents[i].astep("") for i in agent_ids] + ) + + # Track the messages to get the role of the sender + self.last_messages = messages + + # Some rules will select certain messages from all the messages + selected_messages = self.rule.select_message(self, messages) # selector + self.last_messages = selected_messages + self.print_messages(selected_messages) + + # Update the memory of the agents + self.rule.update_memory(self) # updater: update memory + + # Update the set of visible agents for each agent + self.rule.update_visible_agents(self) # change receiver + + self.cnt_turn += 1 + + return selected_messages + + def print_messages(self, messages: List[Message]) -> None: + for message in messages: + if message is not None: + logging.info(f"{message.sender}: {message.content}") + + def reset(self) -> None: + """Reset the environment""" + self.cnt_turn = 0 + self.rule.reset() + for agent in self.agents: + agent.reset() + + def is_done(self) -> bool: + """Check if the environment is done""" + if self.cnt_turn >= self.max_turns or self.rule_params["end_flag"]: + # with open("record_human_eval.txt", "a") as f: + # wd = dict() + # wd['task_id'] = self.task_name + # wd['code'] = self.rule_params['code'] + # f.write(json.dumps(wd)) + return True + return False diff --git a/agentverse/llms/openai.py b/agentverse/llms/openai.py index e79b3c633..188230dc1 100644 --- a/agentverse/llms/openai.py +++ b/agentverse/llms/openai.py @@ -20,6 +20,7 @@ openai.proxy = os.environ.get("http_proxy") if openai.proxy is None: openai.proxy = os.environ.get("HTTP_PROXY") + openai.proxy="http://127.0.0.1:7890" if openai.api_key is None: logging.warning( "OpenAI API key is not set. 
Please set the environment variable OPENAI_API_KEY" diff --git a/agentverse/tasks/__init__.py b/agentverse/tasks/__init__.py index 29ac6ef97..ef5c827c4 100644 --- a/agentverse/tasks/__init__.py +++ b/agentverse/tasks/__init__.py @@ -22,3 +22,4 @@ ) from .pokemon.output_parser import PokemonParser from .sde_team_3players_nolc.output_parser import SdeTeamParser +from .sde_team_2players_nolc.output_parser import SdeTeamGivenTestsParser diff --git a/agentverse/tasks/sde_team_2players_nolc/build_config.py b/agentverse/tasks/sde_team_2players_nolc/build_config.py new file mode 100644 index 000000000..1fea37cae --- /dev/null +++ b/agentverse/tasks/sde_team_2players_nolc/build_config.py @@ -0,0 +1,20 @@ +import yaml +import json + +config_path = "partial_config.yaml" + +code_problem = json.load(open("code_problem.json", "r")) +problem_string = "\n\n:\n" + code_problem["problem"] +unit_tests = str(code_problem["unit_tests"]) + +print(problem_string) +print(unit_tests) + +task_config = yaml.safe_load(open(config_path)) + +for agent_configs in task_config["agents"]: + agent_configs["role_description"] += problem_string +task_config["environment"]["unit_tests"] = unit_tests + +with open("config.yaml", "w") as f: + yaml.safe_dump(task_config, f) \ No newline at end of file diff --git a/agentverse/tasks/sde_team_2players_nolc/code_problem.json b/agentverse/tasks/sde_team_2players_nolc/code_problem.json new file mode 100644 index 000000000..540b78442 --- /dev/null +++ b/agentverse/tasks/sde_team_2players_nolc/code_problem.json @@ -0,0 +1,8 @@ +{ + "problem": "from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n", + 
"unit_tests": [ + "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0]) - 2.0/3.0) < 1e-6\n", + "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0]) - 1.0) < 1e-6\n", + "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0, 5.0]) - 6.0/5.0) < 1e-6\n\n" + ] +} \ No newline at end of file diff --git a/agentverse/tasks/sde_team_2players_nolc/config.yaml b/agentverse/tasks/sde_team_2players_nolc/config.yaml new file mode 100644 index 000000000..f8c55d757 --- /dev/null +++ b/agentverse/tasks/sde_team_2players_nolc/config.yaml @@ -0,0 +1,197 @@ +agents: +- agent_type: conversation + llm: + llm_type: gpt-3.5-turbo + max_tokens: 1024 + temperature: 0.1 + memory: + memory_type: sde_team + name: code_writer + prompt_template: 'You are working in a programming team to solve a python code implementation + problem. + + + ${role_description} + + + ${chat_history}' + receiver: + - code_writer + - code_reviewer + role_description: "You are a professional coding assistant. You will be given a\ + \ function signature and docstring (in ). You should copy the import\ + \ statement, the function signature first and then finish the function body. Before\ + \ writing any code, you should first think about the task and write down your\ + \ thoughts in . Then you can write your code in .\n\nYour submitted\ + \ code (in ) will be tested on a series of unit tests. You will\ + \ be given the feedback (in ) of the test, together with the\ + \ review of your code (in ) of a professional code reviewer. You\ + \ can use these feedbacks to refine your code. The refinement will take multiple\ + \ iterations. 
You can use the to record your thoughts during the refinement.\n\ + \nHere is the steps of the code writing and refinement process:\n1 - generate\ + \ \n2 - generate \n\nThe following is two examples on code writing\ + \ and refinement.\n\n[Example on code writing]:\n\n:\ndef is_palindrome(x:\ + \ int) -> bool:\n\"\"\" Given an integer x, return True if x is a palindrome,\ + \ and False otherwise.\nAn integer is a palindrome when it reads the same forward\ + \ and backward.\n>>> is_palindrome(121)\nTrue\n>>> is_palindrome(10)\nFalse\n\"\ + \"\"\n\n:\nI need to convert the integer to a string and then check if\ + \ the string is a palindrome.\n\n:\ndef is_palindrome(x: int) -> bool:\n\ + \ s = str(x)\n return s == s[::-1]\n\n[Example on refinement]:\n\n:\n\ + from typing import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\ + \"\"\" Given an array of integers nums and an integer target, return indices of\ + \ the two numbers such that they add up to target.\nYou may assume that each input\ + \ would have exactly one solution, and you may not use the same element twice.\ + \ \nThe answer should be in an ascending order.\n>>> two_sum([2,7,11,15], 9)\n\ + [0, 1]\n>>> two_sum([3,2,4], 6)\n[1, 2]\n\"\"\"\n\n:\nfrom typing\ + \ import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n \ + \ for i in range(len(nums)):\n for j in range(len(nums)):\n \ + \ if nums[i] + nums[j] == target:\n return [i, j]\n\n:\n{\"is_passing\": false, \"feedback\": \"Tested passed:\\nassert\ + \ two_sum([2, 7, 11, 15], 9) == [0, 1]\\nassert two_sum([3, 5, 2, 7], 9) == [2,\ + \ 3]\\n\\nTests failed:\\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0,\ + \ 0]\"}\n\n:\nThe previous code is not correct. It failed all the\ + \ tests. The second for loop should start from i + 1.\n\n:\nThe code\ + \ reviewer is right. 
I should start the second for loop from i + 1.\n\n:\n\ + from typing import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\ + \ for i in range(len(nums)):\n for j in range(i + 1, len(nums)):\n \ + \ if nums[i] + nums[j] == target:\n return [i, j]\n\n\ + [Start of new task]:\n\nNow please solve the following problem.\n\n\n:\n\ + from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float])\ + \ -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute\ + \ Deviation\n around the mean of this dataset.\n Mean Absolute Deviation\ + \ is the average absolute difference between each\n element and a centerpoint\ + \ (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0,\ + \ 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n" + verbose: true +- agent_type: conversation + llm: + llm_type: gpt-3.5-turbo + max_tokens: 1024 + temperature: 0.1 + memory: + memory_type: sde_team + name: code_reviewer + prompt_template: 'You are working in a programming team to solve a python code implementation + problem. + + + ${role_description} + + + ${chat_history}' + receiver: + - code_writer + role_description: "You are a professional code reviewer. You will be given a function\ + \ signature and docstring in . A code writer has submitted his completion\ + \ in .\nThe code has been executed on a series of unit tests.\ + \ The execution feedback is provided in . Your job is to write\ + \ a code review in to help the code writer improve his code.\n\n\ + The following is an examples.\n\n[Example]:\n\n:\nfrom typing import\ + \ List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\"\"\" Given\ + \ an array of integers nums and an integer target, return indices of the two numbers\ + \ such that they add up to target.\nYou may assume that each input would have\ + \ exactly one solution, and you may not use the same element twice. 
\nThe answer\ + \ should be in an ascending order.\n>>> two_sum([2,7,11,15], 9)\n[0, 1]\n>>> two_sum([3,2,4],\ + \ 6)\n[1, 2]\n\"\"\"\n\n:\nfrom typing import List\n\ndef two_sum(nums:\ + \ List[int], target: int) -> List[int]:\n for i in range(len(nums)):\n \ + \ for j in range(len(nums)):\n if nums[i] + nums[j] == target:\n\ + \ return [i, j]\n\n:\n{\"is_passing\": false,\ + \ \"feedback\": \"Tested passed:\\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\\\ + nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\\n\\nTests failed:\\nassert two_sum([3,\ + \ 2, 4], 6) == [1, 2] # output: [0, 0]\"}\n\n:\nThe previous code\ + \ is not correct. It failed all the tests. The second for loop should start from\ + \ i + 1.\n\n[Start of new task]:\n\nNow please review the following submitted\ + \ code.\n\n\n:\nfrom typing import List\n\n\ndef mean_absolute_deviation(numbers:\ + \ List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate\ + \ Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute\ + \ Deviation is the average absolute difference between each\n element and a\ + \ centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>>\ + \ mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n" + verbose: true +environment: + env_type: sde_team_given_tests + max_turns: 5 + rule: + describer: + type: basic + order: + type: sde_team_given_tests + selector: + type: sde_team_given_tests + updater: + type: sde_team + visibility: + type: all + task_name: HumanEval/0 + unit_tests: '[''assert abs(mean_absolute_deviation([1.0, 2.0, 3.0]) - 2.0/3.0) < + 1e-6\n'', ''assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0]) - 1.0) < + 1e-6\n'', ''assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0, 5.0]) - 6.0/5.0) + < 1e-6\n\n'']' +prompts: + code_reviewer_role_prompt: "You are a professional code reviewer. You will be given\ + \ a function signature and docstring in . 
A code writer has submitted\ + \ his completion in .\nThe code has been executed on a series\ + \ of unit tests. The execution feedback is provided in . Your\ + \ job is to write a code review in to help the code writer improve\ + \ his code.\n\nThe following is an examples.\n\n[Example]:\n\n:\nfrom\ + \ typing import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\ + \"\"\" Given an array of integers nums and an integer target, return indices of\ + \ the two numbers such that they add up to target.\nYou may assume that each input\ + \ would have exactly one solution, and you may not use the same element twice.\ + \ \nThe answer should be in an ascending order.\n>>> two_sum([2,7,11,15], 9)\n\ + [0, 1]\n>>> two_sum([3,2,4], 6)\n[1, 2]\n\"\"\"\n\n:\nfrom typing\ + \ import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n \ + \ for i in range(len(nums)):\n for j in range(len(nums)):\n \ + \ if nums[i] + nums[j] == target:\n return [i, j]\n\n:\n{\"is_passing\": false, \"feedback\": \"Tested passed:\\nassert\ + \ two_sum([2, 7, 11, 15], 9) == [0, 1]\\nassert two_sum([3, 5, 2, 7], 9) == [2,\ + \ 3]\\n\\nTests failed:\\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0,\ + \ 0]\"}\n\n:\nThe previous code is not correct. It failed all the\ + \ tests. The second for loop should start from i + 1.\n\n[Start of new task]:\n\ + \nNow please review the following submitted code.\n" + code_writer_role_prompt: "You are a professional coding assistant. You will be given\ + \ a function signature and docstring (in ). You should copy the import\ + \ statement, the function signature first and then finish the function body. Before\ + \ writing any code, you should first think about the task and write down your\ + \ thoughts in . Then you can write your code in .\n\nYour submitted\ + \ code (in ) will be tested on a series of unit tests. 
You will\ + \ be given the feedback (in ) of the test, together with the\ + \ review of your code (in ) of a professional code reviewer. You\ + \ can use these feedbacks to refine your code. The refinement will take multiple\ + \ iterations. You can use the to record your thoughts during the refinement.\n\ + \nHere is the steps of the code writing and refinement process:\n1 - generate\ + \ \n2 - generate \n\nThe following is two examples on code writing\ + \ and refinement.\n\n[Example on code writing]:\n\n:\ndef is_palindrome(x:\ + \ int) -> bool:\n\"\"\" Given an integer x, return True if x is a palindrome,\ + \ and False otherwise.\nAn integer is a palindrome when it reads the same forward\ + \ and backward.\n>>> is_palindrome(121)\nTrue\n>>> is_palindrome(10)\nFalse\n\"\ + \"\"\n\n:\nI need to convert the integer to a string and then check if\ + \ the string is a palindrome.\n\n:\ndef is_palindrome(x: int) -> bool:\n\ + \ s = str(x)\n return s == s[::-1]\n\n[Example on refinement]:\n\n:\n\ + from typing import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\ + \"\"\" Given an array of integers nums and an integer target, return indices of\ + \ the two numbers such that they add up to target.\nYou may assume that each input\ + \ would have exactly one solution, and you may not use the same element twice.\ + \ \nThe answer should be in an ascending order.\n>>> two_sum([2,7,11,15], 9)\n\ + [0, 1]\n>>> two_sum([3,2,4], 6)\n[1, 2]\n\"\"\"\n\n:\nfrom typing\ + \ import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n \ + \ for i in range(len(nums)):\n for j in range(len(nums)):\n \ + \ if nums[i] + nums[j] == target:\n return [i, j]\n\n:\n{\"is_passing\": false, \"feedback\": \"Tested passed:\\nassert\ + \ two_sum([2, 7, 11, 15], 9) == [0, 1]\\nassert two_sum([3, 5, 2, 7], 9) == [2,\ + \ 3]\\n\\nTests failed:\\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0,\ + \ 0]\"}\n\n:\nThe previous code is not correct. 
It failed all the\ + \ tests. The second for loop should start from i + 1.\n\n:\nThe code\ + \ reviewer is right. I should start the second for loop from i + 1.\n\n:\n\ + from typing import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\ + \ for i in range(len(nums)):\n for j in range(i + 1, len(nums)):\n \ + \ if nums[i] + nums[j] == target:\n return [i, j]\n\n\ + [Start of new task]:\n\nNow please solve the following problem.\n" + prompt: 'You are working in a programming team to solve a python code implementation + problem. + + + ${role_description} + + + ${chat_history}' diff --git a/agentverse/tasks/sde_team_2players_nolc/output_parser.py b/agentverse/tasks/sde_team_2players_nolc/output_parser.py new file mode 100644 index 000000000..ed07d7fc1 --- /dev/null +++ b/agentverse/tasks/sde_team_2players_nolc/output_parser.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +import re +from typing import Union + +from langchain.agents import AgentOutputParser + +# from langchain.schema import AgentAction, AgentFinish + +from agentverse.parser import OutputParserError, output_parser_registry, OutputParser +from agentverse.llms.base import LLMResult +from agentverse.utils import AgentAction, AgentFinish + + +@output_parser_registry.register("sde_team_2players_nolc") +class SdeTeamGivenTestsParser(OutputParser): + def parse(self, agent, env, output: LLMResult) -> Union[AgentAction, AgentFinish]: + return AgentFinish({"output": output.content}, output.content) diff --git a/agentverse/tasks/sde_team_2players_nolc/partial_config.yaml b/agentverse/tasks/sde_team_2players_nolc/partial_config.yaml new file mode 100644 index 000000000..84a3a9091 --- /dev/null +++ b/agentverse/tasks/sde_team_2players_nolc/partial_config.yaml @@ -0,0 +1,168 @@ + prompts: + prompt: &prompt |- + You are working in a programming team to solve a python code implementation problem. 
+ + ${role_description} + + ${chat_history} + + code_writer_role_prompt: &code_writer_role_prompt | + You are a professional coding assistant. You will be given a function signature and docstring (in ). You should copy the import statement, the function signature first and then finish the function body. Before writing any code, you should first think about the task and write down your thoughts in . Then you can write your code in . + + Your submitted code (in ) will be tested on a series of unit tests. You will be given the feedback (in ) of the test, together with the review of your code (in ) of a professional code reviewer. You can use these feedbacks to refine your code. The refinement will take multiple iterations. You can use the to record your thoughts during the refinement. + + Here is the steps of the code writing and refinement process: + 1 - generate + 2 - generate + + The following is two examples on code writing and refinement. + + [Example on code writing]: + + : + def is_palindrome(x: int) -> bool: + """ Given an integer x, return True if x is a palindrome, and False otherwise. + An integer is a palindrome when it reads the same forward and backward. + >>> is_palindrome(121) + True + >>> is_palindrome(10) + False + """ + + : + I need to convert the integer to a string and then check if the string is a palindrome. + + : + def is_palindrome(x: int) -> bool: + s = str(x) + return s == s[::-1] + + [Example on refinement]: + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + """ Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. + You may assume that each input would have exactly one solution, and you may not use the same element twice. + The answer should be in an ascending order. 
+ >>> two_sum([2,7,11,15], 9) + [0, 1] + >>> two_sum([3,2,4], 6) + [1, 2] + """ + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + for i in range(len(nums)): + for j in range(len(nums)): + if nums[i] + nums[j] == target: + return [i, j] + + : + {"is_passing": false, "feedback": "Tested passed:\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\n\nTests failed:\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0, 0]"} + + : + The previous code is not correct. It failed all the tests. The second for loop should start from i + 1. + + : + The code reviewer is right. I should start the second for loop from i + 1. + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + for i in range(len(nums)): + for j in range(i + 1, len(nums)): + if nums[i] + nums[j] == target: + return [i, j] + + [Start of new task]: + + Now please solve the following problem. + + code_reviewer_role_prompt: &code_reviewer_role_prompt | + You are a professional code reviewer. You will be given a function signature and docstring in . A code writer has submitted his completion in . + The code has been executed on a series of unit tests. The execution feedback is provided in . Your job is to write a code review in to help the code writer improve his code. + + The following is an examples. + + [Example]: + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + """ Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. + You may assume that each input would have exactly one solution, and you may not use the same element twice. + The answer should be in an ascending order. 
+ >>> two_sum([2,7,11,15], 9) + [0, 1] + >>> two_sum([3,2,4], 6) + [1, 2] + """ + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + for i in range(len(nums)): + for j in range(len(nums)): + if nums[i] + nums[j] == target: + return [i, j] + + : + {"is_passing": false, "feedback": "Tested passed:\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\n\nTests failed:\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0, 0]"} + + : + The previous code is not correct. It failed all the tests. The second for loop should start from i + 1. + + [Start of new task]: + + Now please review the following submitted code. + +environment: + env_type: sde_team_given_tests + max_turns: 5 + task_name: HumanEval/0 + unit_tests: None + rule: + order: + type: sde_team_given_tests + visibility: + type: all + selector: + type: sde_team_given_tests + updater: + type: sde_team + describer: + type: basic + +agents: + - agent_type: conversation + name: code_writer + role_description: *code_writer_role_prompt + memory: + memory_type: sde_team + prompt_template: *prompt + verbose: true + receiver: [code_writer, code_reviewer] + llm: + llm_type: gpt-3.5-turbo + temperature: 0.1 + max_tokens: 1024 + + - agent_type: conversation + name: code_reviewer + role_description: *code_reviewer_role_prompt + memory: + memory_type: sde_team + prompt_template: *prompt + verbose: true + receiver: [code_writer] + llm: + llm_type: gpt-3.5-turbo + temperature: 0.1 + max_tokens: 1024 \ No newline at end of file diff --git a/agentverse/tasks/sde_team_2players_nolc/readme.md b/agentverse/tasks/sde_team_2players_nolc/readme.md new file mode 100644 index 000000000..4c5968179 --- /dev/null +++ b/agentverse/tasks/sde_team_2players_nolc/readme.md @@ -0,0 +1,253 @@ +# A SDE team with two players + +In this task, LLMs working in a software development team to solve python code implementation problem. 
There are 2 agents in this task: code writer and code reviewer. + +### Roles + +#### *code writer* + +Code writer will write the code to satisfy the given requirement. The requirements are given in the \<problem\> part. The code writer first thinks about the task (the thoughts are written in the \<thoughts\> part) and then writes the relevant code in the \<code\> part. + +Code writer will modify the written code to debug it. The written code will be tested on a series of unit tests to get the feedback. Then the feedback (in the \<unit test feedback\> part) together with a professional review (in the \<code review\> part) will be given. Then code writer will leverage this information to refine the generated code. + +The refinement will take multiple iterations. + +#### *code reviewer* + +Code reviewer will write a professional review of the code written by the code writer. The code writer's code will be given in the \<submitted code\> part, the execution feedback of the unit tests will be given in the \<unit test feedback\> part, and the review will be written in the \<code review\> part. + +### How to use + +#### Write problem and tests in a json + +The code problem and relevant unit tests should be given in `agentverse/tasks/sde_team_2players_nolc/code_problem.json`. Here is an example. 
+ +```json +{ + "problem": "from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n", + "unit_tests": [ + "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0]) - 2.0/3.0) < 1e-6\n", + "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0]) - 1.0) < 1e-6\n", + "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0, 5.0]) - 6.0/5.0) < 1e-6\n\n" + ] +} +``` + +#### Build the configuration + +Run `agentverse/tasks/sde_team_2players_nolc/build_config.py` to generate `config.yaml`. + +```bash +python agentverse/tasks/sde_team_2players_nolc/build_config.py +``` + +#### Run the session + +After generating `config.yaml`, run the `main.py` to start the task. + +```python +from agentverse.agentverse import AgentVerse +from argparse import ArgumentParser + +parser = ArgumentParser() +parser.add_argument("--task", type=str, default="sde_team_2players_nolc") + +args = parser.parse_args() +agentverse = AgentVerse.from_task(args.task) +agentverse.run() +``` + +### A running case + +This part will show an example of the whole code writing progress. + + + +Here is an example problem and unit tests. + +Problem: +```python +from typing import List + + +def separate_paren_groups(paren_string: str) -> List[str]: + """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to + separate those group into separate strings and return the list of those. + Separate groups are balanced (each open brace is properly closed) and not nested within each other + Ignore any spaces in the input string. 
+ >>> separate_paren_groups('( ) (( )) (( )( ))') + ['()', '(())', '(()())'] + """ +``` + +Unit test: +```python +assert candidate('(()()) ((())) () ((())()())') == ['(()())', '((()))', '()', '((())()())'] +assert candidate('() (()) ((())) (((())))') == ['()', '(())', '((()))', '(((())))'] +assert candidate('(()(())((())))') == ['(()(())((())))'] +assert candidate('( ) (( )) (( )( ))') == ['()', '(())', '(()())'] +``` + +A typical running log is like +```bash +05/22/2023 15:36:18 - INFO - root - code_writer: : + +from typing import List + +def separate_paren_groups(paren_string: str) -> List[str]: + result = [] + open_count = 0 + start = 0 + for i, c in enumerate(paren_string): + if c == '(': + open_count += 1 + elif c == ')': + open_count -= 1 + if open_count == 0: + result.append(paren_string[start:i+1]) + start = i+1 + return result + + +: +{"is_passing": false, "feedback": "Tested passed:\nassert separate_paren_groups('(()(())((())))') == ['(()(())((())))']\n\nTests failed:\nassert separate_paren_groups('(()()) ((())) () ((())()())') == ['(()())', '((()))', '()', '((())()())'] # output: ['(()())', ' ((()))', ' ()', ' ((())()())']\nassert separate_paren_groups('() (()) ((())) (((())))') == ['()', '(())', '((()))', '(((())))'] # output: ['()', ' (())', ' ((()))', ' (((())))']\nassert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())'] # output: ['( )', ' (( ))', ' (( )( ))']"} + +05/22/2023 15:36:33 - INFO - root - code_reviewer: : + +from typing import List + +def separate_paren_groups(paren_string: str) -> List[str]: + result = [] + open_count = 0 + start = 0 + for i, c in enumerate(paren_string): + if c == '(': + open_count += 1 + elif c == ')': + open_count -= 1 + if open_count == 0: + result.append(paren_string[start:i+1]) + start = i+1 + return result + + +: +The code is correct in terms of logic and passes one of the test cases. However, it fails the other two test cases. 
The reason for the failure is that the code does not ignore spaces in the input string as mentioned in the problem statement. + +To fix this issue, we can modify the code to remove all spaces from the input string before processing it. We can do this by adding the following line at the beginning of the function: + +paren_string = paren_string.replace(" ", "") + +This will remove all spaces from the input string and the code should now pass all the test cases. + +05/22/2023 15:36:54 - INFO - root - code_writer: : + +from typing import List + +def separate_paren_groups(paren_string: str) -> List[str]: + paren_string = paren_string.replace(" ", "") + result = [] + open_count = 0 + start = 0 + for i, c in enumerate(paren_string): + if c == '(': + open_count += 1 + elif c == ')': + open_count -= 1 + if open_count == 0: + result.append(paren_string[start:i+1]) + start = i+1 + return result + + +: +{"is_passing": true, "feedback": "Tested passed:\nassert separate_paren_groups('(()()) ((())) () ((())()())') == ['(()())', '((()))', '()', '((())()())']\nassert separate_paren_groups('() (()) ((())) (((())))') == ['()', '(())', '((()))', '(((())))']\nassert separate_paren_groups('(()(())((())))') == ['(()(())((())))']\nassert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n\nTests failed:"} +``` diff --git a/agentverse/tasks/sde_team_3players_nolc/output_parser.py b/agentverse/tasks/sde_team_3players_nolc/output_parser.py index 9bb40fe86..031194ed6 100644 --- a/agentverse/tasks/sde_team_3players_nolc/output_parser.py +++ b/agentverse/tasks/sde_team_3players_nolc/output_parser.py @@ -14,5 +14,5 @@ @output_parser_registry.register("sde_team_3players_nolc") class SdeTeamParser(OutputParser): - def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: + def parse(self, agent, env, output: LLMResult) -> Union[AgentAction, AgentFinish]: return AgentFinish({"output": output.content}, output.content) diff --git a/imgs/prison/police.png 
b/imgs/prison/police.png old mode 100755 new mode 100644 diff --git a/main.py b/main.py deleted file mode 100644 index ce2f12d8d..000000000 --- a/main.py +++ /dev/null @@ -1,10 +0,0 @@ -from agentverse.agentverse import AgentVerse -from argparse import ArgumentParser - -parser = ArgumentParser() -# parser.add_argument("--task", type=str, default="nlp_classroom_9players") -parser.add_argument("--task", type=str, default="sde_team_3players_nolcda") - -args = parser.parse_args() -agentverse = AgentVerse.from_task(args.task) -agentverse.run() From ea7f38ade2cbce687e523eebc90f43f1fe96e282 Mon Sep 17 00:00:00 2001 From: tzw2698 Date: Tue, 23 May 2023 13:31:37 +0800 Subject: [PATCH 05/10] 2players --- agentverse/environments/__init__.py | 1 + .../environments/rules/order/__init__.py | 1 + .../rules/order/sde_team_given_tests.py | 27 ++ .../environments/rules/selector/__init__.py | 1 + .../rules/selector/sde_team_given_tests.py | 57 ++++ .../environments/sde_team_given_tests.py | 114 ++++++++ agentverse/llms/openai.py | 1 + agentverse/tasks/__init__.py | 1 + .../sde_team_2players_nolc/build_config.py | 20 ++ .../sde_team_2players_nolc/code_problem.json | 8 + .../tasks/sde_team_2players_nolc/config.yaml | 197 ++++++++++++++ .../sde_team_2players_nolc/output_parser.py | 18 ++ .../partial_config.yaml | 168 ++++++++++++ .../tasks/sde_team_2players_nolc/readme.md | 253 ++++++++++++++++++ .../sde_team_3players_nolc/output_parser.py | 2 +- imgs/prison/police.png | Bin main.py | 3 +- user_unit_tests.txt | 1 + 18 files changed, 871 insertions(+), 2 deletions(-) create mode 100644 agentverse/environments/rules/order/sde_team_given_tests.py create mode 100644 agentverse/environments/rules/selector/sde_team_given_tests.py create mode 100644 agentverse/environments/sde_team_given_tests.py create mode 100644 agentverse/tasks/sde_team_2players_nolc/build_config.py create mode 100644 agentverse/tasks/sde_team_2players_nolc/code_problem.json create mode 100644 
agentverse/tasks/sde_team_2players_nolc/config.yaml create mode 100644 agentverse/tasks/sde_team_2players_nolc/output_parser.py create mode 100644 agentverse/tasks/sde_team_2players_nolc/partial_config.yaml create mode 100644 agentverse/tasks/sde_team_2players_nolc/readme.md mode change 100755 => 100644 imgs/prison/police.png create mode 100644 user_unit_tests.txt diff --git a/agentverse/environments/__init__.py b/agentverse/environments/__init__.py index 8b5e1517c..594e9a024 100644 --- a/agentverse/environments/__init__.py +++ b/agentverse/environments/__init__.py @@ -7,3 +7,4 @@ from .basic import BasicEnvironment from .pokemon import PokemonEnvironment from .sde_team import SdeTeamEnvironment +from .sde_team_given_tests import SdeTeamGivenTestsEnvironment diff --git a/agentverse/environments/rules/order/__init__.py b/agentverse/environments/rules/order/__init__.py index 14b95c9fb..1c6ea9f0e 100644 --- a/agentverse/environments/rules/order/__init__.py +++ b/agentverse/environments/rules/order/__init__.py @@ -8,3 +8,4 @@ from .classroom import ClassroomOrder from .prisoner import PrisonerOrder from .sde_team import SdeTeamOrder +from .sde_team_given_tests import SdeTeamGivenTestsOrder diff --git a/agentverse/environments/rules/order/sde_team_given_tests.py b/agentverse/environments/rules/order/sde_team_given_tests.py new file mode 100644 index 000000000..02944f0f3 --- /dev/null +++ b/agentverse/environments/rules/order/sde_team_given_tests.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +import logging +import re +import random +from typing import TYPE_CHECKING, Any, List, Optional + +from . 
import order_registry as OrderRegistry
+from .base import BaseOrder
+
+if TYPE_CHECKING:
+    from agentverse.environments import BaseEnvironment
+
+
+@OrderRegistry.register("sde_team_given_tests")
+class SdeTeamGivenTestsOrder(BaseOrder):
+    """Turn order for code problem solving with given unit tests.
+
+    Agents 0 and 1 act alternately; ``next_agent_idx`` holds the index of the
+    agent that acts on the next call.
+    """
+
+    next_agent_idx: int = 0
+
+    def get_next_agent_idx(self, environment: BaseEnvironment) -> List[int]:
+        """Return the index of the agent acting now and pass the turn on.
+
+        Args:
+            environment: Not consulted; the schedule depends only on
+                ``next_agent_idx``.
+
+        Returns:
+            A one-element list holding 0 or 1.
+
+        Raises:
+            ValueError: If ``next_agent_idx`` is neither 0 nor 1.
+        """
+        current = self.next_agent_idx
+        if current not in (0, 1):
+            # Fail loudly instead of implicitly returning None, which would
+            # violate the declared List[int] return type.
+            raise ValueError(f"next_agent_idx must be 0 or 1, got {current!r}")
+        self.next_agent_idx = 1 - current
+        return [current]
\ No newline at end of file
diff --git a/agentverse/environments/rules/selector/__init__.py b/agentverse/environments/rules/selector/__init__.py index 56b520dbe..98a7716cd 100644 --- a/agentverse/environments/rules/selector/__init__.py +++ b/agentverse/environments/rules/selector/__init__.py @@ -6,3 +6,4 @@ from .basic import BasicSelector from .classroom import ClassroomSelector from .sde_team import SdeTeamSelector +from .sde_team_given_tests import SdeTeamGivenTestsSelector diff --git a/agentverse/environments/rules/selector/sde_team_given_tests.py b/agentverse/environments/rules/selector/sde_team_given_tests.py new file mode 100644 index 000000000..8683e67fb --- /dev/null +++ b/agentverse/environments/rules/selector/sde_team_given_tests.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, List + +from agentverse.message import Message + +from .
import selector_registry as SelectorRegistry
+from .base import BaseSelector
+
+import ast
+import json
+import re
+
+if TYPE_CHECKING:
+    from agentverse.environments import BaseEnvironment
+
+def extract(content: str, keyword: str):
+    """Return the lines of ``content`` that follow the first ``keyword`` line.
+
+    A line counts as a keyword line when its stripped form starts with
+    ``keyword``. Keyword lines themselves are never included; every returned
+    line is terminated with a newline. Returns "" when no keyword line exists.
+    """
+    kept = []
+    seen_keyword = False
+    for line in content.split('\n'):
+        if line.strip().startswith(keyword):
+            seen_keyword = True
+        elif seen_keyword:
+            kept.append(line + "\n")
+    return "".join(kept)
+
+
+@SelectorRegistry.register("sde_team_given_tests")
+class SdeTeamGivenTestsSelector(BaseSelector):
+    """Selector for the ``sde_team_given_tests`` task.
+
+    Rewrites the first message of the turn so that it carries the current code
+    (plus unit-test feedback for the writer, or the review text for the
+    reviewer) and sets ``rule_params["end_flag"]`` once the tests pass.
+    """
+
+    def select_message(self, environment: BaseEnvironment, messages: List[Message]) -> List[Message]:
+        """Annotate ``messages[0]`` in place and return ``messages``.
+
+        The branch is chosen by the sender of ``environment.last_messages[0]``:
+
+        * ``code_writer``: the code is extracted from the message, run against
+          ``environment.unit_tests`` and stored together with the feedback in
+          ``environment.rule_params``.
+        * ``code_reviewer``: the stored code is prepended to the review.
+
+        Messages from any other sender are returned untouched.
+        """
+        last_sender = environment.last_messages[0].sender
+        selected = messages
+
+        if last_sender == "code_writer":
+            # NOTE(review): the keyword and the section labels in the strings
+            # below look as if tag names were lost - confirm against the prompts
+            cur_code = extract(selected[0].content, ":")
+            environment.rule_params["code"] = cur_code
+
+            from .code_api import execute_unit_tests
+
+            # unit_tests holds the str() of a list of test strings; parse it as
+            # a literal rather than eval() so config text cannot run code here
+            unit_tests = ast.literal_eval(environment.unit_tests)
+            feedback = execute_unit_tests(cur_code, unit_tests)
+
+            environment.rule_params["feedback"] = feedback
+            selected[0].content = f":\n\n{cur_code}\n\n:\n{feedback}"
+        elif last_sender == "code_reviewer":
+            code_review = selected[0].content
+            cur_code = environment.rule_params["code"]
+            selected[0].content = f":\n\n{cur_code}\n\n{code_review}"
+            feedback = environment.rule_params["feedback"]
+        else:
+            return selected
+
+        # feedback is a JSON string with an "is_passing" entry
+        if json.loads(feedback)["is_passing"]:
+            environment.rule_params["end_flag"] = True
+
+        return selected
\ No newline at end of file
diff --git a/agentverse/environments/sde_team_given_tests.py b/agentverse/environments/sde_team_given_tests.py new file mode 100644 index 000000000..5d113f8a6 --- /dev/null +++ b/agentverse/environments/sde_team_given_tests.py @@ -0,0 +1,114 @@ +import asyncio +import logging +from typing import Any, Dict, List +from icecream import ic +import json + +from agentverse.agents.conversation_agent import BaseAgent +from
agentverse.environments.rules.base import Rule
+from agentverse.message import Message
+
+from . import env_registry as EnvironmentRegistry
+from .base import BaseEnvironment
+from agentverse.initialization import load_tools
+
+
+@EnvironmentRegistry.register("sde_team_given_tests")
+class SdeTeamGivenTestsEnvironment(BaseEnvironment):
+    """
+    An environment implementing the conversation logic used to craft code
+    that is checked against unit tests supplied with the task.
+
+    Args:
+        agents: List of agents
+        rule: Rule for the environment
+        max_turns: Maximum number of turns
+        cnt_turn: Current turn number
+        last_messages: Messages from last turn
+        rule_params: Variables set by the rule
+        task_name: Name of the task
+        unit_tests: Unit tests for the task, kept as a string
+    """
+
+    agents: List[BaseAgent]
+    rule: Rule
+    max_turns: int = 10
+    cnt_turn: int = 0
+    last_messages: List[Message] = []
+    rule_params: Dict = {}
+    task_name: str = "test"
+    unit_tests: str = ""
+
+    def __init__(self, rule, **kwargs):
+        """Build the ``Rule`` from its config mapping and initialise the flags.
+
+        Args:
+            rule: Mapping with optional ``order``, ``visibility``, ``selector``,
+                ``updater`` and ``describer`` entries; missing entries fall
+                back to the defaults below.
+            **kwargs: Forwarded to the base class initialiser.
+        """
+        rule_config = rule
+        order_config = rule_config.get("order", {"type": "sde_team_given_tests"})
+        visibility_config = rule_config.get("visibility", {"type": "base"})
+        selector_config = rule_config.get("selector", {"type": "sde_team_given_tests"})
+        updater_config = rule_config.get("updater", {"type": "sde_team"})
+        describer_config = rule_config.get("describer", {"type": "base"})
+        rule = Rule(
+            order_config,
+            visibility_config,
+            selector_config,
+            updater_config,
+            describer_config,
+        )
+        super().__init__(rule=rule, **kwargs)
+        self.rule_params["first_round"] = True
+        self.rule_params["end_flag"] = False
+
+    async def step(self) -> List[Message]:
+        """Run one step of the environment and return the selected messages"""
+
+        # Get the next agent index
+        agent_ids = self.rule.get_next_agent_idx(self)  # order
+
+        # Generate the next message; every agent is stepped with an empty
+        # environment description (the describer is not consulted here)
+        messages = await asyncio.gather(
+            *[self.agents[i].astep("") for i in agent_ids]
+        )
+
+        # Track the messages to get the role of the sender: this has to happen
+        # before selection, because the selector may read `last_messages`
+        self.last_messages = messages
+
+        # Some rules will select certain messages from all the messages
+        selected_messages = self.rule.select_message(self, messages)  # selector
+        self.last_messages = selected_messages
+        self.print_messages(selected_messages)
+
+        # Update the memory of the agents
+        self.rule.update_memory(self)  # updater: update memory
+
+        # Update the set of visible agents for each agent
+        self.rule.update_visible_agents(self)  # change receiver
+
+        self.cnt_turn += 1
+
+        return selected_messages
+
+    def print_messages(self, messages: List[Message]) -> None:
+        """Log every non-None message as ``sender: content`` at INFO level"""
+        for message in messages:
+            if message is not None:
+                logging.info(f"{message.sender}: {message.content}")
+
+    def reset(self) -> None:
+        """Reset the environment"""
+        self.cnt_turn = 0
+        # Restore the flags set in __init__; otherwise a session that ended
+        # with end_flag set would still report is_done() == True after reset
+        self.rule_params["first_round"] = True
+        self.rule_params["end_flag"] = False
+        self.rule.reset()
+        for agent in self.agents:
+            agent.reset()
+
+    def is_done(self) -> bool:
+        """Check if the environment is done
+
+        True once ``max_turns`` is reached or ``rule_params["end_flag"]`` is set.
+        """
+        if self.cnt_turn >= self.max_turns or self.rule_params["end_flag"]:
+            return True
+        return False
diff --git a/agentverse/llms/openai.py b/agentverse/llms/openai.py index e79b3c633..188230dc1 100644 --- a/agentverse/llms/openai.py +++ b/agentverse/llms/openai.py @@ -20,6 +20,7 @@ openai.proxy = os.environ.get("http_proxy") if openai.proxy is None: openai.proxy = os.environ.get("HTTP_PROXY") + openai.proxy="http://127.0.0.1:7890" if openai.api_key is None: logging.warning( "OpenAI API key is not set.
Please set the environment variable OPENAI_API_KEY" diff --git a/agentverse/tasks/__init__.py b/agentverse/tasks/__init__.py index 29ac6ef97..ef5c827c4 100644 --- a/agentverse/tasks/__init__.py +++ b/agentverse/tasks/__init__.py @@ -22,3 +22,4 @@ ) from .pokemon.output_parser import PokemonParser from .sde_team_3players_nolc.output_parser import SdeTeamParser +from .sde_team_2players_nolc.output_parser import SdeTeamGivenTestsParser
diff --git a/agentverse/tasks/sde_team_2players_nolc/build_config.py b/agentverse/tasks/sde_team_2players_nolc/build_config.py
new file mode 100644
index 000000000..1fea37cae
--- /dev/null
+++ b/agentverse/tasks/sde_team_2players_nolc/build_config.py
@@ -0,0 +1,20 @@
+"""Generate ``config.yaml`` from ``partial_config.yaml`` and ``code_problem.json``.
+
+All three files are resolved relative to this script's own directory, so the
+script can be run from any working directory.
+"""
+import json
+from pathlib import Path
+
+import yaml
+
+TASK_DIR = Path(__file__).resolve().parent
+config_path = TASK_DIR / "partial_config.yaml"
+
+with open(TASK_DIR / "code_problem.json", "r", encoding="utf-8") as f:
+    code_problem = json.load(f)
+problem_string = "\n\n:\n" + code_problem["problem"]
+# the tests are stored in the config as the string form of the list
+unit_tests = str(code_problem["unit_tests"])
+
+print(problem_string)
+print(unit_tests)
+
+with open(config_path, encoding="utf-8") as f:
+    task_config = yaml.safe_load(f)
+
+# every agent gets the problem statement appended to its role description
+for agent_configs in task_config["agents"]:
+    agent_configs["role_description"] += problem_string
+task_config["environment"]["unit_tests"] = unit_tests
+
+with open(TASK_DIR / "config.yaml", "w", encoding="utf-8") as f:
+    yaml.safe_dump(task_config, f)
\ No newline at end of file
diff --git a/agentverse/tasks/sde_team_2players_nolc/code_problem.json b/agentverse/tasks/sde_team_2players_nolc/code_problem.json new file mode 100644 index 000000000..540b78442 --- /dev/null +++ b/agentverse/tasks/sde_team_2players_nolc/code_problem.json @@ -0,0 +1,8 @@ +{ + "problem": "from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n", +
"unit_tests": [ + "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0]) - 2.0/3.0) < 1e-6\n", + "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0]) - 1.0) < 1e-6\n", + "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0, 5.0]) - 6.0/5.0) < 1e-6\n\n" + ] +} \ No newline at end of file diff --git a/agentverse/tasks/sde_team_2players_nolc/config.yaml b/agentverse/tasks/sde_team_2players_nolc/config.yaml new file mode 100644 index 000000000..f8c55d757 --- /dev/null +++ b/agentverse/tasks/sde_team_2players_nolc/config.yaml @@ -0,0 +1,197 @@ +agents: +- agent_type: conversation + llm: + llm_type: gpt-3.5-turbo + max_tokens: 1024 + temperature: 0.1 + memory: + memory_type: sde_team + name: code_writer + prompt_template: 'You are working in a programming team to solve a python code implementation + problem. + + + ${role_description} + + + ${chat_history}' + receiver: + - code_writer + - code_reviewer + role_description: "You are a professional coding assistant. You will be given a\ + \ function signature and docstring (in ). You should copy the import\ + \ statement, the function signature first and then finish the function body. Before\ + \ writing any code, you should first think about the task and write down your\ + \ thoughts in . Then you can write your code in .\n\nYour submitted\ + \ code (in ) will be tested on a series of unit tests. You will\ + \ be given the feedback (in ) of the test, together with the\ + \ review of your code (in ) of a professional code reviewer. You\ + \ can use these feedbacks to refine your code. The refinement will take multiple\ + \ iterations. 
You can use the to record your thoughts during the refinement.\n\ + \nHere is the steps of the code writing and refinement process:\n1 - generate\ + \ \n2 - generate \n\nThe following is two examples on code writing\ + \ and refinement.\n\n[Example on code writing]:\n\n:\ndef is_palindrome(x:\ + \ int) -> bool:\n\"\"\" Given an integer x, return True if x is a palindrome,\ + \ and False otherwise.\nAn integer is a palindrome when it reads the same forward\ + \ and backward.\n>>> is_palindrome(121)\nTrue\n>>> is_palindrome(10)\nFalse\n\"\ + \"\"\n\n:\nI need to convert the integer to a string and then check if\ + \ the string is a palindrome.\n\n:\ndef is_palindrome(x: int) -> bool:\n\ + \ s = str(x)\n return s == s[::-1]\n\n[Example on refinement]:\n\n:\n\ + from typing import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\ + \"\"\" Given an array of integers nums and an integer target, return indices of\ + \ the two numbers such that they add up to target.\nYou may assume that each input\ + \ would have exactly one solution, and you may not use the same element twice.\ + \ \nThe answer should be in an ascending order.\n>>> two_sum([2,7,11,15], 9)\n\ + [0, 1]\n>>> two_sum([3,2,4], 6)\n[1, 2]\n\"\"\"\n\n:\nfrom typing\ + \ import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n \ + \ for i in range(len(nums)):\n for j in range(len(nums)):\n \ + \ if nums[i] + nums[j] == target:\n return [i, j]\n\n:\n{\"is_passing\": false, \"feedback\": \"Tested passed:\\nassert\ + \ two_sum([2, 7, 11, 15], 9) == [0, 1]\\nassert two_sum([3, 5, 2, 7], 9) == [2,\ + \ 3]\\n\\nTests failed:\\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0,\ + \ 0]\"}\n\n:\nThe previous code is not correct. It failed all the\ + \ tests. The second for loop should start from i + 1.\n\n:\nThe code\ + \ reviewer is right. 
I should start the second for loop from i + 1.\n\n:\n\ + from typing import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\ + \ for i in range(len(nums)):\n for j in range(i + 1, len(nums)):\n \ + \ if nums[i] + nums[j] == target:\n return [i, j]\n\n\ + [Start of new task]:\n\nNow please solve the following problem.\n\n\n:\n\ + from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float])\ + \ -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute\ + \ Deviation\n around the mean of this dataset.\n Mean Absolute Deviation\ + \ is the average absolute difference between each\n element and a centerpoint\ + \ (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0,\ + \ 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n" + verbose: true +- agent_type: conversation + llm: + llm_type: gpt-3.5-turbo + max_tokens: 1024 + temperature: 0.1 + memory: + memory_type: sde_team + name: code_reviewer + prompt_template: 'You are working in a programming team to solve a python code implementation + problem. + + + ${role_description} + + + ${chat_history}' + receiver: + - code_writer + role_description: "You are a professional code reviewer. You will be given a function\ + \ signature and docstring in . A code writer has submitted his completion\ + \ in .\nThe code has been executed on a series of unit tests.\ + \ The execution feedback is provided in . Your job is to write\ + \ a code review in to help the code writer improve his code.\n\n\ + The following is an examples.\n\n[Example]:\n\n:\nfrom typing import\ + \ List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\"\"\" Given\ + \ an array of integers nums and an integer target, return indices of the two numbers\ + \ such that they add up to target.\nYou may assume that each input would have\ + \ exactly one solution, and you may not use the same element twice. 
\nThe answer\ + \ should be in an ascending order.\n>>> two_sum([2,7,11,15], 9)\n[0, 1]\n>>> two_sum([3,2,4],\ + \ 6)\n[1, 2]\n\"\"\"\n\n:\nfrom typing import List\n\ndef two_sum(nums:\ + \ List[int], target: int) -> List[int]:\n for i in range(len(nums)):\n \ + \ for j in range(len(nums)):\n if nums[i] + nums[j] == target:\n\ + \ return [i, j]\n\n:\n{\"is_passing\": false,\ + \ \"feedback\": \"Tested passed:\\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\\\ + nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\\n\\nTests failed:\\nassert two_sum([3,\ + \ 2, 4], 6) == [1, 2] # output: [0, 0]\"}\n\n:\nThe previous code\ + \ is not correct. It failed all the tests. The second for loop should start from\ + \ i + 1.\n\n[Start of new task]:\n\nNow please review the following submitted\ + \ code.\n\n\n:\nfrom typing import List\n\n\ndef mean_absolute_deviation(numbers:\ + \ List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate\ + \ Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute\ + \ Deviation is the average absolute difference between each\n element and a\ + \ centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>>\ + \ mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n" + verbose: true +environment: + env_type: sde_team_given_tests + max_turns: 5 + rule: + describer: + type: basic + order: + type: sde_team_given_tests + selector: + type: sde_team_given_tests + updater: + type: sde_team + visibility: + type: all + task_name: HumanEval/0 + unit_tests: '[''assert abs(mean_absolute_deviation([1.0, 2.0, 3.0]) - 2.0/3.0) < + 1e-6\n'', ''assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0]) - 1.0) < + 1e-6\n'', ''assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0, 5.0]) - 6.0/5.0) + < 1e-6\n\n'']' +prompts: + code_reviewer_role_prompt: "You are a professional code reviewer. You will be given\ + \ a function signature and docstring in . 
A code writer has submitted\ + \ his completion in .\nThe code has been executed on a series\ + \ of unit tests. The execution feedback is provided in . Your\ + \ job is to write a code review in to help the code writer improve\ + \ his code.\n\nThe following is an examples.\n\n[Example]:\n\n:\nfrom\ + \ typing import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\ + \"\"\" Given an array of integers nums and an integer target, return indices of\ + \ the two numbers such that they add up to target.\nYou may assume that each input\ + \ would have exactly one solution, and you may not use the same element twice.\ + \ \nThe answer should be in an ascending order.\n>>> two_sum([2,7,11,15], 9)\n\ + [0, 1]\n>>> two_sum([3,2,4], 6)\n[1, 2]\n\"\"\"\n\n:\nfrom typing\ + \ import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n \ + \ for i in range(len(nums)):\n for j in range(len(nums)):\n \ + \ if nums[i] + nums[j] == target:\n return [i, j]\n\n:\n{\"is_passing\": false, \"feedback\": \"Tested passed:\\nassert\ + \ two_sum([2, 7, 11, 15], 9) == [0, 1]\\nassert two_sum([3, 5, 2, 7], 9) == [2,\ + \ 3]\\n\\nTests failed:\\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0,\ + \ 0]\"}\n\n:\nThe previous code is not correct. It failed all the\ + \ tests. The second for loop should start from i + 1.\n\n[Start of new task]:\n\ + \nNow please review the following submitted code.\n" + code_writer_role_prompt: "You are a professional coding assistant. You will be given\ + \ a function signature and docstring (in ). You should copy the import\ + \ statement, the function signature first and then finish the function body. Before\ + \ writing any code, you should first think about the task and write down your\ + \ thoughts in . Then you can write your code in .\n\nYour submitted\ + \ code (in ) will be tested on a series of unit tests. 
You will\ + \ be given the feedback (in ) of the test, together with the\ + \ review of your code (in ) of a professional code reviewer. You\ + \ can use these feedbacks to refine your code. The refinement will take multiple\ + \ iterations. You can use the to record your thoughts during the refinement.\n\ + \nHere is the steps of the code writing and refinement process:\n1 - generate\ + \ \n2 - generate \n\nThe following is two examples on code writing\ + \ and refinement.\n\n[Example on code writing]:\n\n:\ndef is_palindrome(x:\ + \ int) -> bool:\n\"\"\" Given an integer x, return True if x is a palindrome,\ + \ and False otherwise.\nAn integer is a palindrome when it reads the same forward\ + \ and backward.\n>>> is_palindrome(121)\nTrue\n>>> is_palindrome(10)\nFalse\n\"\ + \"\"\n\n:\nI need to convert the integer to a string and then check if\ + \ the string is a palindrome.\n\n:\ndef is_palindrome(x: int) -> bool:\n\ + \ s = str(x)\n return s == s[::-1]\n\n[Example on refinement]:\n\n:\n\ + from typing import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\ + \"\"\" Given an array of integers nums and an integer target, return indices of\ + \ the two numbers such that they add up to target.\nYou may assume that each input\ + \ would have exactly one solution, and you may not use the same element twice.\ + \ \nThe answer should be in an ascending order.\n>>> two_sum([2,7,11,15], 9)\n\ + [0, 1]\n>>> two_sum([3,2,4], 6)\n[1, 2]\n\"\"\"\n\n:\nfrom typing\ + \ import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n \ + \ for i in range(len(nums)):\n for j in range(len(nums)):\n \ + \ if nums[i] + nums[j] == target:\n return [i, j]\n\n:\n{\"is_passing\": false, \"feedback\": \"Tested passed:\\nassert\ + \ two_sum([2, 7, 11, 15], 9) == [0, 1]\\nassert two_sum([3, 5, 2, 7], 9) == [2,\ + \ 3]\\n\\nTests failed:\\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0,\ + \ 0]\"}\n\n:\nThe previous code is not correct. 
It failed all the\ + \ tests. The second for loop should start from i + 1.\n\n:\nThe code\ + \ reviewer is right. I should start the second for loop from i + 1.\n\n:\n\ + from typing import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\ + \ for i in range(len(nums)):\n for j in range(i + 1, len(nums)):\n \ + \ if nums[i] + nums[j] == target:\n return [i, j]\n\n\ + [Start of new task]:\n\nNow please solve the following problem.\n" + prompt: 'You are working in a programming team to solve a python code implementation + problem. + + + ${role_description} + + + ${chat_history}' diff --git a/agentverse/tasks/sde_team_2players_nolc/output_parser.py b/agentverse/tasks/sde_team_2players_nolc/output_parser.py new file mode 100644 index 000000000..ed07d7fc1 --- /dev/null +++ b/agentverse/tasks/sde_team_2players_nolc/output_parser.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +import re +from typing import Union + +from langchain.agents import AgentOutputParser + +# from langchain.schema import AgentAction, AgentFinish + +from agentverse.parser import OutputParserError, output_parser_registry, OutputParser +from agentverse.llms.base import LLMResult +from agentverse.utils import AgentAction, AgentFinish + + +@output_parser_registry.register("sde_team_2players_nolc") +class SdeTeamGivenTestsParser(OutputParser): + def parse(self, agent, env, output: LLMResult) -> Union[AgentAction, AgentFinish]: + return AgentFinish({"output": output.content}, output.content) diff --git a/agentverse/tasks/sde_team_2players_nolc/partial_config.yaml b/agentverse/tasks/sde_team_2players_nolc/partial_config.yaml new file mode 100644 index 000000000..84a3a9091 --- /dev/null +++ b/agentverse/tasks/sde_team_2players_nolc/partial_config.yaml @@ -0,0 +1,168 @@ + prompts: + prompt: &prompt |- + You are working in a programming team to solve a python code implementation problem. 
+ + ${role_description} + + ${chat_history} + + code_writer_role_prompt: &code_writer_role_prompt | + You are a professional coding assistant. You will be given a function signature and docstring (in ). You should copy the import statement, the function signature first and then finish the function body. Before writing any code, you should first think about the task and write down your thoughts in . Then you can write your code in . + + Your submitted code (in ) will be tested on a series of unit tests. You will be given the feedback (in ) of the test, together with the review of your code (in ) of a professional code reviewer. You can use these feedbacks to refine your code. The refinement will take multiple iterations. You can use the to record your thoughts during the refinement. + + Here is the steps of the code writing and refinement process: + 1 - generate + 2 - generate + + The following is two examples on code writing and refinement. + + [Example on code writing]: + + : + def is_palindrome(x: int) -> bool: + """ Given an integer x, return True if x is a palindrome, and False otherwise. + An integer is a palindrome when it reads the same forward and backward. + >>> is_palindrome(121) + True + >>> is_palindrome(10) + False + """ + + : + I need to convert the integer to a string and then check if the string is a palindrome. + + : + def is_palindrome(x: int) -> bool: + s = str(x) + return s == s[::-1] + + [Example on refinement]: + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + """ Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. + You may assume that each input would have exactly one solution, and you may not use the same element twice. + The answer should be in an ascending order. 
+ >>> two_sum([2,7,11,15], 9) + [0, 1] + >>> two_sum([3,2,4], 6) + [1, 2] + """ + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + for i in range(len(nums)): + for j in range(len(nums)): + if nums[i] + nums[j] == target: + return [i, j] + + : + {"is_passing": false, "feedback": "Tested passed:\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\n\nTests failed:\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0, 0]"} + + : + The previous code is not correct. It failed all the tests. The second for loop should start from i + 1. + + : + The code reviewer is right. I should start the second for loop from i + 1. + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + for i in range(len(nums)): + for j in range(i + 1, len(nums)): + if nums[i] + nums[j] == target: + return [i, j] + + [Start of new task]: + + Now please solve the following problem. + + code_reviewer_role_prompt: &code_reviewer_role_prompt | + You are a professional code reviewer. You will be given a function signature and docstring in . A code writer has submitted his completion in . + The code has been executed on a series of unit tests. The execution feedback is provided in . Your job is to write a code review in to help the code writer improve his code. + + The following is an examples. + + [Example]: + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + """ Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. + You may assume that each input would have exactly one solution, and you may not use the same element twice. + The answer should be in an ascending order. 
+ >>> two_sum([2,7,11,15], 9) + [0, 1] + >>> two_sum([3,2,4], 6) + [1, 2] + """ + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + for i in range(len(nums)): + for j in range(len(nums)): + if nums[i] + nums[j] == target: + return [i, j] + + : + {"is_passing": false, "feedback": "Tested passed:\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\n\nTests failed:\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0, 0]"} + + : + The previous code is not correct. It failed all the tests. The second for loop should start from i + 1. + + [Start of new task]: + + Now please review the following submitted code. + +environment: + env_type: sde_team_given_tests + max_turns: 5 + task_name: HumanEval/0 + unit_tests: None + rule: + order: + type: sde_team_given_tests + visibility: + type: all + selector: + type: sde_team_given_tests + updater: + type: sde_team + describer: + type: basic + +agents: + - agent_type: conversation + name: code_writer + role_description: *code_writer_role_prompt + memory: + memory_type: sde_team + prompt_template: *prompt + verbose: true + receiver: [code_writer, code_reviewer] + llm: + llm_type: gpt-3.5-turbo + temperature: 0.1 + max_tokens: 1024 + + - agent_type: conversation + name: code_reviewer + role_description: *code_reviewer_role_prompt + memory: + memory_type: sde_team + prompt_template: *prompt + verbose: true + receiver: [code_writer] + llm: + llm_type: gpt-3.5-turbo + temperature: 0.1 + max_tokens: 1024 \ No newline at end of file diff --git a/agentverse/tasks/sde_team_2players_nolc/readme.md b/agentverse/tasks/sde_team_2players_nolc/readme.md new file mode 100644 index 000000000..4c5968179 --- /dev/null +++ b/agentverse/tasks/sde_team_2players_nolc/readme.md @@ -0,0 +1,253 @@ +# A SDE team with two players + +In this task, LLMs working in a software development team to solve python code implementation problem. 
There are 2 agents in this task: code writer and code reviewer. + +### Roles + +#### *code writer* + +Code writer will write the code to satisfy the given requirement. The requirements were given in \ parts. The code writer first think about the task (the thoughts written in \ parts) and then write relevant code in \ part. + +Code writer will modify the written code to debug. The written code will be tested on a series of unit tests to get the feedback. Then the feedback (in \ part) together with a proffesional review (in \ review part) will be given. Then code writer will leverage this information to refine the generated code. + +The refinement will take multiple iterations. + +#### *code reviewer* + +Code reviewer will write professional review for the code writer written code. The code writer written code will be given in \ part, the execution feedback of unit tests will be given in \ part and the review will be written in \ part. + +### How to use + +#### Write problem and tests in a json + +The code problem and relevant unit tests should be given in `agentverse/tasks/sde_team_2players_nolc/code_problem.json`. Here is an example. 
+ +```json +{ + "problem": "from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n", + "unit_tests": [ + "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0]) - 2.0/3.0) < 1e-6\n", + "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0]) - 1.0) < 1e-6\n", + "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0, 5.0]) - 6.0/5.0) < 1e-6\n\n" + ] +} +``` + +#### Build the configuration + +Run `agentverse/tasks/sde_team_2players_nolc/build_config.py` to generate `config.yaml`. + +```bash +python agentverse/tasks/sde_team_2players_nolc/build_config.py +``` + +#### Run the session + +After generating `config.yaml`, run the `main.py` to start the task. + +```python +from agentverse.agentverse import AgentVerse +from argparse import ArgumentParser + +parser = ArgumentParser() +parser.add_argument("--task", type=str, default="sde_team_2players_nolc") + +args = parser.parse_args() +agentverse = AgentVerse.from_task(args.task) +agentverse.run() +``` + +### A running case + +This part will show an example of the whole code writing progress. + + + +Here is an example problem and unit tests. + +Problem: +```python +from typing import List + + +def separate_paren_groups(paren_string: str) -> List[str]: + """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to + separate those group into separate strings and return the list of those. + Separate groups are balanced (each open brace is properly closed) and not nested within each other + Ignore any spaces in the input string. 
+ >>> separate_paren_groups('( ) (( )) (( )( ))') + ['()', '(())', '(()())'] + """ +``` + +Unit test: +```python +assert candidate('(()()) ((())) () ((())()())') == ['(()())', '((()))', '()', '((())()())'] +assert candidate('() (()) ((())) (((())))') == ['()', '(())', '((()))', '(((())))'] +assert candidate('(()(())((())))') == ['(()(())((())))'] +assert candidate('( ) (( )) (( )( ))') == ['()', '(())', '(()())'] +``` + +A typical running log is like +```bash +05/22/2023 15:36:18 - INFO - root - code_writer: : + +from typing import List + +def separate_paren_groups(paren_string: str) -> List[str]: + result = [] + open_count = 0 + start = 0 + for i, c in enumerate(paren_string): + if c == '(': + open_count += 1 + elif c == ')': + open_count -= 1 + if open_count == 0: + result.append(paren_string[start:i+1]) + start = i+1 + return result + + +: +{"is_passing": false, "feedback": "Tested passed:\nassert separate_paren_groups('(()(())((())))') == ['(()(())((())))']\n\nTests failed:\nassert separate_paren_groups('(()()) ((())) () ((())()())') == ['(()())', '((()))', '()', '((())()())'] # output: ['(()())', ' ((()))', ' ()', ' ((())()())']\nassert separate_paren_groups('() (()) ((())) (((())))') == ['()', '(())', '((()))', '(((())))'] # output: ['()', ' (())', ' ((()))', ' (((())))']\nassert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())'] # output: ['( )', ' (( ))', ' (( )( ))']"} + +05/22/2023 15:36:33 - INFO - root - code_reviewer: : + +from typing import List + +def separate_paren_groups(paren_string: str) -> List[str]: + result = [] + open_count = 0 + start = 0 + for i, c in enumerate(paren_string): + if c == '(': + open_count += 1 + elif c == ')': + open_count -= 1 + if open_count == 0: + result.append(paren_string[start:i+1]) + start = i+1 + return result + + +: +The code is correct in terms of logic and passes one of the test cases. However, it fails the other two test cases. 
The reason for the failure is that the code does not ignore spaces in the input string as mentioned in the problem statement. + +To fix this issue, we can modify the code to remove all spaces from the input string before processing it. We can do this by adding the following line at the beginning of the function: + +paren_string = paren_string.replace(" ", "") + +This will remove all spaces from the input string and the code should now pass all the test cases. + +05/22/2023 15:36:54 - INFO - root - code_writer: : + +from typing import List + +def separate_paren_groups(paren_string: str) -> List[str]: + paren_string = paren_string.replace(" ", "") + result = [] + open_count = 0 + start = 0 + for i, c in enumerate(paren_string): + if c == '(': + open_count += 1 + elif c == ')': + open_count -= 1 + if open_count == 0: + result.append(paren_string[start:i+1]) + start = i+1 + return result + + +: +{"is_passing": true, "feedback": "Tested passed:\nassert separate_paren_groups('(()()) ((())) () ((())()())') == ['(()())', '((()))', '()', '((())()())']\nassert separate_paren_groups('() (()) ((())) (((())))') == ['()', '(())', '((()))', '(((())))']\nassert separate_paren_groups('(()(())((())))') == ['(()(())((())))']\nassert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n\nTests failed:"} +``` diff --git a/agentverse/tasks/sde_team_3players_nolc/output_parser.py b/agentverse/tasks/sde_team_3players_nolc/output_parser.py index 9bb40fe86..031194ed6 100644 --- a/agentverse/tasks/sde_team_3players_nolc/output_parser.py +++ b/agentverse/tasks/sde_team_3players_nolc/output_parser.py @@ -14,5 +14,5 @@ @output_parser_registry.register("sde_team_3players_nolc") class SdeTeamParser(OutputParser): - def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: + def parse(self, agent, env, output: LLMResult) -> Union[AgentAction, AgentFinish]: return AgentFinish({"output": output.content}, output.content) diff --git a/imgs/prison/police.png 
b/imgs/prison/police.png old mode 100755 new mode 100644 diff --git a/main.py b/main.py index ce2f12d8d..c48577a1a 100644 --- a/main.py +++ b/main.py @@ -3,7 +3,8 @@ parser = ArgumentParser() # parser.add_argument("--task", type=str, default="nlp_classroom_9players") -parser.add_argument("--task", type=str, default="sde_team_3players_nolcda") +# parser.add_argument("--task", type=str, default="sde_team_3players_nolc") +parser.add_argument("--task", type=str, default="sde_team_2players_nolc") args = parser.parse_args() agentverse = AgentVerse.from_task(args.task) diff --git a/user_unit_tests.txt b/user_unit_tests.txt new file mode 100644 index 000000000..cbd5443e3 --- /dev/null +++ b/user_unit_tests.txt @@ -0,0 +1 @@ +assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\nassert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\nassert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\nassert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\nassert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\nassert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\nassert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False \ No newline at end of file From 24a92423c5ef9924139ab43123e0a0166e2b9a59 Mon Sep 17 00:00:00 2001 From: tzw2698 Date: Tue, 23 May 2023 13:36:31 +0800 Subject: [PATCH 06/10] 2players --- user_unit_tests.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 user_unit_tests.txt diff --git a/user_unit_tests.txt b/user_unit_tests.txt deleted file mode 100644 index cbd5443e3..000000000 --- a/user_unit_tests.txt +++ /dev/null @@ -1 +0,0 @@ -assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\nassert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\nassert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\nassert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\nassert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\nassert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\nassert candidate([1.1, 2.2, 3.1, 
4.1, 5.1], 0.5) == False \ No newline at end of file From 49c1d68f1a4fa3dcd987d64a3a26bfd3ca24374d Mon Sep 17 00:00:00 2001 From: Bowen Li Date: Fri, 26 May 2023 13:51:48 +0800 Subject: [PATCH 07/10] Update py environment apis --- .../environments/rules/selector/code_api.py | 39 ++++++++----------- .../partial_config.yaml | 11 ++++-- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/agentverse/environments/rules/selector/code_api.py b/agentverse/environments/rules/selector/code_api.py index b75e07f6f..a134b649b 100644 --- a/agentverse/environments/rules/selector/code_api.py +++ b/agentverse/environments/rules/selector/code_api.py @@ -4,6 +4,7 @@ import json import astunparse import concurrent.futures +import traceback def get_call_str(assert_statement: str) -> str: @@ -13,11 +14,14 @@ def get_call_str(assert_statement: str) -> str: def get_output(func: str, assert_statement: str) -> str: try: func_call = get_call_str(assert_statement) - exec(func, globals()) - output = eval(func_call) - return output - except Exception as e: - return str(e) + try: + exec(func, globals()) + output = eval(func_call) + return output + except Exception as e: + return str(e) + except: + return "get_call_str error" def worker(code, globals=None, locals=None): old_stdout = sys.stdout @@ -35,7 +39,8 @@ def worker(code, globals=None, locals=None): stdout = redirected_output.getvalue() return stdout, globals, locals except Exception as e: - return f"Error: {e}", globals, locals + trace_str = traceback.format_exc() + return f"Error: {trace_str}", globals, locals finally: sys.stdout = old_stdout # restore the original stdout @@ -73,30 +78,20 @@ def execute_unit_tests(func_impl: str, tests: str) -> str: elif output.startswith("Error: "): # print(output) func_output = get_output(func_impl, tests[i]) + if func_output == "get_call_str error": + func_output = output failed_tests += [f"{tests[i]} # output: {func_output}"] is_passing = False else: success_tests += [tests[i]] - feedback 
= "Tested passed:" + feedback = "Tested passed:\n\n" for test in success_tests: - feedback += f"\n{test}" - feedback += "\n\nTests failed:" + feedback += f"{test}\n\n" + feedback += "Tests failed:\n\n" for test in failed_tests: - feedback += f"\n{test}" + feedback += f"{test}\n\n" return json.dumps({"is_passing": is_passing, "feedback": feedback}) -if __name__ == "__main__": - code = """ -from typing import List - -def two_sum(nums: List[int], target: int) -> List[int]: - for i in range(len(nums)): - for j in range(len(nums)): - if nums[i] + nums[j] == target: - return [i, j] -""" - tests = ["assert two_sum([2, 7, 11, 15], 9) == [0, 1]", "assert two_sum([3, 2, 4], 6) == [1, 2]", "assert two_sum([3, 5, 2, 7], 9) == [2, 3]"] - print(execute_unit_tests(code, tests)) \ No newline at end of file diff --git a/agentverse/tasks/sde_team_2players_nolc/partial_config.yaml b/agentverse/tasks/sde_team_2players_nolc/partial_config.yaml index 84a3a9091..885afb3b4 100644 --- a/agentverse/tasks/sde_team_2players_nolc/partial_config.yaml +++ b/agentverse/tasks/sde_team_2players_nolc/partial_config.yaml @@ -1,4 +1,4 @@ - prompts: +prompts: prompt: &prompt |- You are working in a programming team to solve a python code implementation problem. @@ -11,6 +11,8 @@ Your submitted code (in ) will be tested on a series of unit tests. You will be given the feedback (in ) of the test, together with the review of your code (in ) of a professional code reviewer. You can use these feedbacks to refine your code. The refinement will take multiple iterations. You can use the to record your thoughts during the refinement. + The unit tests have been prepared. DO NOT generate unit tests! 
+ Here is the steps of the code writing and refinement process: 1 - generate 2 - generate @@ -62,7 +64,7 @@ return [i, j] : - {"is_passing": false, "feedback": "Tested passed:\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\n\nTests failed:\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0, 0]"} + {"is_passing": false, "feedback": "Tested passed:\n\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\n\nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\n\nTests failed:\n\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0, 0]"} : The previous code is not correct. It failed all the tests. The second for loop should start from i + 1. @@ -81,7 +83,7 @@ [Start of new task]: - Now please solve the following problem. + Now please solve the following problem. DO NOT generate unit tests. code_reviewer_role_prompt: &code_reviewer_role_prompt | You are a professional code reviewer. You will be given a function signature and docstring in . A code writer has submitted his completion in . @@ -114,7 +116,7 @@ return [i, j] : - {"is_passing": false, "feedback": "Tested passed:\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\n\nTests failed:\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0, 0]"} + {"is_passing": false, "feedback": "Tested passed:\n\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\n\nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\n\nTests failed:\n\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0, 0]"} : The previous code is not correct. It failed all the tests. The second for loop should start from i + 1. 
@@ -127,6 +129,7 @@ environment: env_type: sde_team_given_tests max_turns: 5 task_name: HumanEval/0 + # experiment_name: 2player_temperature-0.0 unit_tests: None rule: order: From 6d38bb2eb8c2540a41afb0fe68019c243b30f7fa Mon Sep 17 00:00:00 2001 From: Bowen Li Date: Fri, 26 May 2023 13:56:58 +0800 Subject: [PATCH 08/10] Update main.py --- main.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 main.py diff --git a/main.py b/main.py new file mode 100644 index 000000000..5231529ab --- /dev/null +++ b/main.py @@ -0,0 +1,11 @@ +from agentverse.agentverse import AgentVerse +from argparse import ArgumentParser + +parser = ArgumentParser() +# parser.add_argument("--task", type=str, default="nlp_classroom_9players") +# parser.add_argument("--task", type=str, default="sde_team_3players_nolc") +parser.add_argument("--task", type=str, default="sde_team_2players_nolc") + +args = parser.parse_args() +agentverse = AgentVerse.from_task(args.task) +agentverse.run() \ No newline at end of file From c380ee6451862124ba2020557b28a831c9855b15 Mon Sep 17 00:00:00 2001 From: Bowen Li Date: Fri, 26 May 2023 15:26:04 +0800 Subject: [PATCH 09/10] Support folder in tasks and update sde_team --- .../environments/sde_team_given_tests.py | 20 +- agentverse/initialization.py | 6 + agentverse/tasks/__init__.py | 4 +- agentverse/tasks/sde_team/readme.md | 170 ++++++++++++ .../sde_team_2players_nolc/build_config.py | 0 .../sde_team_2players_nolc/code_problem.json | 9 + .../sde_team_2players_nolc/config.yaml | 202 ++++++++++++++ .../sde_team_2players_nolc/output_parser.py | 2 +- .../partial_config.yaml | 0 .../sde_team_3players_nolc/config.yaml | 0 .../sde_team_3players_nolc/output_parser.py | 2 +- .../sde_team_2players_nolc/code_problem.json | 8 - .../tasks/sde_team_2players_nolc/config.yaml | 197 -------------- .../tasks/sde_team_2players_nolc/readme.md | 253 ------------------ .../tasks/sde_team_3players_nolc/readme.md | 12 - main.py | 2 +- 16 files changed, 409 
insertions(+), 478 deletions(-) create mode 100644 agentverse/tasks/sde_team/readme.md rename agentverse/tasks/{ => sde_team}/sde_team_2players_nolc/build_config.py (100%) create mode 100644 agentverse/tasks/sde_team/sde_team_2players_nolc/code_problem.json create mode 100644 agentverse/tasks/sde_team/sde_team_2players_nolc/config.yaml rename agentverse/tasks/{ => sde_team}/sde_team_2players_nolc/output_parser.py (89%) rename agentverse/tasks/{ => sde_team}/sde_team_2players_nolc/partial_config.yaml (100%) rename agentverse/tasks/{ => sde_team}/sde_team_3players_nolc/config.yaml (100%) rename agentverse/tasks/{ => sde_team}/sde_team_3players_nolc/output_parser.py (89%) delete mode 100644 agentverse/tasks/sde_team_2players_nolc/code_problem.json delete mode 100644 agentverse/tasks/sde_team_2players_nolc/config.yaml delete mode 100644 agentverse/tasks/sde_team_2players_nolc/readme.md delete mode 100644 agentverse/tasks/sde_team_3players_nolc/readme.md diff --git a/agentverse/environments/sde_team_given_tests.py b/agentverse/environments/sde_team_given_tests.py index 5d113f8a6..b983dc988 100644 --- a/agentverse/environments/sde_team_given_tests.py +++ b/agentverse/environments/sde_team_given_tests.py @@ -33,8 +33,10 @@ class SdeTeamGivenTestsEnvironment(BaseEnvironment): cnt_turn: int = 0 last_messages: List[Message] = [] rule_params: Dict = {} - task_name: str = "test" unit_tests: str = "" + # # variables for experiment + # task_name: str = "test" + # experiment_name: str = "" def __init__(self, rule, **kwargs): rule_config = rule @@ -54,6 +56,15 @@ def __init__(self, rule, **kwargs): self.rule_params["first_round"] = True self.rule_params["end_flag"] = False + # # Set up logging for experiment + # filename = self.task_name.replace("/", "_") + # import os + # import os.path + # if not os.path.exists(f"human_eval_experiments/{self.experiment_name}/log"): + # os.makedirs(f"human_eval_experiments/{self.experiment_name}/log") + # file_handler = 
logging.FileHandler(f"human_eval_experiments/{self.experiment_name}/log/{filename}.txt") + # logging.getLogger().addHandler(file_handler) + async def step(self) -> List[Message]: """Run one step of the environment""" @@ -105,10 +116,13 @@ def reset(self) -> None: def is_done(self) -> bool: """Check if the environment is done""" if self.cnt_turn >= self.max_turns or self.rule_params["end_flag"]: - # with open("record_human_eval.txt", "a") as f: + # # Write to file for experiment + # with open(f"human_eval_experiments/{self.experiment_name}/record_human_eval_prediction.jsonl", "a") as f: # wd = dict() # wd['task_id'] = self.task_name # wd['code'] = self.rule_params['code'] - # f.write(json.dumps(wd)) + # # print(wd) + # f.write(json.dumps(wd) + "\n") + # logging.getLogger().handlers.pop() return True return False diff --git a/agentverse/initialization.py b/agentverse/initialization.py index 38764c80a..30c9e8948 100644 --- a/agentverse/initialization.py +++ b/agentverse/initialization.py @@ -70,6 +70,12 @@ def prepare_task_config(task): and task != "__pycache__" ): all_tasks.append(task) + for subtask in os.listdir(os.path.join(all_task_dir, task)): + if ( + os.path.isdir(os.path.join(all_task_dir, task, subtask)) + and subtask != "__pycache__" + ): + all_tasks.append(f"{task}/{subtask}") raise ValueError(f"Task {task} not found. 
Available tasks: {all_tasks}") if not os.path.exists(config_path): raise ValueError( diff --git a/agentverse/tasks/__init__.py b/agentverse/tasks/__init__.py index ef5c827c4..feaffa443 100644 --- a/agentverse/tasks/__init__.py +++ b/agentverse/tasks/__init__.py @@ -21,5 +21,5 @@ NlpClassroom3PlayersWithtoolNolcParser, ) from .pokemon.output_parser import PokemonParser -from .sde_team_3players_nolc.output_parser import SdeTeamParser -from .sde_team_2players_nolc.output_parser import SdeTeamGivenTestsParser +from .sde_team.sde_team_3players_nolc.output_parser import SdeTeamParser +from .sde_team.sde_team_2players_nolc.output_parser import SdeTeamGivenTestsParser diff --git a/agentverse/tasks/sde_team/readme.md b/agentverse/tasks/sde_team/readme.md new file mode 100644 index 000000000..4ceca230d --- /dev/null +++ b/agentverse/tasks/sde_team/readme.md @@ -0,0 +1,170 @@ +# SDE team 🧑🏾‍💻👩‍💻🧑🏻‍💻 + +In this task, LLMs work as a software development team to solve code implementation problem. We have simulated two scenarios *sde_team/sde_team_2players_nolc* and *sde_team/sde_team_3players_nolc*. + +The performance on [HumanEval](https://github.com/openai/human-eval) is shown below. + +| Methods | Pass@1 HumanEval | +|---------------------------------|-----------| +| Codex (175B)* | 0.47 | +|     + CodeT* | 0.658 | +| PaLM Coder (540B)* | 0.36 | +| GPT-4* | 0.67 | +| ChatGPT (gpt-3.5-turbo)* | 0.573 | +|     + Self-collaboration* | 0.744 | +| Our *sde_team/sde_team_2players_nolc* (gpt-3.5-turbo) | **0.799** | + +*: Results are from [Self-collaboration](https://arxiv.org/abs/2304.07590). The methods in the table all employed the provided unit tests. + +Our *sde_team/sde_team_2players_nolc* shares the similar spirit as Self-collaboration at the moment. We are working to introduce more features in this repo! 
 + + +## *sde_team/sde_team_2players_nolc* + +In this case, we are simulating a code generation problem in which a Python function body must be generated from a given function signature, docstring, and unit tests. The details are elaborated below. + +### Roles + +Detailed role descriptions and prompts can be found in `config.yaml`. + +#### *code writer* + +The code writer will write the code to satisfy the given requirement. The requirement is given in the \<problem\> field of the prompt. The code writer first thinks about the task (the thoughts are written in \<thoughts\>) and then writes the code in \<code\>. + +The submitted code will be tested automatically on a series of unit tests. Then the feedback (in \<unit test feedback\>) together with a professional code review (in \<code review\>) will be returned. The code writer will then use this information to refine the previously submitted code. The refinement will take multiple iterations. + +#### *code reviewer* + +The code reviewer will write a professional review of the submitted code. The submitted code will be given in \<submitted code\>, the execution feedback of the unit tests will be given in \<unit test feedback\>, and the review will be composed in \<code review\>. + +### How to run the simulation + +#### Provide problem and unit tests + +The code problem and unit tests should be given in `agentverse/tasks/sde_team/sde_team_2players_nolc/code_problem.json`. Here is an example. + +```json +{ + "problem": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. 
Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", + "unit_tests": [ + "assert separate_paren_groups('(()()) ((())) () ((())()())') == ['(()())', '((()))', '()', '((())()())']", + "assert separate_paren_groups('() (()) ((())) (((())))') == ['()', '(())', '((()))', '(((())))']", + "assert separate_paren_groups('(()(())((())))') == ['(()(())((())))']", + "assert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())']" + ] +} +``` + +#### Build the configuration file + +Run `agentverse/tasks/sde_team/sde_team_2players_nolc/build_config.py` to generate `config.yaml`. + +```bash +cd agentverse/tasks/sde_team/sde_team_2players_nolc/ +python build_config.py +``` + +#### Run the session + +After generating `config.yaml`, run the `main.py` to start the task. + +```python +from agentverse.agentverse import AgentVerse +from argparse import ArgumentParser + +parser = ArgumentParser() +parser.add_argument("--task", type=str, default="sde_team/sde_team_2players_nolc") + +args = parser.parse_args() +agentverse = AgentVerse.from_task(args.task) +agentverse.run() +``` + +### A running case + +A typical running log of the previous problem is shown below. 
+ +```bash +05/22/2023 15:36:18 - INFO - root - code_writer: : + +from typing import List + +def separate_paren_groups(paren_string: str) -> List[str]: + result = [] + open_count = 0 + start = 0 + for i, c in enumerate(paren_string): + if c == '(': + open_count += 1 + elif c == ')': + open_count -= 1 + if open_count == 0: + result.append(paren_string[start:i+1]) + start = i+1 + return result + + +: +{"is_passing": false, "feedback": "Tested passed:\nassert separate_paren_groups('(()(())((())))') == ['(()(())((())))']\n\nTests failed:\nassert separate_paren_groups('(()()) ((())) () ((())()())') == ['(()())', '((()))', '()', '((())()())'] # output: ['(()())', ' ((()))', ' ()', ' ((())()())']\nassert separate_paren_groups('() (()) ((())) (((())))') == ['()', '(())', '((()))', '(((())))'] # output: ['()', ' (())', ' ((()))', ' (((())))']\nassert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())'] # output: ['( )', ' (( ))', ' (( )( ))']"} + +05/22/2023 15:36:33 - INFO - root - code_reviewer: : + +from typing import List + +def separate_paren_groups(paren_string: str) -> List[str]: + result = [] + open_count = 0 + start = 0 + for i, c in enumerate(paren_string): + if c == '(': + open_count += 1 + elif c == ')': + open_count -= 1 + if open_count == 0: + result.append(paren_string[start:i+1]) + start = i+1 + return result + + +: +The code is correct in terms of logic and passes one of the test cases. However, it fails the other two test cases. The reason for the failure is that the code does not ignore spaces in the input string as mentioned in the problem statement. + +To fix this issue, we can modify the code to remove all spaces from the input string before processing it. We can do this by adding the following line at the beginning of the function: + +paren_string = paren_string.replace(" ", "") + +This will remove all spaces from the input string and the code should now pass all the test cases. 
+ +05/22/2023 15:36:54 - INFO - root - code_writer: : + +from typing import List + +def separate_paren_groups(paren_string: str) -> List[str]: + paren_string = paren_string.replace(" ", "") + result = [] + open_count = 0 + start = 0 + for i, c in enumerate(paren_string): + if c == '(': + open_count += 1 + elif c == ')': + open_count -= 1 + if open_count == 0: + result.append(paren_string[start:i+1]) + start = i+1 + return result + + +: +{"is_passing": true, "feedback": "Tested passed:\nassert separate_paren_groups('(()()) ((())) () ((())()())') == ['(()())', '((()))', '()', '((())()())']\nassert separate_paren_groups('() (()) ((())) (((())))') == ['()', '(())', '((()))', '(((())))']\nassert separate_paren_groups('(()(())((())))') == ['(()(())((())))']\nassert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n\nTests failed:"} +``` + +## *sde_team/sde_team_3players_nolc* + +Different from *sde_team/sde_team_2players_nolc*, we additionally introduce a role to automatically generate unit tests. + +- *unit test generator*: generate a series of unit test cases for the coding problem. + +### Stay tuned + +The generated unit tests are not always perfect, as they may not be correct. We plan to incorporate tools to raise the correctness of the generated cases. 
\ No newline at end of file diff --git a/agentverse/tasks/sde_team_2players_nolc/build_config.py b/agentverse/tasks/sde_team/sde_team_2players_nolc/build_config.py similarity index 100% rename from agentverse/tasks/sde_team_2players_nolc/build_config.py rename to agentverse/tasks/sde_team/sde_team_2players_nolc/build_config.py diff --git a/agentverse/tasks/sde_team/sde_team_2players_nolc/code_problem.json b/agentverse/tasks/sde_team/sde_team_2players_nolc/code_problem.json new file mode 100644 index 000000000..23f444b00 --- /dev/null +++ b/agentverse/tasks/sde_team/sde_team_2players_nolc/code_problem.json @@ -0,0 +1,9 @@ +{ + "problem": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", + "unit_tests": [ + "assert separate_paren_groups('(()()) ((())) () ((())()())') == ['(()())', '((()))', '()', '((())()())']", + "assert separate_paren_groups('() (()) ((())) (((())))') == ['()', '(())', '((()))', '(((())))']", + "assert separate_paren_groups('(()(())((())))') == ['(()(())((())))']", + "assert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())']" + ] +} \ No newline at end of file diff --git a/agentverse/tasks/sde_team/sde_team_2players_nolc/config.yaml b/agentverse/tasks/sde_team/sde_team_2players_nolc/config.yaml new file mode 100644 index 000000000..4b616cf60 --- /dev/null +++ b/agentverse/tasks/sde_team/sde_team_2players_nolc/config.yaml @@ -0,0 +1,202 @@ +agents: +- agent_type: conversation + llm: + llm_type: gpt-3.5-turbo + max_tokens: 1024 + temperature: 0.1 + memory: + memory_type: sde_team + 
name: code_writer + prompt_template: 'You are working in a programming team to solve a python code implementation + problem. + + + ${role_description} + + + ${chat_history}' + receiver: + - code_writer + - code_reviewer + role_description: "You are a professional coding assistant. You will be given a\ + \ function signature and docstring (in ). You should copy the import\ + \ statement, the function signature first and then finish the function body. Before\ + \ writing any code, you should first think about the task and write down your\ + \ thoughts in . Then you can write your code in .\n\nYour submitted\ + \ code (in ) will be tested on a series of unit tests. You will\ + \ be given the feedback (in ) of the test, together with the\ + \ review of your code (in ) of a professional code reviewer. You\ + \ can use these feedbacks to refine your code. The refinement will take multiple\ + \ iterations. You can use the to record your thoughts during the refinement.\n\ + \nThe unit tests have been prepared. 
DO NOT generate unit tests!\n\nHere is the\ + \ steps of the code writing and refinement process:\n1 - generate \n\ + 2 - generate \n\nThe following is two examples on code writing and refinement.\n\ + \n[Example on code writing]:\n\n:\ndef is_palindrome(x: int) -> bool:\n\ + \"\"\" Given an integer x, return True if x is a palindrome, and False otherwise.\n\ + An integer is a palindrome when it reads the same forward and backward.\n>>> is_palindrome(121)\n\ + True\n>>> is_palindrome(10)\nFalse\n\"\"\"\n\n:\nI need to convert the\ + \ integer to a string and then check if the string is a palindrome.\n\n:\n\ + def is_palindrome(x: int) -> bool:\n s = str(x)\n return s == s[::-1]\n\n\ + [Example on refinement]:\n\n:\nfrom typing import List\n\ndef two_sum(nums:\ + \ List[int], target: int) -> List[int]:\n\"\"\" Given an array of integers nums\ + \ and an integer target, return indices of the two numbers such that they add\ + \ up to target.\nYou may assume that each input would have exactly one solution,\ + \ and you may not use the same element twice. \nThe answer should be in an ascending\ + \ order.\n>>> two_sum([2,7,11,15], 9)\n[0, 1]\n>>> two_sum([3,2,4], 6)\n[1, 2]\n\ + \"\"\"\n\n:\nfrom typing import List\n\ndef two_sum(nums: List[int],\ + \ target: int) -> List[int]:\n for i in range(len(nums)):\n for j in\ + \ range(len(nums)):\n if nums[i] + nums[j] == target:\n \ + \ return [i, j]\n\n:\n{\"is_passing\": false, \"feedback\"\ + : \"Tested passed:\\n\\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\\n\\nassert\ + \ two_sum([3, 5, 2, 7], 9) == [2, 3]\\n\\nTests failed:\\n\\nassert two_sum([3,\ + \ 2, 4], 6) == [1, 2] # output: [0, 0]\"}\n\n:\nThe previous code\ + \ is not correct. It failed all the tests. The second for loop should start from\ + \ i + 1.\n\n:\nThe code reviewer is right. 
I should start the second\ + \ for loop from i + 1.\n\n:\nfrom typing import List\n\ndef two_sum(nums:\ + \ List[int], target: int) -> List[int]:\n for i in range(len(nums)):\n \ + \ for j in range(i + 1, len(nums)):\n if nums[i] + nums[j] == target:\n\ + \ return [i, j]\n\n[Start of new task]:\n\nNow please solve the\ + \ following problem. DO NOT generate unit tests.\n\n\n:\nfrom typing\ + \ import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n\ + \ \"\"\" Input to this function is a string containing multiple groups of nested\ + \ parentheses. Your goal is to\n separate those group into separate strings\ + \ and return the list of those.\n Separate groups are balanced (each open brace\ + \ is properly closed) and not nested within each other\n Ignore any spaces\ + \ in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n\ + \ ['()', '(())', '(()())']\n \"\"\"\n" + verbose: true +- agent_type: conversation + llm: + llm_type: gpt-3.5-turbo + max_tokens: 1024 + temperature: 0.1 + memory: + memory_type: sde_team + name: code_reviewer + prompt_template: 'You are working in a programming team to solve a python code implementation + problem. + + + ${role_description} + + + ${chat_history}' + receiver: + - code_writer + role_description: "You are a professional code reviewer. You will be given a function\ + \ signature and docstring in . A code writer has submitted his completion\ + \ in .\nThe code has been executed on a series of unit tests.\ + \ The execution feedback is provided in . 
Your job is to write\ + \ a code review in to help the code writer improve his code.\n\n\ + The following is an examples.\n\n[Example]:\n\n:\nfrom typing import\ + \ List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\"\"\" Given\ + \ an array of integers nums and an integer target, return indices of the two numbers\ + \ such that they add up to target.\nYou may assume that each input would have\ + \ exactly one solution, and you may not use the same element twice. \nThe answer\ + \ should be in an ascending order.\n>>> two_sum([2,7,11,15], 9)\n[0, 1]\n>>> two_sum([3,2,4],\ + \ 6)\n[1, 2]\n\"\"\"\n\n:\nfrom typing import List\n\ndef two_sum(nums:\ + \ List[int], target: int) -> List[int]:\n for i in range(len(nums)):\n \ + \ for j in range(len(nums)):\n if nums[i] + nums[j] == target:\n\ + \ return [i, j]\n\n:\n{\"is_passing\": false,\ + \ \"feedback\": \"Tested passed:\\n\\nassert two_sum([2, 7, 11, 15], 9) == [0,\ + \ 1]\\n\\nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\\n\\nTests failed:\\n\\nassert\ + \ two_sum([3, 2, 4], 6) == [1, 2] # output: [0, 0]\"}\n\n:\nThe previous\ + \ code is not correct. It failed all the tests. The second for loop should start\ + \ from i + 1.\n\n[Start of new task]:\n\nNow please review the following submitted\ + \ code.\n\n\n:\nfrom typing import List\n\n\ndef separate_paren_groups(paren_string:\ + \ str) -> List[str]:\n \"\"\" Input to this function is a string containing\ + \ multiple groups of nested parentheses. 
Your goal is to\n separate those group\ + \ into separate strings and return the list of those.\n Separate groups are\ + \ balanced (each open brace is properly closed) and not nested within each other\n\ + \ Ignore any spaces in the input string.\n >>> separate_paren_groups('(\ + \ ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n" + verbose: true +environment: + env_type: sde_team_given_tests + max_turns: 5 + rule: + describer: + type: basic + order: + type: sde_team_given_tests + selector: + type: sde_team_given_tests + updater: + type: sde_team + visibility: + type: all + task_name: HumanEval/0 + unit_tests: '["assert separate_paren_groups(''(()()) ((())) () ((())()())'') == + [''(()())'', ''((()))'', ''()'', ''((())()())'']", "assert separate_paren_groups(''() + (()) ((())) (((())))'') == [''()'', ''(())'', ''((()))'', ''(((())))'']", "assert + separate_paren_groups(''(()(())((())))'') == [''(()(())((())))'']", "assert separate_paren_groups(''( + ) (( )) (( )( ))'') == [''()'', ''(())'', ''(()())'']"]' +prompts: + code_reviewer_role_prompt: "You are a professional code reviewer. You will be given\ + \ a function signature and docstring in . A code writer has submitted\ + \ his completion in .\nThe code has been executed on a series\ + \ of unit tests. The execution feedback is provided in . 
Your\ + \ job is to write a code review in to help the code writer improve\ + \ his code.\n\nThe following is an examples.\n\n[Example]:\n\n:\nfrom\ + \ typing import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\ + \"\"\" Given an array of integers nums and an integer target, return indices of\ + \ the two numbers such that they add up to target.\nYou may assume that each input\ + \ would have exactly one solution, and you may not use the same element twice.\ + \ \nThe answer should be in an ascending order.\n>>> two_sum([2,7,11,15], 9)\n\ + [0, 1]\n>>> two_sum([3,2,4], 6)\n[1, 2]\n\"\"\"\n\n:\nfrom typing\ + \ import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n \ + \ for i in range(len(nums)):\n for j in range(len(nums)):\n \ + \ if nums[i] + nums[j] == target:\n return [i, j]\n\n:\n{\"is_passing\": false, \"feedback\": \"Tested passed:\\n\\nassert\ + \ two_sum([2, 7, 11, 15], 9) == [0, 1]\\n\\nassert two_sum([3, 5, 2, 7], 9) ==\ + \ [2, 3]\\n\\nTests failed:\\n\\nassert two_sum([3, 2, 4], 6) == [1, 2] # output:\ + \ [0, 0]\"}\n\n:\nThe previous code is not correct. It failed all\ + \ the tests. The second for loop should start from i + 1.\n\n[Start of new task]:\n\ + \nNow please review the following submitted code.\n" + code_writer_role_prompt: "You are a professional coding assistant. You will be given\ + \ a function signature and docstring (in ). You should copy the import\ + \ statement, the function signature first and then finish the function body. Before\ + \ writing any code, you should first think about the task and write down your\ + \ thoughts in . Then you can write your code in .\n\nYour submitted\ + \ code (in ) will be tested on a series of unit tests. You will\ + \ be given the feedback (in ) of the test, together with the\ + \ review of your code (in ) of a professional code reviewer. You\ + \ can use these feedbacks to refine your code. The refinement will take multiple\ + \ iterations. 
You can use the to record your thoughts during the refinement.\n\ + \nThe unit tests have been prepared. DO NOT generate unit tests!\n\nHere is the\ + \ steps of the code writing and refinement process:\n1 - generate \n\ + 2 - generate \n\nThe following is two examples on code writing and refinement.\n\ + \n[Example on code writing]:\n\n:\ndef is_palindrome(x: int) -> bool:\n\ + \"\"\" Given an integer x, return True if x is a palindrome, and False otherwise.\n\ + An integer is a palindrome when it reads the same forward and backward.\n>>> is_palindrome(121)\n\ + True\n>>> is_palindrome(10)\nFalse\n\"\"\"\n\n:\nI need to convert the\ + \ integer to a string and then check if the string is a palindrome.\n\n:\n\ + def is_palindrome(x: int) -> bool:\n s = str(x)\n return s == s[::-1]\n\n\ + [Example on refinement]:\n\n:\nfrom typing import List\n\ndef two_sum(nums:\ + \ List[int], target: int) -> List[int]:\n\"\"\" Given an array of integers nums\ + \ and an integer target, return indices of the two numbers such that they add\ + \ up to target.\nYou may assume that each input would have exactly one solution,\ + \ and you may not use the same element twice. \nThe answer should be in an ascending\ + \ order.\n>>> two_sum([2,7,11,15], 9)\n[0, 1]\n>>> two_sum([3,2,4], 6)\n[1, 2]\n\ + \"\"\"\n\n:\nfrom typing import List\n\ndef two_sum(nums: List[int],\ + \ target: int) -> List[int]:\n for i in range(len(nums)):\n for j in\ + \ range(len(nums)):\n if nums[i] + nums[j] == target:\n \ + \ return [i, j]\n\n:\n{\"is_passing\": false, \"feedback\"\ + : \"Tested passed:\\n\\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\\n\\nassert\ + \ two_sum([3, 5, 2, 7], 9) == [2, 3]\\n\\nTests failed:\\n\\nassert two_sum([3,\ + \ 2, 4], 6) == [1, 2] # output: [0, 0]\"}\n\n:\nThe previous code\ + \ is not correct. It failed all the tests. The second for loop should start from\ + \ i + 1.\n\n:\nThe code reviewer is right. 
I should start the second\ + \ for loop from i + 1.\n\n:\nfrom typing import List\n\ndef two_sum(nums:\ + \ List[int], target: int) -> List[int]:\n for i in range(len(nums)):\n \ + \ for j in range(i + 1, len(nums)):\n if nums[i] + nums[j] == target:\n\ + \ return [i, j]\n\n[Start of new task]:\n\nNow please solve the\ + \ following problem. DO NOT generate unit tests.\n" + prompt: 'You are working in a programming team to solve a python code implementation + problem. + + + ${role_description} + + + ${chat_history}' diff --git a/agentverse/tasks/sde_team_2players_nolc/output_parser.py b/agentverse/tasks/sde_team/sde_team_2players_nolc/output_parser.py similarity index 89% rename from agentverse/tasks/sde_team_2players_nolc/output_parser.py rename to agentverse/tasks/sde_team/sde_team_2players_nolc/output_parser.py index ed07d7fc1..3e3243c6c 100644 --- a/agentverse/tasks/sde_team_2players_nolc/output_parser.py +++ b/agentverse/tasks/sde_team/sde_team_2players_nolc/output_parser.py @@ -12,7 +12,7 @@ from agentverse.utils import AgentAction, AgentFinish -@output_parser_registry.register("sde_team_2players_nolc") +@output_parser_registry.register("sde_team/sde_team_2players_nolc") class SdeTeamGivenTestsParser(OutputParser): def parse(self, agent, env, output: LLMResult) -> Union[AgentAction, AgentFinish]: return AgentFinish({"output": output.content}, output.content) diff --git a/agentverse/tasks/sde_team_2players_nolc/partial_config.yaml b/agentverse/tasks/sde_team/sde_team_2players_nolc/partial_config.yaml similarity index 100% rename from agentverse/tasks/sde_team_2players_nolc/partial_config.yaml rename to agentverse/tasks/sde_team/sde_team_2players_nolc/partial_config.yaml diff --git a/agentverse/tasks/sde_team_3players_nolc/config.yaml b/agentverse/tasks/sde_team/sde_team_3players_nolc/config.yaml similarity index 100% rename from agentverse/tasks/sde_team_3players_nolc/config.yaml rename to agentverse/tasks/sde_team/sde_team_3players_nolc/config.yaml diff --git 
a/agentverse/tasks/sde_team_3players_nolc/output_parser.py b/agentverse/tasks/sde_team/sde_team_3players_nolc/output_parser.py similarity index 89% rename from agentverse/tasks/sde_team_3players_nolc/output_parser.py rename to agentverse/tasks/sde_team/sde_team_3players_nolc/output_parser.py index 031194ed6..53376d0d8 100644 --- a/agentverse/tasks/sde_team_3players_nolc/output_parser.py +++ b/agentverse/tasks/sde_team/sde_team_3players_nolc/output_parser.py @@ -12,7 +12,7 @@ from agentverse.utils import AgentAction, AgentFinish -@output_parser_registry.register("sde_team_3players_nolc") +@output_parser_registry.register("sde_team/sde_team_3players_nolc") class SdeTeamParser(OutputParser): def parse(self, agent, env, output: LLMResult) -> Union[AgentAction, AgentFinish]: return AgentFinish({"output": output.content}, output.content) diff --git a/agentverse/tasks/sde_team_2players_nolc/code_problem.json b/agentverse/tasks/sde_team_2players_nolc/code_problem.json deleted file mode 100644 index 540b78442..000000000 --- a/agentverse/tasks/sde_team_2players_nolc/code_problem.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "problem": "from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n", - "unit_tests": [ - "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0]) - 2.0/3.0) < 1e-6\n", - "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0]) - 1.0) < 1e-6\n", - "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0, 5.0]) - 6.0/5.0) < 1e-6\n\n" - ] -} \ No newline at end of file diff --git a/agentverse/tasks/sde_team_2players_nolc/config.yaml b/agentverse/tasks/sde_team_2players_nolc/config.yaml deleted file mode 
100644 index f8c55d757..000000000 --- a/agentverse/tasks/sde_team_2players_nolc/config.yaml +++ /dev/null @@ -1,197 +0,0 @@ -agents: -- agent_type: conversation - llm: - llm_type: gpt-3.5-turbo - max_tokens: 1024 - temperature: 0.1 - memory: - memory_type: sde_team - name: code_writer - prompt_template: 'You are working in a programming team to solve a python code implementation - problem. - - - ${role_description} - - - ${chat_history}' - receiver: - - code_writer - - code_reviewer - role_description: "You are a professional coding assistant. You will be given a\ - \ function signature and docstring (in ). You should copy the import\ - \ statement, the function signature first and then finish the function body. Before\ - \ writing any code, you should first think about the task and write down your\ - \ thoughts in . Then you can write your code in .\n\nYour submitted\ - \ code (in ) will be tested on a series of unit tests. You will\ - \ be given the feedback (in ) of the test, together with the\ - \ review of your code (in ) of a professional code reviewer. You\ - \ can use these feedbacks to refine your code. The refinement will take multiple\ - \ iterations. 
You can use the to record your thoughts during the refinement.\n\ - \nHere is the steps of the code writing and refinement process:\n1 - generate\ - \ \n2 - generate \n\nThe following is two examples on code writing\ - \ and refinement.\n\n[Example on code writing]:\n\n:\ndef is_palindrome(x:\ - \ int) -> bool:\n\"\"\" Given an integer x, return True if x is a palindrome,\ - \ and False otherwise.\nAn integer is a palindrome when it reads the same forward\ - \ and backward.\n>>> is_palindrome(121)\nTrue\n>>> is_palindrome(10)\nFalse\n\"\ - \"\"\n\n:\nI need to convert the integer to a string and then check if\ - \ the string is a palindrome.\n\n:\ndef is_palindrome(x: int) -> bool:\n\ - \ s = str(x)\n return s == s[::-1]\n\n[Example on refinement]:\n\n:\n\ - from typing import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\ - \"\"\" Given an array of integers nums and an integer target, return indices of\ - \ the two numbers such that they add up to target.\nYou may assume that each input\ - \ would have exactly one solution, and you may not use the same element twice.\ - \ \nThe answer should be in an ascending order.\n>>> two_sum([2,7,11,15], 9)\n\ - [0, 1]\n>>> two_sum([3,2,4], 6)\n[1, 2]\n\"\"\"\n\n:\nfrom typing\ - \ import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n \ - \ for i in range(len(nums)):\n for j in range(len(nums)):\n \ - \ if nums[i] + nums[j] == target:\n return [i, j]\n\n:\n{\"is_passing\": false, \"feedback\": \"Tested passed:\\nassert\ - \ two_sum([2, 7, 11, 15], 9) == [0, 1]\\nassert two_sum([3, 5, 2, 7], 9) == [2,\ - \ 3]\\n\\nTests failed:\\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0,\ - \ 0]\"}\n\n:\nThe previous code is not correct. It failed all the\ - \ tests. The second for loop should start from i + 1.\n\n:\nThe code\ - \ reviewer is right. 
I should start the second for loop from i + 1.\n\n:\n\ - from typing import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\ - \ for i in range(len(nums)):\n for j in range(i + 1, len(nums)):\n \ - \ if nums[i] + nums[j] == target:\n return [i, j]\n\n\ - [Start of new task]:\n\nNow please solve the following problem.\n\n\n:\n\ - from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float])\ - \ -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute\ - \ Deviation\n around the mean of this dataset.\n Mean Absolute Deviation\ - \ is the average absolute difference between each\n element and a centerpoint\ - \ (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0,\ - \ 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n" - verbose: true -- agent_type: conversation - llm: - llm_type: gpt-3.5-turbo - max_tokens: 1024 - temperature: 0.1 - memory: - memory_type: sde_team - name: code_reviewer - prompt_template: 'You are working in a programming team to solve a python code implementation - problem. - - - ${role_description} - - - ${chat_history}' - receiver: - - code_writer - role_description: "You are a professional code reviewer. You will be given a function\ - \ signature and docstring in . A code writer has submitted his completion\ - \ in .\nThe code has been executed on a series of unit tests.\ - \ The execution feedback is provided in . Your job is to write\ - \ a code review in to help the code writer improve his code.\n\n\ - The following is an examples.\n\n[Example]:\n\n:\nfrom typing import\ - \ List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\"\"\" Given\ - \ an array of integers nums and an integer target, return indices of the two numbers\ - \ such that they add up to target.\nYou may assume that each input would have\ - \ exactly one solution, and you may not use the same element twice. 
\nThe answer\ - \ should be in an ascending order.\n>>> two_sum([2,7,11,15], 9)\n[0, 1]\n>>> two_sum([3,2,4],\ - \ 6)\n[1, 2]\n\"\"\"\n\n:\nfrom typing import List\n\ndef two_sum(nums:\ - \ List[int], target: int) -> List[int]:\n for i in range(len(nums)):\n \ - \ for j in range(len(nums)):\n if nums[i] + nums[j] == target:\n\ - \ return [i, j]\n\n:\n{\"is_passing\": false,\ - \ \"feedback\": \"Tested passed:\\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\\\ - nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\\n\\nTests failed:\\nassert two_sum([3,\ - \ 2, 4], 6) == [1, 2] # output: [0, 0]\"}\n\n:\nThe previous code\ - \ is not correct. It failed all the tests. The second for loop should start from\ - \ i + 1.\n\n[Start of new task]:\n\nNow please review the following submitted\ - \ code.\n\n\n:\nfrom typing import List\n\n\ndef mean_absolute_deviation(numbers:\ - \ List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate\ - \ Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute\ - \ Deviation is the average absolute difference between each\n element and a\ - \ centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>>\ - \ mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n" - verbose: true -environment: - env_type: sde_team_given_tests - max_turns: 5 - rule: - describer: - type: basic - order: - type: sde_team_given_tests - selector: - type: sde_team_given_tests - updater: - type: sde_team - visibility: - type: all - task_name: HumanEval/0 - unit_tests: '[''assert abs(mean_absolute_deviation([1.0, 2.0, 3.0]) - 2.0/3.0) < - 1e-6\n'', ''assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0]) - 1.0) < - 1e-6\n'', ''assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0, 5.0]) - 6.0/5.0) - < 1e-6\n\n'']' -prompts: - code_reviewer_role_prompt: "You are a professional code reviewer. You will be given\ - \ a function signature and docstring in . 
A code writer has submitted\ - \ his completion in .\nThe code has been executed on a series\ - \ of unit tests. The execution feedback is provided in . Your\ - \ job is to write a code review in to help the code writer improve\ - \ his code.\n\nThe following is an examples.\n\n[Example]:\n\n:\nfrom\ - \ typing import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\ - \"\"\" Given an array of integers nums and an integer target, return indices of\ - \ the two numbers such that they add up to target.\nYou may assume that each input\ - \ would have exactly one solution, and you may not use the same element twice.\ - \ \nThe answer should be in an ascending order.\n>>> two_sum([2,7,11,15], 9)\n\ - [0, 1]\n>>> two_sum([3,2,4], 6)\n[1, 2]\n\"\"\"\n\n:\nfrom typing\ - \ import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n \ - \ for i in range(len(nums)):\n for j in range(len(nums)):\n \ - \ if nums[i] + nums[j] == target:\n return [i, j]\n\n:\n{\"is_passing\": false, \"feedback\": \"Tested passed:\\nassert\ - \ two_sum([2, 7, 11, 15], 9) == [0, 1]\\nassert two_sum([3, 5, 2, 7], 9) == [2,\ - \ 3]\\n\\nTests failed:\\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0,\ - \ 0]\"}\n\n:\nThe previous code is not correct. It failed all the\ - \ tests. The second for loop should start from i + 1.\n\n[Start of new task]:\n\ - \nNow please review the following submitted code.\n" - code_writer_role_prompt: "You are a professional coding assistant. You will be given\ - \ a function signature and docstring (in ). You should copy the import\ - \ statement, the function signature first and then finish the function body. Before\ - \ writing any code, you should first think about the task and write down your\ - \ thoughts in . Then you can write your code in .\n\nYour submitted\ - \ code (in ) will be tested on a series of unit tests. 
You will\ - \ be given the feedback (in ) of the test, together with the\ - \ review of your code (in ) of a professional code reviewer. You\ - \ can use these feedbacks to refine your code. The refinement will take multiple\ - \ iterations. You can use the to record your thoughts during the refinement.\n\ - \nHere is the steps of the code writing and refinement process:\n1 - generate\ - \ \n2 - generate \n\nThe following is two examples on code writing\ - \ and refinement.\n\n[Example on code writing]:\n\n:\ndef is_palindrome(x:\ - \ int) -> bool:\n\"\"\" Given an integer x, return True if x is a palindrome,\ - \ and False otherwise.\nAn integer is a palindrome when it reads the same forward\ - \ and backward.\n>>> is_palindrome(121)\nTrue\n>>> is_palindrome(10)\nFalse\n\"\ - \"\"\n\n:\nI need to convert the integer to a string and then check if\ - \ the string is a palindrome.\n\n:\ndef is_palindrome(x: int) -> bool:\n\ - \ s = str(x)\n return s == s[::-1]\n\n[Example on refinement]:\n\n:\n\ - from typing import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\ - \"\"\" Given an array of integers nums and an integer target, return indices of\ - \ the two numbers such that they add up to target.\nYou may assume that each input\ - \ would have exactly one solution, and you may not use the same element twice.\ - \ \nThe answer should be in an ascending order.\n>>> two_sum([2,7,11,15], 9)\n\ - [0, 1]\n>>> two_sum([3,2,4], 6)\n[1, 2]\n\"\"\"\n\n:\nfrom typing\ - \ import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n \ - \ for i in range(len(nums)):\n for j in range(len(nums)):\n \ - \ if nums[i] + nums[j] == target:\n return [i, j]\n\n:\n{\"is_passing\": false, \"feedback\": \"Tested passed:\\nassert\ - \ two_sum([2, 7, 11, 15], 9) == [0, 1]\\nassert two_sum([3, 5, 2, 7], 9) == [2,\ - \ 3]\\n\\nTests failed:\\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0,\ - \ 0]\"}\n\n:\nThe previous code is not correct. 
It failed all the\ - \ tests. The second for loop should start from i + 1.\n\n:\nThe code\ - \ reviewer is right. I should start the second for loop from i + 1.\n\n:\n\ - from typing import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\ - \ for i in range(len(nums)):\n for j in range(i + 1, len(nums)):\n \ - \ if nums[i] + nums[j] == target:\n return [i, j]\n\n\ - [Start of new task]:\n\nNow please solve the following problem.\n" - prompt: 'You are working in a programming team to solve a python code implementation - problem. - - - ${role_description} - - - ${chat_history}' diff --git a/agentverse/tasks/sde_team_2players_nolc/readme.md b/agentverse/tasks/sde_team_2players_nolc/readme.md deleted file mode 100644 index 4c5968179..000000000 --- a/agentverse/tasks/sde_team_2players_nolc/readme.md +++ /dev/null @@ -1,253 +0,0 @@ -# A SDE team with two players - -In this task, LLMs working in a software development team to solve python code implementation problem. There are 2 agents in this task: code writer and code reviewer. - -### Roles - -#### *code writer* - -Code writer will write the code to satisfy the given requirement. The requirements were given in \ parts. The code writer first think about the task (the thoughts written in \ parts) and then write relevant code in \ part. - -Code writer will modify the written code to debug. The written code will be tested on a series of unit tests to get the feedback. Then the feedback (in \ part) together with a proffesional review (in \ review part) will be given. Then code writer will leverage this information to refine the generated code. - -The refinement will take multiple iterations. - -#### *code reviewer* - -Code reviewer will write professional review for the code writer written code. The code writer written code will be given in \ part, the execution feedback of unit tests will be given in \ part and the review will be written in \ part. 
- -### How to use - -#### Write problem and tests in a json - -The code problem and relevant unit tests should be given in `agentverse/tasks/sde_team_2players_nolc/code_problem.json`. Here is an example. - -```json -{ - "problem": "from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n", - "unit_tests": [ - "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0]) - 2.0/3.0) < 1e-6\n", - "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0]) - 1.0) < 1e-6\n", - "assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0, 5.0]) - 6.0/5.0) < 1e-6\n\n" - ] -} -``` - -#### Build the configuration - -Run `agentverse/tasks/sde_team_2players_nolc/build_config.py` to generate `config.yaml`. - -```bash -python agentverse/tasks/sde_team_2players_nolc/build_config.py -``` - -#### Run the session - -After generating `config.yaml`, run the `main.py` to start the task. - -```python -from agentverse.agentverse import AgentVerse -from argparse import ArgumentParser - -parser = ArgumentParser() -parser.add_argument("--task", type=str, default="sde_team_2players_nolc") - -args = parser.parse_args() -agentverse = AgentVerse.from_task(args.task) -agentverse.run() -``` - -### A running case - -This part will show an example of the whole code writing progress. - - - -Here is an example problem and unit tests. - -Problem: -```python -from typing import List - - -def separate_paren_groups(paren_string: str) -> List[str]: - """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to - separate those group into separate strings and return the list of those. 
- Separate groups are balanced (each open brace is properly closed) and not nested within each other - Ignore any spaces in the input string. - >>> separate_paren_groups('( ) (( )) (( )( ))') - ['()', '(())', '(()())'] - """ -``` - -Unit test: -```python -assert candidate('(()()) ((())) () ((())()())') == ['(()())', '((()))', '()', '((())()())'] -assert candidate('() (()) ((())) (((())))') == ['()', '(())', '((()))', '(((())))'] -assert candidate('(()(())((())))') == ['(()(())((())))'] -assert candidate('( ) (( )) (( )( ))') == ['()', '(())', '(()())'] -``` - -A typical running log is like -```bash -05/22/2023 15:36:18 - INFO - root - code_writer: : - -from typing import List - -def separate_paren_groups(paren_string: str) -> List[str]: - result = [] - open_count = 0 - start = 0 - for i, c in enumerate(paren_string): - if c == '(': - open_count += 1 - elif c == ')': - open_count -= 1 - if open_count == 0: - result.append(paren_string[start:i+1]) - start = i+1 - return result - - -: -{"is_passing": false, "feedback": "Tested passed:\nassert separate_paren_groups('(()(())((())))') == ['(()(())((())))']\n\nTests failed:\nassert separate_paren_groups('(()()) ((())) () ((())()())') == ['(()())', '((()))', '()', '((())()())'] # output: ['(()())', ' ((()))', ' ()', ' ((())()())']\nassert separate_paren_groups('() (()) ((())) (((())))') == ['()', '(())', '((()))', '(((())))'] # output: ['()', ' (())', ' ((()))', ' (((())))']\nassert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())'] # output: ['( )', ' (( ))', ' (( )( ))']"} - -05/22/2023 15:36:33 - INFO - root - code_reviewer: : - -from typing import List - -def separate_paren_groups(paren_string: str) -> List[str]: - result = [] - open_count = 0 - start = 0 - for i, c in enumerate(paren_string): - if c == '(': - open_count += 1 - elif c == ')': - open_count -= 1 - if open_count == 0: - result.append(paren_string[start:i+1]) - start = i+1 - return result - - -: -The code is correct in terms of logic 
and passes one of the test cases. However, it fails the other two test cases. The reason for the failure is that the code does not ignore spaces in the input string as mentioned in the problem statement. - -To fix this issue, we can modify the code to remove all spaces from the input string before processing it. We can do this by adding the following line at the beginning of the function: - -paren_string = paren_string.replace(" ", "") - -This will remove all spaces from the input string and the code should now pass all the test cases. - -05/22/2023 15:36:54 - INFO - root - code_writer: : - -from typing import List - -def separate_paren_groups(paren_string: str) -> List[str]: - paren_string = paren_string.replace(" ", "") - result = [] - open_count = 0 - start = 0 - for i, c in enumerate(paren_string): - if c == '(': - open_count += 1 - elif c == ')': - open_count -= 1 - if open_count == 0: - result.append(paren_string[start:i+1]) - start = i+1 - return result - - -: -{"is_passing": true, "feedback": "Tested passed:\nassert separate_paren_groups('(()()) ((())) () ((())()())') == ['(()())', '((()))', '()', '((())()())']\nassert separate_paren_groups('() (()) ((())) (((())))') == ['()', '(())', '((()))', '(((())))']\nassert separate_paren_groups('(()(())((())))') == ['(()(())((())))']\nassert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n\nTests failed:"} -``` diff --git a/agentverse/tasks/sde_team_3players_nolc/readme.md b/agentverse/tasks/sde_team_3players_nolc/readme.md deleted file mode 100644 index dbfd5ebdf..000000000 --- a/agentverse/tasks/sde_team_3players_nolc/readme.md +++ /dev/null @@ -1,12 +0,0 @@ -# A SDE team with three players - -### Roles - -- *code writer*: write Python code to fill in a function body given function signature and doc string. The generated code will be automatically tested on a series of unit test cases. This role is required to refine the previously submitted code if it haven't passed all the tests. 
Test feedback and a code review will be given in this case. -- *code reviewer*: write detailed code review of the code submitted by the code writer considering the feedback of the unit test. -- *unit test generator*: generate a series of unit test cases for the coding problem. - -### Note - -The generated unit tests are always perfect. They may not be correct and diversified. We will tune our method to encourage more diversity. We also plan to incorporate execution tools to raise the correctness of the generated cases. - diff --git a/main.py b/main.py index 5231529ab..4502f07a6 100644 --- a/main.py +++ b/main.py @@ -4,7 +4,7 @@ parser = ArgumentParser() # parser.add_argument("--task", type=str, default="nlp_classroom_9players") # parser.add_argument("--task", type=str, default="sde_team_3players_nolc") -parser.add_argument("--task", type=str, default="sde_team_2players_nolc") +parser.add_argument("--task", type=str, default="sde_team/sde_team_2players_nolc") args = parser.parse_args() agentverse = AgentVerse.from_task(args.task) From f07e5577f12d3d2edc3e09b020ca4af926d0d8b1 Mon Sep 17 00:00:00 2001 From: Weize Chen <32613237+chenweize1998@users.noreply.github.com> Date: Sun, 28 May 2023 15:49:56 +0800 Subject: [PATCH 10/10] Update main.py --- main.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index 4502f07a6..9f5786224 100644 --- a/main.py +++ b/main.py @@ -3,9 +3,8 @@ parser = ArgumentParser() # parser.add_argument("--task", type=str, default="nlp_classroom_9players") -# parser.add_argument("--task", type=str, default="sde_team_3players_nolc") -parser.add_argument("--task", type=str, default="sde_team/sde_team_2players_nolc") +parser.add_argument("--task", type=str, default="nlp_classroom_3players") args = parser.parse_args() agentverse = AgentVerse.from_task(args.task) -agentverse.run() \ No newline at end of file +agentverse.run()