diff --git a/agentverse/environments/__init__.py b/agentverse/environments/__init__.py index 7617d9476..594e9a024 100644 --- a/agentverse/environments/__init__.py +++ b/agentverse/environments/__init__.py @@ -6,3 +6,5 @@ from .base import BaseEnvironment from .basic import BasicEnvironment from .pokemon import PokemonEnvironment +from .sde_team import SdeTeamEnvironment +from .sde_team_given_tests import SdeTeamGivenTestsEnvironment diff --git a/agentverse/environments/rules/order/__init__.py b/agentverse/environments/rules/order/__init__.py index 4ac22ec7b..1c6ea9f0e 100644 --- a/agentverse/environments/rules/order/__init__.py +++ b/agentverse/environments/rules/order/__init__.py @@ -7,3 +7,5 @@ from .concurrent import ConcurrentOrder from .classroom import ClassroomOrder from .prisoner import PrisonerOrder +from .sde_team import SdeTeamOrder +from .sde_team_given_tests import SdeTeamGivenTestsOrder diff --git a/agentverse/environments/rules/order/sde_team.py b/agentverse/environments/rules/order/sde_team.py new file mode 100644 index 000000000..ac0d54267 --- /dev/null +++ b/agentverse/environments/rules/order/sde_team.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +import logging +import re +import random +from typing import TYPE_CHECKING, Any, List, Optional + +from . 
@OrderRegistry.register("sde_team")
class SdeTeamOrder(BaseOrder):
    """Turn order for the three-agent code problem solving task.

    The opening turn schedules agent 2 several times at once; every later
    turn alternates between agent 0 and agent 1.

    NOTE(review): judging by the sender names the "sde_team" selector checks,
    agent 2 is presumably the unit-test generator and agents 0/1 the code
    writer and reviewer -- confirm against the task config.
    """

    # Index of the agent scheduled by the next call to get_next_agent_idx.
    next_agent_idx: int = 2
    # How many parallel samples are drawn from agent 2 on the opening turn
    # (previously a hard-coded 5).
    num_unit_test_samples: int = 5

    def get_next_agent_idx(self, environment: BaseEnvironment) -> List[int]:
        """Return the agent indices to run this turn and advance the state.

        Raises:
            ValueError: if ``next_agent_idx`` holds a value other than 0, 1
                or 2. The original fell through and returned ``None``, which
                only failed later when the caller iterated the result.
        """
        current = self.next_agent_idx
        if current == 2:
            self.next_agent_idx = 0
            return [2] * self.num_unit_test_samples
        if current == 0:
            self.next_agent_idx = 1
            return [0]
        if current == 1:
            self.next_agent_idx = 0
            return [1]
        raise ValueError(f"Unexpected next_agent_idx: {current!r}")
import ast
import concurrent.futures
import contextlib
import io
import json
import traceback


def get_call_str(assert_statement: str) -> str:
    """Return the source of the left-hand side of an ``assert a == b``.

    For ``assert f(1) == 2`` this returns ``f(1)``. Any parsing problem
    (syntax error, statement that is not a comparison) propagates to the
    caller.
    """
    call_node = ast.parse(assert_statement).body[0].test.left  # type: ignore
    if hasattr(ast, "unparse"):  # standard library, Python 3.9+
        return ast.unparse(call_node).strip()
    # Older interpreters: fall back to the third-party backport.
    import astunparse

    return astunparse.unparse(call_node).strip()


def get_output(func: str, assert_statement: str) -> str:
    """Evaluate the call under test and return what it actually produced.

    Args:
        func: source code defining the function under test.
        assert_statement: an ``assert <call> == <expected>`` statement.

    Returns:
        The raw value of ``<call>`` (not necessarily a string), the exception
        message if defining or calling the function raised, or the literal
        ``"get_call_str error"`` when the call could not be extracted.
    """
    try:
        func_call = get_call_str(assert_statement)
    except Exception:
        return "get_call_str error"

    # Run in a throw-away namespace so the tested code cannot overwrite names
    # in this module (the original exec'd straight into globals()).
    namespace = {}
    try:
        exec(func, namespace)
        return eval(func_call, namespace)
    except Exception as e:
        return str(e)


def worker(code, globals=None, locals=None):
    """Execute *code* and capture everything it prints.

    Returns a ``(output, globals, locals)`` tuple where *output* is the
    captured stdout, or ``"Error: <traceback>"`` if the code raised.
    *globals* is passed through untouched.

    NOTE: stdout redirection swaps ``sys.stdout`` for the whole process, not
    just for the calling thread.
    """
    if locals is None:
        locals = {}
    captured = io.StringIO()
    try:
        with contextlib.redirect_stdout(captured):
            # A single dict is used for both namespaces on purpose: with
            # separate globals/locals, functions defined by the code cannot
            # see its own top-level imports, e.g.
            #   "import math\ndef f(x):\n\treturn math.pow(x, 2)\nassert f(2) == 4"
            # fails with "NameError: name 'math' is not defined".
            exec(code, locals, locals)
        return captured.getvalue(), globals, locals
    except Exception:
        return f"Error: {traceback.format_exc()}", globals, locals


def execute_code(code: str, timeout: float = 5) -> str:
    """Execute a snippet of python code and return the output or the error message.

    Returns the literal ``"Timeout"`` if the snippet has not finished after
    *timeout* seconds.

    NOTE: a Python thread cannot be killed, so a timed-out snippet keeps
    running in the background (and keeps stdout redirected until it ends).
    Truly untrusted code should be run in a subprocess instead.
    """
    # Deliberately not a ``with`` block: leaving it calls shutdown(wait=True),
    # which blocks until the snippet finishes and so defeats the timeout.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(worker, code)
        result, _, _ = future.result(timeout)
        return result
    except concurrent.futures.TimeoutError:
        return "Timeout"
    finally:
        executor.shutdown(wait=False)


def execute_unit_tests(func_impl: str, tests: str) -> str:
    """Run a python function on a bunch of unit tests and return detailed feedback.

    Args:
        func_impl: source code of the implementation under test.
        tests: a list of ``assert`` statements, or the ``str()`` of such a
            list (parsed with ``ast.literal_eval``).

    Returns:
        A JSON string ``{"is_passing": bool, "feedback": str}``.
    """
    if isinstance(tests, str):
        tests = ast.literal_eval(tests)

    success_tests = []
    failed_tests = []
    for test in tests:
        # Each test runs against a fresh copy of the implementation.
        output = execute_code(f"{func_impl}\n{test}")
        if output == "Timeout":
            failed_tests.append(f"{test} # output: Timeout")
        elif output.startswith("Error: "):
            # Report what the call really returned; fall back to the
            # traceback when the call cannot be extracted from the assert.
            func_output = get_output(func_impl, test)
            if func_output == "get_call_str error":
                func_output = output
            failed_tests.append(f"{test} # output: {func_output}")
        else:
            success_tests.append(test)

    feedback = (
        "Tested passed:\n\n"
        + "".join(f"{test}\n\n" for test in success_tests)
        + "Tests failed:\n\n"
        + "".join(f"{test}\n\n" for test in failed_tests)
    )
    return json.dumps({"is_passing": not failed_tests, "feedback": feedback})
import ast
import json


def extract(content: str, keyword: str):
    """Return the lines of *content* that follow the first *keyword* line.

    A line "starts with" the keyword after surrounding whitespace is
    stripped. Keyword lines themselves are never included, and every kept
    line is terminated with a newline. Returns "" if no line matches.
    """
    kept = []
    seen_keyword = False
    for line in content.split("\n"):
        if line.strip().startswith(keyword):
            seen_keyword = True
            continue
        if seen_keyword:
            kept.append(line + "\n")
    return "".join(kept)


@SelectorRegistry.register("sde_team")
class SdeTeamSelector(BaseSelector):
    """Post-process agent messages for the three-role SDE team task."""

    def select_message(
        self, environment: "BaseEnvironment", messages: "List[Message]"
    ) -> "List[Message]":
        """Rewrite *messages* according to who produced them.

        * ``unit_test_generator``: collect the distinct unit tests into
          ``rule_params["unit_tests"]`` and emit one empty message.
        * ``code_writer``: store the code, run the unit tests on it and
          append the feedback to the message.
        * ``code_reviewer``: prepend the current code to the review.

        ``rule_params["end_flag"]`` is set once the stored feedback reports
        that all tests pass.
        """
        last_sender = environment.last_messages[0].sender
        selected = messages

        if last_sender == "unit_test_generator":
            # dict.fromkeys drops duplicates while keeping first-seen order,
            # so the test order is the same on every run (a set is not).
            unit_tests = list(
                dict.fromkeys(extract(message.content, ":") for message in selected)
            )
            environment.rule_params["unit_tests"] = str(unit_tests)
            new_message = Message(
                content="",
                sender="unit_test_generator",
                receiver=[],
            )  # TODO: set the content of the message
            selected = [new_message]

        elif last_sender == "code_writer":
            cur_code = extract(selected[0].content, ":")
            environment.rule_params["code"] = cur_code

            from .code_api import execute_unit_tests

            # The tests were stored as str(list); literal_eval parses that
            # back without executing arbitrary expressions the way eval does.
            feedback = execute_unit_tests(
                cur_code, ast.literal_eval(environment.rule_params["unit_tests"])
            )

            environment.rule_params["feedback"] = feedback
            selected[0].content = f":\n\n{cur_code}\n\n:\n{feedback}"
            if json.loads(feedback)["is_passing"]:
                environment.rule_params["end_flag"] = True

        elif last_sender == "code_reviewer":
            code_review = selected[0].content
            cur_code = environment.rule_params["code"]
            selected[0].content = f":\n\n{cur_code}\n\n{code_review}"
            feedback = environment.rule_params["feedback"]
            if json.loads(feedback)["is_passing"]:
                environment.rule_params["end_flag"] = True

        return selected
import ast
import json


def extract(content: str, keyword: str):
    """Return the lines of *content* that follow the first *keyword* line.

    A line "starts with" the keyword after surrounding whitespace is
    stripped. Keyword lines themselves are never included, and every kept
    line is terminated with a newline. Returns "" if no line matches.
    """
    kept = []
    seen_keyword = False
    for line in content.split("\n"):
        if line.strip().startswith(keyword):
            seen_keyword = True
            continue
        if seen_keyword:
            kept.append(line + "\n")
    return "".join(kept)


@SelectorRegistry.register("sde_team_given_tests")
class SdeTeamGivenTestsSelector(BaseSelector):
    """Post-process agent messages when the unit tests are supplied up front."""

    def select_message(
        self, environment: "BaseEnvironment", messages: "List[Message]"
    ) -> "List[Message]":
        """Rewrite *messages* according to who produced them.

        * ``code_writer``: store the code, run ``environment.unit_tests`` on
          it and append the feedback to the message.
        * ``code_reviewer``: prepend the current code to the review.

        ``rule_params["end_flag"]`` is set once the stored feedback reports
        that all tests pass.
        """
        last_sender = environment.last_messages[0].sender
        selected = messages

        if last_sender == "code_writer":
            cur_code = extract(selected[0].content, ":")
            environment.rule_params["code"] = cur_code

            from .code_api import execute_unit_tests

            # unit_tests is the text form of a list of assert strings;
            # literal_eval parses it without running arbitrary expressions.
            feedback = execute_unit_tests(
                cur_code, ast.literal_eval(environment.unit_tests)
            )

            environment.rule_params["feedback"] = feedback
            selected[0].content = f":\n\n{cur_code}\n\n:\n{feedback}"
            if json.loads(feedback)["is_passing"]:
                environment.rule_params["end_flag"] = True

        elif last_sender == "code_reviewer":
            code_review = selected[0].content
            cur_code = environment.rule_params["code"]
            selected[0].content = f":\n\n{cur_code}\n\n{code_review}"
            feedback = environment.rule_params["feedback"]
            if json.loads(feedback)["is_passing"]:
                environment.rule_params["end_flag"] = True

        return selected
@UpdaterRegistry.register("sde_team")
class SdeTeamUpdater(BaseUpdater):
    """Deliver the last turn's messages to the agents they are addressed to.

    Messages with empty content are skipped.
    """

    def update_memory(self, environment: "BaseEnvironment"):
        """Add every non-empty message in ``environment.last_messages`` to memory."""
        for message in environment.last_messages:
            if message.content == "":
                continue
            self.add_message_to_all_agents(environment.agents, message)

    def add_message_to_all_agents(
        self, agents: "List[BaseAgent]", message: "Message"
    ) -> bool:
        """Store *message* in the memory of each receiving agent.

        A receiver of ``"all"`` broadcasts to every agent; otherwise only
        agents whose name is listed receive the message.

        Returns:
            Always ``True``.

        Raises:
            ValueError: if a listed receiver matches no agent name.
        """
        if "all" in message.receiver:
            # If receiver is all, then add the message to all agents
            for agent in agents:
                agent.add_message_to_memory([message])
            return True

        # Work on a copy: the original removed names from message.receiver
        # itself, so the message lost its receivers as a side effect.
        pending = set(message.receiver)
        for agent in agents:
            if agent.name in pending:
                agent.add_message_to_memory([message])
                pending.remove(agent.name)
        if pending:
            missing_receiver = ", ".join(sorted(pending))
            raise ValueError(
                "Receiver {} not found. Message discarded".format(missing_receiver)
            )
        return True
import asyncio
import logging
from typing import Dict, List


# --- agentverse/environments/rules/visibility/sde_team.py ---
@VisibilityRegistry.register("sde_team")
class SdeTeamVisibility(BaseVisibility):
    """Visibility rule for the code problem: visibility never changes."""

    def update_visible_agents(self, environment: "BaseEnvironment"):
        """Do nothing; the set of visible agents is left as configured."""
        return

    def reset(self):
        """Do nothing; this rule keeps no state."""
        return


# --- agentverse/environments/sde_team.py ---
@EnvironmentRegistry.register("sde_team")
class SdeTeamEnvironment(BaseEnvironment):
    """
    A basic environment implementing the logic of conversation to craft code.

    Args:
        agents: List of agents
        rule: Rule for the environment
        max_turns: Maximum number of turns
        cnt_turn: Current turn number
        last_messages: Messages from last turn
        rule_params: Variables set by the rule
        task_name: Name of the task being solved
    """

    agents: List[BaseAgent]
    rule: Rule
    max_turns: int = 10
    cnt_turn: int = 0
    # NOTE(review): mutable defaults are only safe if the base class copies
    # field defaults per instance (as pydantic models do) -- confirm.
    last_messages: List[Message] = []
    rule_params: Dict = {}
    task_name: str = "test"

    def __init__(self, rule, **kwargs):
        """Build the Rule from its config dict, then initialise the base class.

        Missing rule sections default to the "sde_team" order, selector and
        updater and to the "base" visibility and describer.
        """
        rule_config = rule
        order_config = rule_config.get("order", {"type": "sde_team"})
        visibility_config = rule_config.get("visibility", {"type": "base"})
        selector_config = rule_config.get("selector", {"type": "sde_team"})
        updater_config = rule_config.get("updater", {"type": "sde_team"})
        describer_config = rule_config.get("describer", {"type": "base"})
        rule = Rule(
            order_config,
            visibility_config,
            selector_config,
            updater_config,
            describer_config,
        )
        super().__init__(rule=rule, **kwargs)
        self.rule_params["first_round"] = True
        self.rule_params["end_flag"] = False

    async def step(self) -> List[Message]:
        """Run one step of the environment"""

        # Get the next agent index
        agent_ids = self.rule.get_next_agent_idx(self)  # order

        # The describer is not consulted: every scheduled agent is stepped
        # concurrently with an empty environment description.
        messages = await asyncio.gather(
            *[self.agents[i].astep("") for i in agent_ids]
        )

        # The selector reads last_messages to find out who just spoke, so
        # the raw messages must be stored before it runs.
        self.last_messages = messages

        # Some rules will select certain messages from all the messages
        selected_messages = self.rule.select_message(self, messages)  # selector
        self.last_messages = selected_messages
        self.print_messages(selected_messages)

        # Update the memory of the agents
        self.rule.update_memory(self)  # updater: update memory

        # Update the set of visible agents for each agent
        self.rule.update_visible_agents(self)  # change receiver

        self.cnt_turn += 1

        return selected_messages

    def print_messages(self, messages: List[Message]) -> None:
        """Log each non-None message as "<sender>: <content>"."""
        for message in messages:
            if message is not None:
                logging.info(f"{message.sender}: {message.content}")

    def reset(self) -> None:
        """Reset the environment"""
        self.cnt_turn = 0
        # Restore the flags set in __init__; otherwise a finished run leaves
        # end_flag set and is_done() stays True after the reset.
        self.rule_params["first_round"] = True
        self.rule_params["end_flag"] = False
        self.rule.reset()
        for agent in self.agents:
            agent.reset()

    def is_done(self) -> bool:
        """Check if the environment is done"""
        return bool(
            self.cnt_turn >= self.max_turns or self.rule_params["end_flag"]
        )
import asyncio
import logging
from typing import Dict, List


@EnvironmentRegistry.register("sde_team_given_tests")
class SdeTeamGivenTestsEnvironment(BaseEnvironment):
    """
    A basic environment implementing the logic of conversation to craft code.

    Args:
        agents: List of agents
        rule: Rule for the environment
        max_turns: Maximum number of turns
        cnt_turn: Current turn number
        last_messages: Messages from last turn
        rule_params: Variables set by the rule
        unit_tests: Text form of the list of given unit tests
    """

    agents: List[BaseAgent]
    rule: Rule
    max_turns: int = 10
    cnt_turn: int = 0
    # NOTE(review): mutable defaults are only safe if the base class copies
    # field defaults per instance (as pydantic models do) -- confirm.
    last_messages: List[Message] = []
    rule_params: Dict = {}
    unit_tests: str = ""

    def __init__(self, rule, **kwargs):
        """Build the Rule from its config dict, then initialise the base class.

        Missing rule sections default to the "sde_team_given_tests" order and
        selector, the "sde_team" updater and the "base" visibility and
        describer.
        """
        rule_config = rule
        order_config = rule_config.get("order", {"type": "sde_team_given_tests"})
        visibility_config = rule_config.get("visibility", {"type": "base"})
        selector_config = rule_config.get("selector", {"type": "sde_team_given_tests"})
        updater_config = rule_config.get("updater", {"type": "sde_team"})
        describer_config = rule_config.get("describer", {"type": "base"})
        rule = Rule(
            order_config,
            visibility_config,
            selector_config,
            updater_config,
            describer_config,
        )
        super().__init__(rule=rule, **kwargs)
        self.rule_params["first_round"] = True
        self.rule_params["end_flag"] = False

    async def step(self) -> List[Message]:
        """Run one step of the environment"""

        # Get the next agent index
        agent_ids = self.rule.get_next_agent_idx(self)  # order

        # The describer is not consulted: every scheduled agent is stepped
        # concurrently with an empty environment description.
        messages = await asyncio.gather(
            *[self.agents[i].astep("") for i in agent_ids]
        )

        # The selector reads last_messages to find out who just spoke, so
        # the raw messages must be stored before it runs.
        self.last_messages = messages

        # Some rules will select certain messages from all the messages
        selected_messages = self.rule.select_message(self, messages)  # selector
        self.last_messages = selected_messages
        self.print_messages(selected_messages)

        # Update the memory of the agents
        self.rule.update_memory(self)  # updater: update memory

        # Update the set of visible agents for each agent
        self.rule.update_visible_agents(self)  # change receiver

        self.cnt_turn += 1

        return selected_messages

    def print_messages(self, messages: List[Message]) -> None:
        """Log each non-None message as "<sender>: <content>"."""
        for message in messages:
            if message is not None:
                logging.info(f"{message.sender}: {message.content}")

    def reset(self) -> None:
        """Reset the environment"""
        self.cnt_turn = 0
        # Restore the flags set in __init__; otherwise a finished run leaves
        # end_flag set and is_done() stays True after the reset.
        self.rule_params["first_round"] = True
        self.rule_params["end_flag"] = False
        self.rule.reset()
        for agent in self.agents:
            agent.reset()

    def is_done(self) -> bool:
        """Check if the environment is done"""
        return bool(
            self.cnt_turn >= self.max_turns or self.rule_params["end_flag"]
        )
@memory_registry.register("sde_team")
class SdeTeamMemory(BaseMemory):
    """Memory used by the SDE team environments.

    Only the most recently added batch of messages is kept; each call to
    ``add_message`` replaces the previous buffer.
    TODO: add summarized history
    """

    buffer: str = Field(default="")

    def add_message(self, messages: List[Message]) -> None:
        """Replace the buffer with the newline-joined contents of *messages*."""
        contents = [msg.content for msg in messages]
        self.buffer = "\n".join(contents)

    def to_string(self, *args, **kwargs) -> str:
        """Return the buffer; positional and keyword arguments are ignored."""
        return self.buffer

    def reset(self) -> None:
        """Empty the buffer."""
        self.buffer = ""
+ + +## *sde_team/sde_team_2players_nolc* + +In this case, we simulate a code generation problem in which a Python function body must be generated from a given function signature, docstring and unit tests. The details are described below. + +### Roles + +Detailed role descriptions and prompts can be found in `config.yaml`. + +#### *code writer* + +The code writer writes code to satisfy the given requirement. The requirement is given in the problem field of the prompt. The code writer first thinks about the task (writing the thoughts in the thoughts field) and then writes the code in the code field. + +The submitted code is tested automatically on a series of unit tests. The unit-test feedback, together with a professional code review, is then returned. The code writer uses this information to refine the previously submitted code. The refinement may take multiple iterations. + +#### *code reviewer* + +The code reviewer writes a professional review of the submitted code. The reviewer is given the submitted code and the execution feedback of the unit tests, and composes the review in the code-review field. + +### How to run the simulation + +#### Provide problem and unit tests + +The code problem and unit tests should be given in `agentverse/tasks/sde_team/sde_team_2players_nolc/code_problem.json`. Here is an example. + +```json +{ + "problem": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. 
Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", + "unit_tests": [ + "assert separate_paren_groups('(()()) ((())) () ((())()())') == ['(()())', '((()))', '()', '((())()())']", + "assert separate_paren_groups('() (()) ((())) (((())))') == ['()', '(())', '((()))', '(((())))']", + "assert separate_paren_groups('(()(())((())))') == ['(()(())((())))']", + "assert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())']" + ] +} +``` + +#### Build the configuration file + +Run `agentverse/tasks/sde_team/sde_team_2players_nolc/build_config.py` to generate `config.yaml`. + +```bash +cd agentverse/tasks/sde_team/sde_team_2players_nolc/ +python build_config.py +``` + +#### Run the session + +After generating `config.yaml`, run the `main.py` to start the task. + +```python +from agentverse.agentverse import AgentVerse +from argparse import ArgumentParser + +parser = ArgumentParser() +parser.add_argument("--task", type=str, default="sde_team/sde_team_2players_nolc") + +args = parser.parse_args() +agentverse = AgentVerse.from_task(args.task) +agentverse.run() +``` + +### A running case + +A typical running log of the previous problem is shown below. 
+ +```bash +05/22/2023 15:36:18 - INFO - root - code_writer: : + +from typing import List + +def separate_paren_groups(paren_string: str) -> List[str]: + result = [] + open_count = 0 + start = 0 + for i, c in enumerate(paren_string): + if c == '(': + open_count += 1 + elif c == ')': + open_count -= 1 + if open_count == 0: + result.append(paren_string[start:i+1]) + start = i+1 + return result + + +: +{"is_passing": false, "feedback": "Tested passed:\nassert separate_paren_groups('(()(())((())))') == ['(()(())((())))']\n\nTests failed:\nassert separate_paren_groups('(()()) ((())) () ((())()())') == ['(()())', '((()))', '()', '((())()())'] # output: ['(()())', ' ((()))', ' ()', ' ((())()())']\nassert separate_paren_groups('() (()) ((())) (((())))') == ['()', '(())', '((()))', '(((())))'] # output: ['()', ' (())', ' ((()))', ' (((())))']\nassert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())'] # output: ['( )', ' (( ))', ' (( )( ))']"} + +05/22/2023 15:36:33 - INFO - root - code_reviewer: : + +from typing import List + +def separate_paren_groups(paren_string: str) -> List[str]: + result = [] + open_count = 0 + start = 0 + for i, c in enumerate(paren_string): + if c == '(': + open_count += 1 + elif c == ')': + open_count -= 1 + if open_count == 0: + result.append(paren_string[start:i+1]) + start = i+1 + return result + + +: +The code is correct in terms of logic and passes one of the test cases. However, it fails the other two test cases. The reason for the failure is that the code does not ignore spaces in the input string as mentioned in the problem statement. + +To fix this issue, we can modify the code to remove all spaces from the input string before processing it. We can do this by adding the following line at the beginning of the function: + +paren_string = paren_string.replace(" ", "") + +This will remove all spaces from the input string and the code should now pass all the test cases. 
+ +05/22/2023 15:36:54 - INFO - root - code_writer: : + +from typing import List + +def separate_paren_groups(paren_string: str) -> List[str]: + paren_string = paren_string.replace(" ", "") + result = [] + open_count = 0 + start = 0 + for i, c in enumerate(paren_string): + if c == '(': + open_count += 1 + elif c == ')': + open_count -= 1 + if open_count == 0: + result.append(paren_string[start:i+1]) + start = i+1 + return result + + +: +{"is_passing": true, "feedback": "Tested passed:\nassert separate_paren_groups('(()()) ((())) () ((())()())') == ['(()())', '((()))', '()', '((())()())']\nassert separate_paren_groups('() (()) ((())) (((())))') == ['()', '(())', '((()))', '(((())))']\nassert separate_paren_groups('(()(())((())))') == ['(()(())((())))']\nassert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n\nTests failed:"} +``` + +## *sde_team/sde_team_3players_nolc* + +Different from *sde_team/sde_team_2players_nolc*, we additionally introduce a role to automatically generate unit tests. + +- *unit test generator*: generate a series of unit test cases for the coding problem. + +### Stay tuned + +The generated unit tests are not always perfect, as they may not be correct. We plan to incorporate tools to raise the correctness of the generated cases. 
\ No newline at end of file diff --git a/agentverse/tasks/sde_team/sde_team_2players_nolc/build_config.py b/agentverse/tasks/sde_team/sde_team_2players_nolc/build_config.py new file mode 100644 index 000000000..1fea37cae --- /dev/null +++ b/agentverse/tasks/sde_team/sde_team_2players_nolc/build_config.py @@ -0,0 +1,20 @@ +import yaml +import json + +config_path = "partial_config.yaml" + +code_problem = json.load(open("code_problem.json", "r")) +problem_string = "\n\n:\n" + code_problem["problem"] +unit_tests = str(code_problem["unit_tests"]) + +print(problem_string) +print(unit_tests) + +task_config = yaml.safe_load(open(config_path)) + +for agent_configs in task_config["agents"]: + agent_configs["role_description"] += problem_string +task_config["environment"]["unit_tests"] = unit_tests + +with open("config.yaml", "w") as f: + yaml.safe_dump(task_config, f) \ No newline at end of file diff --git a/agentverse/tasks/sde_team/sde_team_2players_nolc/code_problem.json b/agentverse/tasks/sde_team/sde_team_2players_nolc/code_problem.json new file mode 100644 index 000000000..23f444b00 --- /dev/null +++ b/agentverse/tasks/sde_team/sde_team_2players_nolc/code_problem.json @@ -0,0 +1,9 @@ +{ + "problem": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. 
Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", + "unit_tests": [ + "assert separate_paren_groups('(()()) ((())) () ((())()())') == ['(()())', '((()))', '()', '((())()())']", + "assert separate_paren_groups('() (()) ((())) (((())))') == ['()', '(())', '((()))', '(((())))']", + "assert separate_paren_groups('(()(())((())))') == ['(()(())((())))']", + "assert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())']" + ] +} \ No newline at end of file diff --git a/agentverse/tasks/sde_team/sde_team_2players_nolc/config.yaml b/agentverse/tasks/sde_team/sde_team_2players_nolc/config.yaml new file mode 100644 index 000000000..4b616cf60 --- /dev/null +++ b/agentverse/tasks/sde_team/sde_team_2players_nolc/config.yaml @@ -0,0 +1,202 @@ +agents: +- agent_type: conversation + llm: + llm_type: gpt-3.5-turbo + max_tokens: 1024 + temperature: 0.1 + memory: + memory_type: sde_team + name: code_writer + prompt_template: 'You are working in a programming team to solve a python code implementation + problem. + + + ${role_description} + + + ${chat_history}' + receiver: + - code_writer + - code_reviewer + role_description: "You are a professional coding assistant. You will be given a\ + \ function signature and docstring (in ). You should copy the import\ + \ statement, the function signature first and then finish the function body. Before\ + \ writing any code, you should first think about the task and write down your\ + \ thoughts in . Then you can write your code in .\n\nYour submitted\ + \ code (in ) will be tested on a series of unit tests. You will\ + \ be given the feedback (in ) of the test, together with the\ + \ review of your code (in ) of a professional code reviewer. 
You\ + \ can use these feedbacks to refine your code. The refinement will take multiple\ + \ iterations. You can use the to record your thoughts during the refinement.\n\ + \nThe unit tests have been prepared. DO NOT generate unit tests!\n\nHere is the\ + \ steps of the code writing and refinement process:\n1 - generate \n\ + 2 - generate \n\nThe following is two examples on code writing and refinement.\n\ + \n[Example on code writing]:\n\n:\ndef is_palindrome(x: int) -> bool:\n\ + \"\"\" Given an integer x, return True if x is a palindrome, and False otherwise.\n\ + An integer is a palindrome when it reads the same forward and backward.\n>>> is_palindrome(121)\n\ + True\n>>> is_palindrome(10)\nFalse\n\"\"\"\n\n:\nI need to convert the\ + \ integer to a string and then check if the string is a palindrome.\n\n:\n\ + def is_palindrome(x: int) -> bool:\n s = str(x)\n return s == s[::-1]\n\n\ + [Example on refinement]:\n\n:\nfrom typing import List\n\ndef two_sum(nums:\ + \ List[int], target: int) -> List[int]:\n\"\"\" Given an array of integers nums\ + \ and an integer target, return indices of the two numbers such that they add\ + \ up to target.\nYou may assume that each input would have exactly one solution,\ + \ and you may not use the same element twice. \nThe answer should be in an ascending\ + \ order.\n>>> two_sum([2,7,11,15], 9)\n[0, 1]\n>>> two_sum([3,2,4], 6)\n[1, 2]\n\ + \"\"\"\n\n:\nfrom typing import List\n\ndef two_sum(nums: List[int],\ + \ target: int) -> List[int]:\n for i in range(len(nums)):\n for j in\ + \ range(len(nums)):\n if nums[i] + nums[j] == target:\n \ + \ return [i, j]\n\n:\n{\"is_passing\": false, \"feedback\"\ + : \"Tested passed:\\n\\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\\n\\nassert\ + \ two_sum([3, 5, 2, 7], 9) == [2, 3]\\n\\nTests failed:\\n\\nassert two_sum([3,\ + \ 2, 4], 6) == [1, 2] # output: [0, 0]\"}\n\n:\nThe previous code\ + \ is not correct. It failed all the tests. 
The second for loop should start from\ + \ i + 1.\n\n:\nThe code reviewer is right. I should start the second\ + \ for loop from i + 1.\n\n:\nfrom typing import List\n\ndef two_sum(nums:\ + \ List[int], target: int) -> List[int]:\n for i in range(len(nums)):\n \ + \ for j in range(i + 1, len(nums)):\n if nums[i] + nums[j] == target:\n\ + \ return [i, j]\n\n[Start of new task]:\n\nNow please solve the\ + \ following problem. DO NOT generate unit tests.\n\n\n:\nfrom typing\ + \ import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n\ + \ \"\"\" Input to this function is a string containing multiple groups of nested\ + \ parentheses. Your goal is to\n separate those group into separate strings\ + \ and return the list of those.\n Separate groups are balanced (each open brace\ + \ is properly closed) and not nested within each other\n Ignore any spaces\ + \ in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n\ + \ ['()', '(())', '(()())']\n \"\"\"\n" + verbose: true +- agent_type: conversation + llm: + llm_type: gpt-3.5-turbo + max_tokens: 1024 + temperature: 0.1 + memory: + memory_type: sde_team + name: code_reviewer + prompt_template: 'You are working in a programming team to solve a python code implementation + problem. + + + ${role_description} + + + ${chat_history}' + receiver: + - code_writer + role_description: "You are a professional code reviewer. You will be given a function\ + \ signature and docstring in . A code writer has submitted his completion\ + \ in .\nThe code has been executed on a series of unit tests.\ + \ The execution feedback is provided in . 
Your job is to write\ + \ a code review in to help the code writer improve his code.\n\n\ + The following is an examples.\n\n[Example]:\n\n:\nfrom typing import\ + \ List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\"\"\" Given\ + \ an array of integers nums and an integer target, return indices of the two numbers\ + \ such that they add up to target.\nYou may assume that each input would have\ + \ exactly one solution, and you may not use the same element twice. \nThe answer\ + \ should be in an ascending order.\n>>> two_sum([2,7,11,15], 9)\n[0, 1]\n>>> two_sum([3,2,4],\ + \ 6)\n[1, 2]\n\"\"\"\n\n:\nfrom typing import List\n\ndef two_sum(nums:\ + \ List[int], target: int) -> List[int]:\n for i in range(len(nums)):\n \ + \ for j in range(len(nums)):\n if nums[i] + nums[j] == target:\n\ + \ return [i, j]\n\n:\n{\"is_passing\": false,\ + \ \"feedback\": \"Tested passed:\\n\\nassert two_sum([2, 7, 11, 15], 9) == [0,\ + \ 1]\\n\\nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\\n\\nTests failed:\\n\\nassert\ + \ two_sum([3, 2, 4], 6) == [1, 2] # output: [0, 0]\"}\n\n:\nThe previous\ + \ code is not correct. It failed all the tests. The second for loop should start\ + \ from i + 1.\n\n[Start of new task]:\n\nNow please review the following submitted\ + \ code.\n\n\n:\nfrom typing import List\n\n\ndef separate_paren_groups(paren_string:\ + \ str) -> List[str]:\n \"\"\" Input to this function is a string containing\ + \ multiple groups of nested parentheses. 
Your goal is to\n separate those group\ + \ into separate strings and return the list of those.\n Separate groups are\ + \ balanced (each open brace is properly closed) and not nested within each other\n\ + \ Ignore any spaces in the input string.\n >>> separate_paren_groups('(\ + \ ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n" + verbose: true +environment: + env_type: sde_team_given_tests + max_turns: 5 + rule: + describer: + type: basic + order: + type: sde_team_given_tests + selector: + type: sde_team_given_tests + updater: + type: sde_team + visibility: + type: all + task_name: HumanEval/0 + unit_tests: '["assert separate_paren_groups(''(()()) ((())) () ((())()())'') == + [''(()())'', ''((()))'', ''()'', ''((())()())'']", "assert separate_paren_groups(''() + (()) ((())) (((())))'') == [''()'', ''(())'', ''((()))'', ''(((())))'']", "assert + separate_paren_groups(''(()(())((())))'') == [''(()(())((())))'']", "assert separate_paren_groups(''( + ) (( )) (( )( ))'') == [''()'', ''(())'', ''(()())'']"]' +prompts: + code_reviewer_role_prompt: "You are a professional code reviewer. You will be given\ + \ a function signature and docstring in . A code writer has submitted\ + \ his completion in .\nThe code has been executed on a series\ + \ of unit tests. The execution feedback is provided in . 
Your\ + \ job is to write a code review in to help the code writer improve\ + \ his code.\n\nThe following is an examples.\n\n[Example]:\n\n:\nfrom\ + \ typing import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n\ + \"\"\" Given an array of integers nums and an integer target, return indices of\ + \ the two numbers such that they add up to target.\nYou may assume that each input\ + \ would have exactly one solution, and you may not use the same element twice.\ + \ \nThe answer should be in an ascending order.\n>>> two_sum([2,7,11,15], 9)\n\ + [0, 1]\n>>> two_sum([3,2,4], 6)\n[1, 2]\n\"\"\"\n\n:\nfrom typing\ + \ import List\n\ndef two_sum(nums: List[int], target: int) -> List[int]:\n \ + \ for i in range(len(nums)):\n for j in range(len(nums)):\n \ + \ if nums[i] + nums[j] == target:\n return [i, j]\n\n:\n{\"is_passing\": false, \"feedback\": \"Tested passed:\\n\\nassert\ + \ two_sum([2, 7, 11, 15], 9) == [0, 1]\\n\\nassert two_sum([3, 5, 2, 7], 9) ==\ + \ [2, 3]\\n\\nTests failed:\\n\\nassert two_sum([3, 2, 4], 6) == [1, 2] # output:\ + \ [0, 0]\"}\n\n:\nThe previous code is not correct. It failed all\ + \ the tests. The second for loop should start from i + 1.\n\n[Start of new task]:\n\ + \nNow please review the following submitted code.\n" + code_writer_role_prompt: "You are a professional coding assistant. You will be given\ + \ a function signature and docstring (in ). You should copy the import\ + \ statement, the function signature first and then finish the function body. Before\ + \ writing any code, you should first think about the task and write down your\ + \ thoughts in . Then you can write your code in .\n\nYour submitted\ + \ code (in ) will be tested on a series of unit tests. You will\ + \ be given the feedback (in ) of the test, together with the\ + \ review of your code (in ) of a professional code reviewer. You\ + \ can use these feedbacks to refine your code. The refinement will take multiple\ + \ iterations. 
You can use the to record your thoughts during the refinement.\n\ + \nThe unit tests have been prepared. DO NOT generate unit tests!\n\nHere is the\ + \ steps of the code writing and refinement process:\n1 - generate \n\ + 2 - generate \n\nThe following is two examples on code writing and refinement.\n\ + \n[Example on code writing]:\n\n:\ndef is_palindrome(x: int) -> bool:\n\ + \"\"\" Given an integer x, return True if x is a palindrome, and False otherwise.\n\ + An integer is a palindrome when it reads the same forward and backward.\n>>> is_palindrome(121)\n\ + True\n>>> is_palindrome(10)\nFalse\n\"\"\"\n\n:\nI need to convert the\ + \ integer to a string and then check if the string is a palindrome.\n\n:\n\ + def is_palindrome(x: int) -> bool:\n s = str(x)\n return s == s[::-1]\n\n\ + [Example on refinement]:\n\n:\nfrom typing import List\n\ndef two_sum(nums:\ + \ List[int], target: int) -> List[int]:\n\"\"\" Given an array of integers nums\ + \ and an integer target, return indices of the two numbers such that they add\ + \ up to target.\nYou may assume that each input would have exactly one solution,\ + \ and you may not use the same element twice. \nThe answer should be in an ascending\ + \ order.\n>>> two_sum([2,7,11,15], 9)\n[0, 1]\n>>> two_sum([3,2,4], 6)\n[1, 2]\n\ + \"\"\"\n\n:\nfrom typing import List\n\ndef two_sum(nums: List[int],\ + \ target: int) -> List[int]:\n for i in range(len(nums)):\n for j in\ + \ range(len(nums)):\n if nums[i] + nums[j] == target:\n \ + \ return [i, j]\n\n:\n{\"is_passing\": false, \"feedback\"\ + : \"Tested passed:\\n\\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\\n\\nassert\ + \ two_sum([3, 5, 2, 7], 9) == [2, 3]\\n\\nTests failed:\\n\\nassert two_sum([3,\ + \ 2, 4], 6) == [1, 2] # output: [0, 0]\"}\n\n:\nThe previous code\ + \ is not correct. It failed all the tests. The second for loop should start from\ + \ i + 1.\n\n:\nThe code reviewer is right. 
I should start the second\ + \ for loop from i + 1.\n\n:\nfrom typing import List\n\ndef two_sum(nums:\ + \ List[int], target: int) -> List[int]:\n for i in range(len(nums)):\n \ + \ for j in range(i + 1, len(nums)):\n if nums[i] + nums[j] == target:\n\ + \ return [i, j]\n\n[Start of new task]:\n\nNow please solve the\ + \ following problem. DO NOT generate unit tests.\n" + prompt: 'You are working in a programming team to solve a python code implementation + problem. + + + ${role_description} + + + ${chat_history}' diff --git a/agentverse/tasks/sde_team/sde_team_2players_nolc/output_parser.py b/agentverse/tasks/sde_team/sde_team_2players_nolc/output_parser.py new file mode 100644 index 000000000..3e3243c6c --- /dev/null +++ b/agentverse/tasks/sde_team/sde_team_2players_nolc/output_parser.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +import re +from typing import Union + +from langchain.agents import AgentOutputParser + +# from langchain.schema import AgentAction, AgentFinish + +from agentverse.parser import OutputParserError, output_parser_registry, OutputParser +from agentverse.llms.base import LLMResult +from agentverse.utils import AgentAction, AgentFinish + + +@output_parser_registry.register("sde_team/sde_team_2players_nolc") +class SdeTeamGivenTestsParser(OutputParser): + def parse(self, agent, env, output: LLMResult) -> Union[AgentAction, AgentFinish]: + return AgentFinish({"output": output.content}, output.content) diff --git a/agentverse/tasks/sde_team/sde_team_2players_nolc/partial_config.yaml b/agentverse/tasks/sde_team/sde_team_2players_nolc/partial_config.yaml new file mode 100644 index 000000000..885afb3b4 --- /dev/null +++ b/agentverse/tasks/sde_team/sde_team_2players_nolc/partial_config.yaml @@ -0,0 +1,171 @@ +prompts: + prompt: &prompt |- + You are working in a programming team to solve a python code implementation problem. 
+ + ${role_description} + + ${chat_history} + + code_writer_role_prompt: &code_writer_role_prompt | + You are a professional coding assistant. You will be given a function signature and docstring (in ). You should copy the import statement, the function signature first and then finish the function body. Before writing any code, you should first think about the task and write down your thoughts in . Then you can write your code in . + + Your submitted code (in ) will be tested on a series of unit tests. You will be given the feedback (in ) of the test, together with the review of your code (in ) of a professional code reviewer. You can use these feedbacks to refine your code. The refinement will take multiple iterations. You can use the to record your thoughts during the refinement. + + The unit tests have been prepared. DO NOT generate unit tests! + + Here is the steps of the code writing and refinement process: + 1 - generate + 2 - generate + + The following is two examples on code writing and refinement. + + [Example on code writing]: + + : + def is_palindrome(x: int) -> bool: + """ Given an integer x, return True if x is a palindrome, and False otherwise. + An integer is a palindrome when it reads the same forward and backward. + >>> is_palindrome(121) + True + >>> is_palindrome(10) + False + """ + + : + I need to convert the integer to a string and then check if the string is a palindrome. + + : + def is_palindrome(x: int) -> bool: + s = str(x) + return s == s[::-1] + + [Example on refinement]: + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + """ Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. + You may assume that each input would have exactly one solution, and you may not use the same element twice. + The answer should be in an ascending order. 
+ >>> two_sum([2,7,11,15], 9) + [0, 1] + >>> two_sum([3,2,4], 6) + [1, 2] + """ + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + for i in range(len(nums)): + for j in range(len(nums)): + if nums[i] + nums[j] == target: + return [i, j] + + : + {"is_passing": false, "feedback": "Tested passed:\n\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\n\nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\n\nTests failed:\n\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0, 0]"} + + : + The previous code is not correct. It failed all the tests. The second for loop should start from i + 1. + + : + The code reviewer is right. I should start the second for loop from i + 1. + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + for i in range(len(nums)): + for j in range(i + 1, len(nums)): + if nums[i] + nums[j] == target: + return [i, j] + + [Start of new task]: + + Now please solve the following problem. DO NOT generate unit tests. + + code_reviewer_role_prompt: &code_reviewer_role_prompt | + You are a professional code reviewer. You will be given a function signature and docstring in . A code writer has submitted his completion in . + The code has been executed on a series of unit tests. The execution feedback is provided in . Your job is to write a code review in to help the code writer improve his code. + + The following is an examples. + + [Example]: + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + """ Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. + You may assume that each input would have exactly one solution, and you may not use the same element twice. + The answer should be in an ascending order. 
+ >>> two_sum([2,7,11,15], 9) + [0, 1] + >>> two_sum([3,2,4], 6) + [1, 2] + """ + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + for i in range(len(nums)): + for j in range(len(nums)): + if nums[i] + nums[j] == target: + return [i, j] + + : + {"is_passing": false, "feedback": "Tested passed:\n\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\n\nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\n\nTests failed:\n\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0, 0]"} + + : + The previous code is not correct. It failed all the tests. The second for loop should start from i + 1. + + [Start of new task]: + + Now please review the following submitted code. + +environment: + env_type: sde_team_given_tests + max_turns: 5 + task_name: HumanEval/0 + # experiment_name: 2player_temperature-0.0 + unit_tests: None + rule: + order: + type: sde_team_given_tests + visibility: + type: all + selector: + type: sde_team_given_tests + updater: + type: sde_team + describer: + type: basic + +agents: + - agent_type: conversation + name: code_writer + role_description: *code_writer_role_prompt + memory: + memory_type: sde_team + prompt_template: *prompt + verbose: true + receiver: [code_writer, code_reviewer] + llm: + llm_type: gpt-3.5-turbo + temperature: 0.1 + max_tokens: 1024 + + - agent_type: conversation + name: code_reviewer + role_description: *code_reviewer_role_prompt + memory: + memory_type: sde_team + prompt_template: *prompt + verbose: true + receiver: [code_writer] + llm: + llm_type: gpt-3.5-turbo + temperature: 0.1 + max_tokens: 1024 \ No newline at end of file diff --git a/agentverse/tasks/sde_team/sde_team_3players_nolc/config.yaml b/agentverse/tasks/sde_team/sde_team_3players_nolc/config.yaml new file mode 100644 index 000000000..fe4ff4369 --- /dev/null +++ b/agentverse/tasks/sde_team/sde_team_3players_nolc/config.yaml @@ -0,0 +1,252 @@ +prompts: + prompt: &prompt |- + You are working in a programming team to solve a python code 
implementation problem. + + ${role_description} + + ${chat_history} + + code_writer_role_prompt: &code_writer_role_prompt | + You are a professional coding assistant. You will be given a function signature and docstring (in ). You should copy the import statement, the function signature first and then finish the function body. Before writing any code, you should first think about the task and write down your thoughts in . Then you can write your code in . + + Your submitted code (in ) will be tested on a series of unit tests. You will be given the feedback (in ) of the test, together with the review of your code (in ) of a professional code reviewer. You can use these feedbacks to refine your code. The refinement will take multiple iterations. You can use the to record your thoughts during the refinement. + + Here is the steps of the code writing and refinement process: + 1 - generate + 2 - generate + + The following is two examples on code writing and refinement. + + [Example on code writing]: + + : + def is_palindrome(x: int) -> bool: + """ Given an integer x, return True if x is a palindrome, and False otherwise. + An integer is a palindrome when it reads the same forward and backward. + >>> is_palindrome(121) + True + >>> is_palindrome(10) + False + """ + + : + I need to convert the integer to a string and then check if the string is a palindrome. + + : + def is_palindrome(x: int) -> bool: + s = str(x) + return s == s[::-1] + + [Example on refinement]: + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + """ Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. + You may assume that each input would have exactly one solution, and you may not use the same element twice. + The answer should be in an ascending order. 
+ >>> two_sum([2,7,11,15], 9) + [0, 1] + >>> two_sum([3,2,4], 6) + [1, 2] + """ + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + for i in range(len(nums)): + for j in range(len(nums)): + if nums[i] + nums[j] == target: + return [i, j] + + : + {"is_passing": false, "feedback": "Tested passed:\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\n\nTests failed:\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0, 0]"} + + : + The previous code is not correct. It failed all the tests. The second for loop should start from i + 1. + + : + The code reviewer is right. I should start the second for loop from i + 1. + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + for i in range(len(nums)): + for j in range(i + 1, len(nums)): + if nums[i] + nums[j] == target: + return [i, j] + + [Start of new task]: + + Now please solve the following problem. + + : + def longestPalindrome(self, s: str) -> str: + """ Given a string s, return the longest palindromic substring in s. + >>> longestPalindrome("babcsd") + "bab" + >>> longestPalindrome("cbxxbd") + "bxxb" + """ + + + code_reviewer_role_prompt: &code_reviewer_role_prompt | + You are a professional code reviewer. You will be given a function signature and docstring in . A code writer has submitted his completion in . + The code has been executed on a series of unit tests. The execution feedback is provided in . Your job is to write a code review in to help the code writer improve his code. + + The following is an examples. + + [Example]: + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + """ Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. + You may assume that each input would have exactly one solution, and you may not use the same element twice. + The answer should be in an ascending order. 
+ >>> two_sum([2,7,11,15], 9) + [0, 1] + >>> two_sum([3,2,4], 6) + [1, 2] + """ + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + for i in range(len(nums)): + for j in range(len(nums)): + if nums[i] + nums[j] == target: + return [i, j] + + : + {"is_passing": false, "feedback": "Tested passed:\nassert two_sum([2, 7, 11, 15], 9) == [0, 1]\nassert two_sum([3, 5, 2, 7], 9) == [2, 3]\n\nTests failed:\nassert two_sum([3, 2, 4], 6) == [1, 2] # output: [0, 0]"} + + : + The previous code is not correct. It failed all the tests. The second for loop should start from i + 1. + + [Start of new task]: + + Now please review the following submitted code. + + : + def longestPalindrome(self, s: str) -> str: + """ Given a string s, return the longest palindromic substring in s. + >>> longestPalindrome("babcsd") + "bab" + >>> longestPalindrome("cbxxbd") + "bxxb" + """ + + + unit_test_generator_role_prompt: &unit_test_generator_role_prompt | + You are a code tester that produces unit test case. You will be given a function signature and docstring in . Your job is to reason and write ONE unit test case in to test the code. + + Here is the steps to write a unit test case. + 1 - compose a valid input in + 2 - given the , generate + 3 - generate the correct expected output + 4 - generate the assertion statement + + The following is an examples. + + [Example]: + + : + from typing import List + + def two_sum(nums: List[int], target: int) -> List[int]: + """ Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. + You may assume that each input would have exactly one solution, and you may not use the same element twice. + The answer should be in an ascending order. + >>> two_sum([2,7,11,15], 9) + [0, 1] + >>> two_sum([3,2,4], 6) + [1, 2] + """ + + : + [5, 7, 1, 9, 3], 14 + + : + The output should be a list of two indices that sum up to 14. 5 (index 0) + 9 (index 3) = 14. 
So the output should be [0, 3]. + + : + [0, 3] + + : + assert two_sum([5, 7, 1, 9, 3], 14) == [0, 3] + + [Start of new task]: + + Now please write a unit test case for the following problem. There are serveral guys doing the same task, you should try to compose a unique test case. + + : + def longestPalindrome(self, s: str) -> str: + """ Given a string s, return the longest palindromic substring in s. + >>> longestPalindrome("babcsd") + "bab" + >>> longestPalindrome("cbxxbd") + "bxxb" + """ + +# tools: &tools +# - tool_name: "execute_unit_tests" +# tool_url: "http://127.0.0.1:8079/tools/execute_unit_tests/" + +environment: + env_type: sde_team + max_turns: 5 + task_name: HumanEval/0 + rule: + order: + type: sde_team + visibility: + type: all + selector: + type: sde_team + updater: + type: sde_team + describer: + type: basic + +agents: + - agent_type: conversation + name: code_writer + role_description: *code_writer_role_prompt + memory: + memory_type: sde_team + prompt_template: *prompt + verbose: true + receiver: [code_writer, code_reviewer] + llm: + llm_type: gpt-3.5-turbo + temperature: 0.3 + max_tokens: 1024 + + - agent_type: conversation + name: code_reviewer + role_description: *code_reviewer_role_prompt + memory: + memory_type: sde_team + prompt_template: *prompt + verbose: true + receiver: [code_writer] + llm: + llm_type: gpt-3.5-turbo + temperature: 0.3 + max_tokens: 1024 + + - agent_type: conversation + name: unit_test_generator + role_description: *unit_test_generator_role_prompt + prompt_template: *prompt + verbose: true + llm: + llm_type: gpt-3.5-turbo + temperature: 1.0 + max_tokens: 1024 \ No newline at end of file diff --git a/agentverse/tasks/sde_team/sde_team_3players_nolc/output_parser.py b/agentverse/tasks/sde_team/sde_team_3players_nolc/output_parser.py new file mode 100644 index 000000000..53376d0d8 --- /dev/null +++ b/agentverse/tasks/sde_team/sde_team_3players_nolc/output_parser.py @@ -0,0 +1,18 @@ +from __future__ import annotations + 
+import re +from typing import Union + +from langchain.agents import AgentOutputParser + +# from langchain.schema import AgentAction, AgentFinish + +from agentverse.parser import OutputParserError, output_parser_registry, OutputParser +from agentverse.llms.base import LLMResult +from agentverse.utils import AgentAction, AgentFinish + + +@output_parser_registry.register("sde_team/sde_team_3players_nolc") +class SdeTeamParser(OutputParser): + def parse(self, agent, env, output: LLMResult) -> Union[AgentAction, AgentFinish]: + return AgentFinish({"output": output.content}, output.content) diff --git a/imgs/prison/police.png b/imgs/prison/police.png old mode 100755 new mode 100644 diff --git a/main.py b/main.py index 8464b0413..9f5786224 100644 --- a/main.py +++ b/main.py @@ -3,7 +3,7 @@ parser = ArgumentParser() # parser.add_argument("--task", type=str, default="nlp_classroom_9players") -parser.add_argument("--task", type=str, default="nlp_classroom_3players_nolc") +parser.add_argument("--task", type=str, default="nlp_classroom_3players") args = parser.parse_args() agentverse = AgentVerse.from_task(args.task)