From 13b93e10ecbb757e69130a69eb1b82fd6d04982d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ryan=20P=C3=A9goud?= Date: Tue, 27 Feb 2024 11:38:04 +0100 Subject: [PATCH 1/5] working multi car parallel wrapper (fixed circular import, dependency issues) --- Dockerfile | 41 ++++++ multi_car_racing | 1 + requirements.txt | 9 +- syllabus/core/__init__.py | 36 ++++-- syllabus/core/curriculum_sync_wrapper.py | 62 ++++++--- .../examples/experimental/multi_car_racing.py | 119 ++++++++++++++++++ 6 files changed, 233 insertions(+), 35 deletions(-) create mode 100644 Dockerfile create mode 160000 multi_car_racing create mode 100644 syllabus/examples/experimental/multi_car_racing.py diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..80dffacd --- /dev/null +++ b/Dockerfile @@ -0,0 +1,41 @@ +# Use the official Python image +FROM python:3.10-slim + +# Set the working directory inside the container +WORKDIR /usr/src/mylib + + +# Install any dependencies +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt && pip install tqdm + +# You can also install additional tools you might need +RUN apt-get update && apt-get install -y --fix-missing\ + libgeos-dev \ + gcc \ + g++ \ + swig \ + git \ + libgl1-mesa-dev \ + libglu1-mesa-dev \ + freeglut3-dev \ + xvfb \ + && rm -rf /var/lib/apt/lists/* + +# Clone the multi_car_racing repository and install it +RUN git clone https://github.com/igilitschenski/multi_car_racing.git \ + && cd multi_car_racing \ + && pip install -e . + +# Copy the Syllabus directory +COPY . . + +WORKDIR /usr/src/mylib/syllabus/examples/experimental +ENV PYTHONPATH="${PYTHONPATH}:/usr/src/mylib" + +CMD ["/bin/bash"] + +# docker build -t syllabus-image . +# docker run -it --rm -p 4000:4000 syllabus-image +# docker run -it --rm -p 4000:4000 -v ${PWD}:/usr/src/mylib syllabus-image +# xvfb-run -s "-screen 0 1400x900x24" python -u multi_car_racing.py \ No newline at end of file diff --git a/multi_car_racing b/multi_car_racing new file mode 160000 index 00000000..0131904e --- /dev/null +++ b/multi_car_racing @@ -0,0 +1 @@ +Subproject commit 0131904e16cb44892e8b2b1616f0c01c1b44cb7d diff --git a/requirements.txt b/requirements.txt index bdcae95d..fb4925c9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -#pettingzoo==1.19.0 -#supersuit==3.5.0 -gym==0.23.1 +pettingzoo==1.23.0 # upgraded from 1.9 to solve BaseParallelWrapper import (typo) +supersuit==3.5.0 +gym==0.24.1 numpy==1.23.3 wandb>=0.15.3 grpcio<=1.48.2 @@ -9,7 +9,8 @@ torch>=2.0.1 ray==2.2.0 tabulate==0.9.0 tensorflow_probability +gymnasium # documentation -pip install sphinx-tabs +# pip install sphinx-tabs diff --git a/syllabus/core/__init__.py b/syllabus/core/__init__.py index ac9a484b..e816e2e7 100644 --- a/syllabus/core/__init__.py +++ b/syllabus/core/__init__.py @@ -1,14 +1,28 @@ +# flake8: noqa # Environment Code -from .task_interface import TaskWrapper, SubclassTaskWrapper, ReinitTaskWrapper, TaskEnv, PettingZooTaskWrapper, PettingZooTaskEnv - -# Curriculum Code -from .utils import decorate_all_functions, UsageError, enumerate_axes from .curriculum_base import Curriculum -from .curriculum_sync_wrapper import (CurriculumWrapper, - MultiProcessingCurriculumWrapper, - RayCurriculumWrapper, - make_multiprocessing_curriculum, - make_ray_curriculum) - -from .environment_sync_wrapper import MultiProcessingSyncWrapper, RaySyncWrapper, PettingZooMultiProcessingSyncWrapper, PettingZooRaySyncWrapper +from .curriculum_sync_wrapper import ( + CurriculumWrapper, + 
MultiProcessingCurriculumWrapper, + RayCurriculumWrapper, + make_multiprocessing_curriculum, + make_ray_curriculum, +) +from .environment_sync_wrapper import ( + MultiProcessingSyncWrapper, + PettingZooMultiProcessingSyncWrapper, + PettingZooRaySyncWrapper, + RaySyncWrapper, +) from .multivariate_curriculum_wrapper import MultitaskWrapper +from .task_interface import ( + PettingZooTaskEnv, + PettingZooTaskWrapper, + ReinitTaskWrapper, + SubclassTaskWrapper, + TaskEnv, + TaskWrapper, +) + +# Curriculum Code +from .utils import UsageError, decorate_all_functions, enumerate_axes diff --git a/syllabus/core/curriculum_sync_wrapper.py b/syllabus/core/curriculum_sync_wrapper.py index f69034d8..ef1ec0fa 100644 --- a/syllabus/core/curriculum_sync_wrapper.py +++ b/syllabus/core/curriculum_sync_wrapper.py @@ -1,17 +1,19 @@ import threading +import time from functools import wraps from typing import List, Tuple import ray -import time from torch.multiprocessing import SimpleQueue -from syllabus.core import Curriculum, decorate_all_functions +from syllabus.core import Curriculum + +from .utils import decorate_all_functions # fixed circular import class CurriculumWrapper: - """Wrapper class for adding multiprocessing synchronization to a curriculum. - """ + """Wrapper class for adding multiprocessing synchronization to a curriculum.""" + def __init__(self, curriculum: Curriculum) -> None: self.curriculum = curriculum self.task_space = curriculum.task_space @@ -26,7 +28,7 @@ def count_tasks(self, task_space=None): @property def tasks(self): - return self.task_space.tasks + return self.task_space.tasks def get_tasks(self, task_space=None): return self.task_space.get_tasks(gym_space=task_space) @@ -57,13 +59,15 @@ def add_task(self, task): class MultiProcessingCurriculumWrapper(CurriculumWrapper): - """Wrapper which sends tasks and receives updates from environments wrapped in a corresponding MultiprocessingSyncWrapper. - """ - def __init__(self, - curriculum: Curriculum, - task_queue: SimpleQueue, - update_queue: SimpleQueue, - sequential_start: bool = True): + """Wrapper which sends tasks and receives updates from environments wrapped in a corresponding MultiprocessingSyncWrapper.""" + + def __init__( + self, + curriculum: Curriculum, + task_queue: SimpleQueue, + update_queue: SimpleQueue, + sequential_start: bool = True, + ): super().__init__(curriculum) self.task_queue = task_queue self.update_queue = update_queue @@ -78,7 +82,9 @@ def start(self): """ Start the thread that reads the complete_queue and reads the task_queue. 
""" - self.update_thread = threading.Thread(name='update', target=self._update_queues, daemon=True) + self.update_thread = threading.Thread( + name="update", target=self._update_queues, daemon=True + ) self.should_update = True self.update_thread.start() @@ -106,18 +112,27 @@ def _update_queues(self): requested_tasks += 1 # Decode task and task progress if update["update_type"] == "task_progress": - update["metrics"] = (self.task_space.decode(update["metrics"][0]), update["metrics"][1]) + update["metrics"] = ( + self.task_space.decode(update["metrics"][0]), + update["metrics"][1], + ) self.batch_update(batch_updates) # Sample new tasks if requested_tasks > 0: # TODO: Move this to curriculum, not sync wrapper # Sequentially sample task_space before using curriculum method - if (self.sequential_start and - self.task_space.num_tasks is not None and - self.num_assigned_tasks + requested_tasks < self.task_space.num_tasks): + if ( + self.sequential_start + and self.task_space.num_tasks is not None + and self.num_assigned_tasks + requested_tasks + < self.task_space.num_tasks + ): # Sample unseen tasks sequentially before using curriculum method - new_tasks = self.task_space.list_tasks()[self.num_assigned_tasks:self.num_assigned_tasks + requested_tasks] + new_tasks = self.task_space.list_tasks()[ + self.num_assigned_tasks : self.num_assigned_tasks + + requested_tasks + ] else: new_tasks = self.curriculum.sample(k=requested_tasks) for i, task in enumerate(new_tasks): @@ -149,6 +164,7 @@ def remote_call(func): Note that this causes functions to block, and should be only used for operations that do not require parallelization. """ + @wraps(func) def wrapper(self, *args, **kw): f_name = func.__name__ @@ -159,6 +175,7 @@ def wrapper(self, *args, **kw): if child_func == parent_func: curriculum_func = getattr(self.curriculum, f_name) return ray.get(curriculum_func.remote(*args, **kw)) + return wrapper @@ -168,7 +185,9 @@ def make_multiprocessing_curriculum(curriculum, **kwargs): """ task_queue = SimpleQueue() update_queue = SimpleQueue() - mp_curriculum = MultiProcessingCurriculumWrapper(curriculum, task_queue, update_queue, **kwargs) + mp_curriculum = MultiProcessingCurriculumWrapper( + curriculum, task_queue, update_queue, **kwargs + ) mp_curriculum.start() return mp_curriculum, task_queue, update_queue @@ -190,6 +209,7 @@ class RayCurriculumWrapper(CurriculumWrapper): for convenience. 
# TODO: Implement the Curriculum methods explicitly """ + def __init__(self, curriculum, actor_name="curriculum") -> None: super().__init__(curriculum) self.curriculum = RayWrapper.options(name=actor_name).remote(curriculum) @@ -202,7 +222,9 @@ def __init__(self, curriculum, actor_name="curriculum") -> None: def sample(self, k: int = 1): return ray.get(self.curriculum.sample.remote(k=k)) - def update_on_step_batch(self, step_results: List[Tuple[int, int, int, int]]) -> None: + def update_on_step_batch( + self, step_results: List[Tuple[int, int, int, int]] + ) -> None: ray.get(self.curriculum._on_step_batch.remote(step_results)) def add_task(self, task): diff --git a/syllabus/examples/experimental/multi_car_racing.py b/syllabus/examples/experimental/multi_car_racing.py new file mode 100644 index 00000000..609589cb --- /dev/null +++ b/syllabus/examples/experimental/multi_car_racing.py @@ -0,0 +1,119 @@ +# flake8: noqa: F401 +from typing import TypeVar + +import gym +import gym_multi_car_racing +import numpy as np +from gym.spaces import Box +from tqdm.auto import tqdm + +from syllabus.core import TaskWrapper + +ObsType = TypeVar("ObsType") +ActionType = TypeVar("ActionType") +AgentID = TypeVar("AgentID") + + +class MultiCarRacingParallelWrapper(TaskWrapper): + """ + Wrapper ensuring compatibility with the PettingZoo Parallel API. + """ + + def __init__(self, n_agents, *args, **kwargs): + super().__init__(*args, **kwargs) + self.n_agents = n_agents + + def _actions_pz_to_np(self, action: dict[AgentID, ActionType]) -> np.ndarray: + """ + Converts actions defined in PZ format to a numpy array. + """ + assert action.__len__() == self.n_agents + + action = np.array(list(action.values())) + assert action.shape == (self.n_agents, 3) + return action + + def reset(self): + pass + + def step(self, action: dict[AgentID, ActionType]) -> tuple[ + dict[AgentID, ObsType], + dict[AgentID, float], + dict[AgentID, bool], + dict[AgentID, bool], + dict[AgentID, dict], + ]: + """ + Takes inputs in the PettingZoo (PZ) Parallel API format, performs a step and + returns outputs in PZ format. + """ + + def _np_array_to_pz_dict(array: np.ndarray): + """ + Converts an n-dimensional numpy array to a dictionary + with n keys and values. + """ + out = {} + for idx, i in enumerate(array): + out[idx] = i + return out + + def _singleton_to_pz_dict(value: bool): + """ + Converts a boolean flag to a dictionary with ``n_agents`` keys and values. + """ + return {idx: value for idx in range(self.n_agents)} + + # convert action to numpy format to perform the env step + np_action = self._actions_pz_to_np(action) + obs, rew, term, info = self.env.step(np_action) + trunc = 0 # there is no `truncated` flag in this environment + self.task_completion = self._task_completion(obs, rew, term, trunc, info) + info["task_completion"] = self.task_completion + # convert outputs back to PZ format + obs, rew = tuple(map(_np_array_to_pz_dict, [obs, rew])) + # TODO: are boolean flags replicated `n_agent` times in PZ format ? 
+ term, trunc = tuple(map(_singleton_to_pz_dict, [term, trunc])) + + return self.observation(obs), rew, term, trunc, info + + +if __name__ == "__main__": + n_agents = 2 + env = gym.make( + "MultiCarRacing-v0", + num_agents=n_agents, + direction="CCW", + use_random_direction=True, + backwards_flag=True, + h_ratio=0.25, + use_ego_color=False, + ) + + obs = env.reset() + env = MultiCarRacingParallelWrapper(env=env, n_agents=n_agents) + done = {i: False for i in range(n_agents)} + total_reward = {i: 0 for i in range(n_agents)} + np.random.seed(0) + + # while not all(done.values()): + for steps in tqdm(range(100)): + # The actions have to be of the format (num_agents,3) + # The action format for each car is as in the CarRacing-v0 environment + # i.e. (`Box([-1. 0. 0.], 1.0, (3,), float32)`) + action = np.random.normal(0, 1, (2, 3)) + + assert action.shape == (n_agents, 3) + pz_action = {i: action[i] for i in range(n_agents)} + # Similarly, the structure of this is the same as in CarRacing-v0 with an + # additional dimension for the different agents, i.e. + # obs is of shape (num_agents, 96, 96, 3) + # reward is of shape (num_agents,) + # done is a bool and info is not used (an empty dict). + obs, reward, done, trunc, info = env.step(pz_action) + for agent in range(n_agents): + total_reward[agent] += reward[agent] + env.render() + + print("individual scores:", total_reward) + print(reward, done, trunc, info) From e915118a58e82614c2a8d7bbfdc9c991a6482e29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ryan=20P=C3=A9goud?= Date: Wed, 28 Feb 2024 08:42:16 +0100 Subject: [PATCH 2/5] added wrapped reset function --- .../examples/experimental/multi_car_racing.py | 96 ++++++++++--------- 1 file changed, 53 insertions(+), 43 deletions(-) diff --git a/syllabus/examples/experimental/multi_car_racing.py b/syllabus/examples/experimental/multi_car_racing.py index 609589cb..84b3bc4a 100644 --- a/syllabus/examples/experimental/multi_car_racing.py +++ b/syllabus/examples/experimental/multi_car_racing.py @@ -4,10 +4,12 @@ import gym import gym_multi_car_racing import numpy as np -from gym.spaces import Box +from gymnasium import spaces from tqdm.auto import tqdm -from syllabus.core import TaskWrapper +from syllabus.core import TaskWrapper, make_multiprocessing_curriculum +from syllabus.curricula import DomainRandomization +from syllabus.task_space import TaskSpace ObsType = TypeVar("ObsType") ActionType = TypeVar("ActionType") @@ -17,6 +19,12 @@ class MultiCarRacingParallelWrapper(TaskWrapper): """ Wrapper ensuring compatibility with the PettingZoo Parallel API. + + Car Racing Environment: + * Action shape: ``n_agents`` * `Box([-1. 0. 0.], 1.0, (3,), float32)` + * Observation shape: ``n_agents`` * `Box(0, 255, (96, 96, 3), uint8)` + * Done: ``done`` is a single boolean value + * Info: ``info`` is unused and represented as an empty dictionary """ def __init__(self, n_agents, *args, **kwargs): @@ -33,8 +41,31 @@ def _actions_pz_to_np(self, action: dict[AgentID, ActionType]) -> np.ndarray: assert action.shape == (self.n_agents, 3) return action - def reset(self): - pass + def _np_array_to_pz_dict(self, array: np.ndarray) -> dict[int : np.ndarray]: + """ + Returns a dictionary containing individual observations for each agent. + """ + out = {} + for idx, i in enumerate(array): + out[idx] = i + return out + + def _singleton_to_pz_dict(self, value: bool) -> dict[int:bool]: + """ + Broadcasts the `done` and `trunc` flags to dictionaries keyed by agent id. 
+ """ + return {idx: value for idx in range(self.n_agents)} + + def reset(self) -> tuple[dict[AgentID, ObsType], dict[AgentID, dict]]: + """ + Resets the environment and returns a dictionary of observations + keyed by agent ID. + """ + # TODO: what is the second output (dict[AgentID, dict]])? + obs = self.env.reset() + pz_obs = self._np_array_to_pz_dict(obs) + + return pz_obs def step(self, action: dict[AgentID, ActionType]) -> tuple[ dict[AgentID, ObsType], @@ -47,33 +78,16 @@ def step(self, action: dict[AgentID, ActionType]) -> tuple[ Takes inputs in the PettingZoo (PZ) Parallel API format, performs a step and returns outputs in PZ format. """ - - def _np_array_to_pz_dict(array: np.ndarray): - """ - Converts an n-dimensional numpy array to a dictionary - with n keys and values. - """ - out = {} - for idx, i in enumerate(array): - out[idx] = i - return out - - def _singleton_to_pz_dict(value: bool): - """ - Converts a boolean flag to a dictionary with ``n_agents`` keys and values. - """ - return {idx: value for idx in range(self.n_agents)} - # convert action to numpy format to perform the env step np_action = self._actions_pz_to_np(action) obs, rew, term, info = self.env.step(np_action) trunc = 0 # there is no `truncated` flag in this environment self.task_completion = self._task_completion(obs, rew, term, trunc, info) - info["task_completion"] = self.task_completion # convert outputs back to PZ format - obs, rew = tuple(map(_np_array_to_pz_dict, [obs, rew])) - # TODO: are boolean flags replicated `n_agent` times in PZ format ? - term, trunc = tuple(map(_singleton_to_pz_dict, [term, trunc])) + obs, rew = tuple(map(self._np_array_to_pz_dict, [obs, rew])) + term, trunc, info = tuple( + map(self._singleton_to_pz_dict, [term, trunc, self.task_completion]) + ) return self.observation(obs), rew, term, trunc, info @@ -90,30 +104,26 @@ def _singleton_to_pz_dict(value: bool): use_ego_color=False, ) - obs = env.reset() env = MultiCarRacingParallelWrapper(env=env, n_agents=n_agents) + # curriculum = DomainRandomization(env) + # curriculum, task_queue, update_queue = make_multiprocessing_curriculum( + # curriculum, + # ) + done = {i: False for i in range(n_agents)} total_reward = {i: 0 for i in range(n_agents)} np.random.seed(0) # while not all(done.values()): - for steps in tqdm(range(100)): - # The actions have to be of the format (num_agents,3) - # The action format for each car is as in the CarRacing-v0 environment - # i.e. (`Box([-1. 0. 0.], 1.0, (3,), float32)`) - action = np.random.normal(0, 1, (2, 3)) - - assert action.shape == (n_agents, 3) - pz_action = {i: action[i] for i in range(n_agents)} - # Similarly, the structure of this is the same as in CarRacing-v0 with an - # additional dimension for the different agents, i.e. - # obs is of shape (num_agents, 96, 96, 3) - # reward is of shape (num_agents,) - # done is a bool and info is not used (an empty dict). 
- obs, reward, done, trunc, info = env.step(pz_action) - for agent in range(n_agents): - total_reward[agent] += reward[agent] - env.render() + for episodes in tqdm(range(5)): # testing with 5 truncated episodes + obs = env.reset() + for steps in range(100): + action = np.random.normal(0, 1, (2, 3)) + pz_action = {i: action[i] for i in range(n_agents)} + obs, reward, done, trunc, info = env.step(pz_action) + for agent in range(n_agents): + total_reward[agent] += reward[agent] + env.render() print("individual scores:", total_reward) print(reward, done, trunc, info) From 82130d1d030de11fb40e9388f76da86d6fc2fa82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ryan=20P=C3=A9goud?= Date: Wed, 28 Feb 2024 10:08:59 +0100 Subject: [PATCH 3/5] added domain randomization wrapper, started implementing continuous ppo --- .../examples/experimental/multi_car_racing.py | 115 +++++++++++++++++- 1 file changed, 111 insertions(+), 4 deletions(-) diff --git a/syllabus/examples/experimental/multi_car_racing.py b/syllabus/examples/experimental/multi_car_racing.py index 84b3bc4a..c44c185f 100644 --- a/syllabus/examples/experimental/multi_car_racing.py +++ b/syllabus/examples/experimental/multi_car_racing.py @@ -4,7 +4,11 @@ import gym import gym_multi_car_racing import numpy as np +import torch +import torch.nn as nn +import torch.optim as optim from gymnasium import spaces +from torch.distributions.normal import Normal from tqdm.auto import tqdm from syllabus.core import TaskWrapper, make_multiprocessing_curriculum @@ -16,6 +20,83 @@ AgentID = TypeVar("AgentID") +def layer_init(layer, std=np.sqrt(2), bias_const=0.0): + torch.nn.init.orthogonal_(layer.weight, std) + torch.nn.init.constant_(layer.bias, bias_const) + return layer + + +class Agent(nn.Module): + def __init__(self, envs): + super().__init__() + self.critic = nn.Sequential( + layer_init( + nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64) + ), + nn.Tanh(), + layer_init(nn.Linear(64, 64)), + nn.Tanh(), + layer_init(nn.Linear(64, 1), std=1.0), + ) + self.actor_mean = nn.Sequential( + layer_init( + nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64) + ), + nn.Tanh(), + layer_init(nn.Linear(64, 64)), + nn.Tanh(), + layer_init( + nn.Linear(64, np.prod(envs.single_action_space.shape)), std=0.01 + ), + ) + self.actor_logstd = nn.Parameter( + torch.zeros(1, np.prod(envs.single_action_space.shape)) + ) + + def get_value(self, x): + return self.critic(x) + + def get_action_and_value(self, x, action=None): + action_mean = self.actor_mean(x) + action_logstd = self.actor_logstd.expand_as(action_mean) + action_std = torch.exp(action_logstd) + probs = Normal(action_mean, action_std) + if action is None: + action = probs.sample() + return ( + action, + probs.log_prob(action).sum(1), + probs.entropy().sum(1), + self.critic(x), + ) + + +def batchify_obs(obs, device): + """Converts PZ style observations to batch of torch arrays.""" + obs = np.stack([obs[a] for a in obs], axis=0) + # transpose to be (batch, channel, height, width) + obs = obs.transpose(0, -1, 1, 2) + obs = torch.tensor(obs).to(device) + + return obs + + +def batchify(x, device): + """Converts PZ style returns to batch of torch arrays.""" + x = np.stack([x[a] for a in x], axis=0) + x = torch.tensor(x).to(device) + + return x + + +def unbatchify(x, env): + """Converts np array to PZ style arguments.""" + x = x.cpu().numpy() + x = {a: x[i] for i, a in enumerate(env.possible_agents)} + + return x + + class MultiCarRacingParallelWrapper(TaskWrapper): """ Wrapper ensuring 
compatibility with the PettingZoo Parallel API. @@ -30,6 +111,16 @@ class MultiCarRacingParallelWrapper(TaskWrapper): def __init__(self, n_agents, *args, **kwargs): super().__init__(*args, **kwargs) self.n_agents = n_agents + self.task = None + self.episode_return = 0 + self.task_space = TaskSpace( + spaces.Box( + low=np.array([-1.0, 0.0, 0.0]), + high=np.array([1.0, 1.0, 1.0]), + shape=(3,), + dtype=np.float32, + ) + ) def _actions_pz_to_np(self, action: dict[AgentID, ActionType]) -> np.ndarray: """ @@ -93,6 +184,18 @@ def step(self, action: dict[AgentID, ActionType]) -> tuple[ if __name__ == "__main__": + """ALGO PARAMS""" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + ent_coef = 0.1 + vf_coef = 0.1 + clip_coef = 0.1 + gamma = 0.99 + batch_size = 32 + stack_size = 4 + frame_size = (96, 96) + max_cycles = 125 + total_episodes = 100 + n_agents = 2 env = gym.make( "MultiCarRacing-v0", @@ -104,11 +207,15 @@ def step(self, action: dict[AgentID, ActionType]) -> tuple[ use_ego_color=False, ) + """ CURRICULUM SETUP """ env = MultiCarRacingParallelWrapper(env=env, n_agents=n_agents) - # curriculum = DomainRandomization(env) - # curriculum, task_queue, update_queue = make_multiprocessing_curriculum( - # curriculum, - # ) + curriculum = DomainRandomization(env.task_space) + curriculum, task_queue, update_queue = make_multiprocessing_curriculum(curriculum) + + # TODO: clarify how to setup continuous PPO + # """ LEARNER SETUP """ + # agent = Agent(envs=envs).to(device) + # optimizer = optim.Adam(agent.parameters(), lr=0.001, eps=1e-5) done = {i: False for i in range(n_agents)} total_reward = {i: 0 for i in range(n_agents)} From a00a1436025d78d81999e5fe9c291c7c5b3decff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ryan=20P=C3=A9goud?= Date: Thu, 29 Feb 2024 17:42:33 +0100 Subject: [PATCH 4/5] Added continuous PPO training loop (one bug remaining, see TODOs) --- requirements.txt | 4 +- .../examples/experimental/multi_car_racing.py | 235 +++++++++++++++--- 2 files changed, 208 insertions(+), 31 deletions(-) diff --git a/requirements.txt b/requirements.txt index fb4925c9..8864feee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ pettingzoo==1.23.0 # upgraded from 1.9 to solve BaseParallelWrapper import (typo) -supersuit==3.5.0 +# upgraded from 3.5.0 to 3.7.2 (as pettingZoo < 1.23 is a dependency, +# the typo error is also present here) +supersuit==3.7.2 gym==0.24.1 numpy==1.23.3 wandb>=0.15.3 diff --git a/syllabus/examples/experimental/multi_car_racing.py b/syllabus/examples/experimental/multi_car_racing.py index c44c185f..3dc83450 100644 --- a/syllabus/examples/experimental/multi_car_racing.py +++ b/syllabus/examples/experimental/multi_car_racing.py @@ -8,6 +8,7 @@ import torch.nn as nn import torch.optim as optim from gymnasium import spaces +from supersuit import color_reduction_v0, frame_stack_v1, resize_v1 from torch.distributions.normal import Normal from tqdm.auto import tqdm @@ -27,36 +28,33 @@ def layer_init(layer, std=np.sqrt(2), bias_const=0.0): class Agent(nn.Module): - def __init__(self, envs): + def __init__(self): super().__init__() + self.observation_shape = (3, 96, 96) + self.action_shape = (3,) + self.critic = nn.Sequential( - layer_init( - nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64) - ), + layer_init(nn.Linear(np.array(self.observation_shape).prod(), 64)), nn.Tanh(), layer_init(nn.Linear(64, 64)), nn.Tanh(), layer_init(nn.Linear(64, 1), std=1.0), ) self.actor_mean = nn.Sequential( - layer_init( - 
nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64) - ), + layer_init(nn.Linear(np.array(self.observation_shape).prod(), 64)), nn.Tanh(), layer_init(nn.Linear(64, 64)), nn.Tanh(), - layer_init( - nn.Linear(64, np.prod(envs.single_action_space.shape)), std=0.01 - ), - ) - self.actor_logstd = nn.Parameter( - torch.zeros(1, np.prod(envs.single_action_space.shape)) + layer_init(nn.Linear(64, np.prod(self.action_shape)), std=0.01), ) + self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(self.action_shape))) def get_value(self, x): return self.critic(x) def get_action_and_value(self, x, action=None): + x = x.float().reshape(x.size(0), -1) + action_mean = self.actor_mean(x) action_logstd = self.actor_logstd.expand_as(action_mean) action_std = torch.exp(action_logstd) @@ -121,6 +119,9 @@ def __init__(self, n_agents, *args, **kwargs): dtype=np.float32, ) ) + self.possible_agents = np.arange( + self.n_agents + ) # TODO: is this the intended use? def _actions_pz_to_np(self, action: dict[AgentID, ActionType]) -> np.ndarray: """ @@ -191,11 +192,16 @@ def step(self, action: dict[AgentID, ActionType]) -> tuple[ clip_coef = 0.1 gamma = 0.99 batch_size = 32 - stack_size = 4 + stack_size = 3 frame_size = (96, 96) + action_size = 3 max_cycles = 125 total_episodes = 100 + # PLR Params + num_steps = 128 + + """ ENV SETUP """ n_agents = 2 env = gym.make( "MultiCarRacing-v0", @@ -212,25 +218,194 @@ def step(self, action: dict[AgentID, ActionType]) -> tuple[ curriculum = DomainRandomization(env.task_space) curriculum, task_queue, update_queue = make_multiprocessing_curriculum(curriculum) - # TODO: clarify how to setup continuous PPO - # """ LEARNER SETUP """ - # agent = Agent(envs=envs).to(device) - # optimizer = optim.Adam(agent.parameters(), lr=0.001, eps=1e-5) + """ LEARNER SETUP """ + agent = Agent().to(device) + optimizer = optim.Adam(agent.parameters(), lr=0.001, eps=1e-5) + + """ ALGO LOGIC: EPISODE STORAGE""" + end_step = 0 + total_episodic_return = 0 + rb_obs = torch.zeros((max_cycles, n_agents, stack_size, *frame_size)).to(device) + rb_actions = torch.zeros((max_cycles, n_agents, action_size)).to(device) + rb_logprobs = torch.zeros((max_cycles, n_agents)).to(device) + rb_rewards = torch.zeros((max_cycles, n_agents)).to(device) + rb_dones = torch.zeros((max_cycles, n_agents)).to(device) + rb_values = torch.zeros((max_cycles, n_agents)).to(device) done = {i: False for i in range(n_agents)} total_reward = {i: 0 for i in range(n_agents)} np.random.seed(0) + """ TRAINING LOGIC """ + # train for n number of episodes + global_cycles = 0 + for episode in tqdm(range(total_episodes)): + # collect an episode + with torch.no_grad(): + # collect observations and convert to batch of torch tensors + next_obs = env.reset() + # reset the episodic return + total_episodic_return = 0 + + # each episode has num_steps + for step in range(0, max_cycles): + global_cycles += 1 + # rollover the observation + obs = batchify_obs(next_obs, device) + # get action from the agent + actions, logprobs, entropy, values = agent.get_action_and_value(obs) + + # execute the environment and log data + next_obs, rewards, trunc, dones, infos = env.step( + unbatchify(actions, env) + ) + + # add to episode storage + rb_obs[step] = obs + rb_rewards[step] = batchify(rewards, device) + rb_dones[step] = batchify(dones, device) + rb_actions[step] = actions + rb_logprobs[step] = logprobs + rb_values[step] = values.flatten() + + # compute episodic return + total_episodic_return += rb_rewards[step].cpu().numpy() + + # Update 
curriculum + # TODO: adapt to DR + if global_cycles % num_steps == 0: + update = { + "update_type": "on_demand", + "metrics": { + "action_log_dist": logprobs, + "value": values, + "next_value": ( + agent.get_value(next_obs) + if step == num_steps - 1 + else None + ), + "rew": rb_rewards[step], + "masks": torch.Tensor(1 - np.array(list(dones.values()))), + "tasks": [env.unwrapped.task], + }, + } + curriculum.update_curriculum(update) + + # if we reach the end of the episode + if any([dones[a] for a in dones]): + print(f"Breaking early at step {step} due to done signal.") + end_step = step + break + + # bootstrap value if not done + with torch.no_grad(): + rb_advantages = torch.zeros_like(rb_rewards).to(device) + for t in reversed(range(end_step)): + delta = ( + rb_rewards[t] + + gamma * rb_values[t + 1] * rb_dones[t + 1] + - rb_values[t] + ) + rb_advantages[t] = delta + gamma * gamma * rb_advantages[t + 1] + rb_returns = rb_advantages + rb_values + + # TODO: at this step, `end_step` is still 0, causing b_obs to be empty + # and the `repeat` loop not running as range(0, len(b_obs), batch_size) = 0 + # maybe set end_step to the current number of steps in the episode even if there + # is no positive done flag ? + + # convert our episodes to batch of individual transitions + b_obs = torch.flatten(rb_obs[:end_step], start_dim=0, end_dim=1) + b_logprobs = torch.flatten(rb_logprobs[:end_step], start_dim=0, end_dim=1) + b_actions = torch.flatten(rb_actions[:end_step], start_dim=0, end_dim=1) + b_returns = torch.flatten(rb_returns[:end_step], start_dim=0, end_dim=1) + b_values = torch.flatten(rb_values[:end_step], start_dim=0, end_dim=1) + b_advantages = torch.flatten(rb_advantages[:end_step], start_dim=0, end_dim=1) + + # Optimizing the policy and value network + b_index = np.arange(len(b_obs)) + clip_fracs = [] + for repeat in range(3): + # shuffle the indices we use to access the data + np.random.shuffle(b_index) + for start in range(0, len(b_obs), batch_size): + print(start) + # select the indices we want to train on + end = start + batch_size + batch_index = b_index[start:end] + + _, newlogprob, entropy, value = agent.get_action_and_value( + b_obs[batch_index], b_actions.long()[batch_index] + ) + logratio = newlogprob - b_logprobs[batch_index] + ratio = logratio.exp() + + with torch.no_grad(): + # calculate approx_kl http://joschu.net/blog/kl-approx.html + old_approx_kl = (-logratio).mean() + approx_kl = ((ratio - 1) - logratio).mean() + clip_fracs += [ + ((ratio - 1.0).abs() > clip_coef).float().mean().item() + ] + + # normalize advantaegs + advantages = b_advantages[batch_index] + advantages = (advantages - advantages.mean()) / ( + advantages.std() + 1e-8 + ) + + # Policy loss + pg_loss1 = -b_advantages[batch_index] * ratio + pg_loss2 = -b_advantages[batch_index] * torch.clamp( + ratio, 1 - clip_coef, 1 + clip_coef + ) + pg_loss = torch.max(pg_loss1, pg_loss2).mean() + + # Value loss + value = value.flatten() + v_loss_unclipped = (value - b_returns[batch_index]) ** 2 + v_clipped = b_values[batch_index] + torch.clamp( + value - b_values[batch_index], + -clip_coef, + clip_coef, + ) + v_loss_clipped = (v_clipped - b_returns[batch_index]) ** 2 + v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) + v_loss = 0.5 * v_loss_max.mean() + entropy_loss = entropy.mean() + loss = pg_loss - ent_coef * entropy_loss + v_loss * vf_coef + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy() + var_y = np.var(y_true) + 
explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y + + print(f"Training episode {episode}") + print(f"Episodic Return: {np.mean(total_episodic_return)}") + print(f"Episode Length: {end_step}") + print("") + print(f"Value Loss: {v_loss.item()}") + print(f"Policy Loss: {pg_loss.item()}") + print(f"Old Approx KL: {old_approx_kl.item()}") + print(f"Approx KL: {approx_kl.item()}") + print(f"Clip Fraction: {np.mean(clip_fracs)}") + print(f"Explained Variance: {explained_var.item()}") + print("\n-------------------------------------------\n") + + # ----- Dummy test loop ----- # while not all(done.values()): - for episodes in tqdm(range(5)): # testing with 5 truncated episodes - obs = env.reset() - for steps in range(100): - action = np.random.normal(0, 1, (2, 3)) - pz_action = {i: action[i] for i in range(n_agents)} - obs, reward, done, trunc, info = env.step(pz_action) - for agent in range(n_agents): - total_reward[agent] += reward[agent] - env.render() - - print("individual scores:", total_reward) - print(reward, done, trunc, info) + # for episodes in tqdm(range(1)): # testing with 5 truncated episodes + # obs = env.reset() + # for steps in range(100): + # action = np.random.normal(0, 1, (2, 3)) + # pz_action = {i: action[i] for i in range(n_agents)} + # obs, reward, done, trunc, info = env.step(pz_action) + # for agent in range(n_agents): + # total_reward[agent] += reward[agent] + # env.render() + + # print("individual scores:", total_reward) + # print(reward, done, trunc, info) From 5b88b0cec7ebc9937aa0442b3313a5ac8b78d49d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ryan=20P=C3=A9goud?= Date: Fri, 22 Mar 2024 15:39:47 +0100 Subject: [PATCH 5/5] . --- Dockerfile | 3 +- .../experimental/bezier_car_racing.py | 999 ++++++++++++++++++ .../examples/experimental/multi_car_racing.py | 27 +- .../experimental/multi_car_racing_v2.py | 432 ++++++++ 4 files changed, 1439 insertions(+), 22 deletions(-) create mode 100644 syllabus/examples/experimental/bezier_car_racing.py create mode 100644 syllabus/examples/experimental/multi_car_racing_v2.py diff --git a/Dockerfile b/Dockerfile index 80dffacd..e53b7f0d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,4 +38,5 @@ CMD ["/bin/bash"] # docker build -t syllabus-image . # docker run -it --rm -p 4000:4000 syllabus-image # docker run -it --rm -p 4000:4000 -v ${PWD}:/usr/src/mylib syllabus-image -# xvfb-run -s "-screen 0 1400x900x24" python -u multi_car_racing.py \ No newline at end of file +# xvfb-run -s "-screen 0 1400x900x24" python -u multi_car_racing.py +# xvfb-run -s "-screen 0 1400x900x24" python -u multi_car_racing_v2.py \ No newline at end of file diff --git a/syllabus/examples/experimental/bezier_car_racing.py b/syllabus/examples/experimental/bezier_car_racing.py new file mode 100644 index 00000000..3be4ec44 --- /dev/null +++ b/syllabus/examples/experimental/bezier_car_racing.py @@ -0,0 +1,999 @@ +# Copyright (c) OpenAI +# +# Licensed under the MIT License; +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# This file is an extended version of +# https://github.com/openai/gym/blob/master/gym/envs/box2d/car_racing.py + +import argparse +import math + +import Box2D +import gym +import numpy as np +import pyglet +from Box2D.b2 import contactListener, fixtureDef, polygonShape +from envs.registration import register as gym_register +from gym import spaces +from gym.envs.box2d.car_dynamics import Car +from gym.utils import EzPickle, seeding +from util import geo_complexity + +pyglet.options["debug_gl"] = False +from pyglet import gl + +from . import bezier, racetracks + +STATE_W = 96 +STATE_H = 96 +VIDEO_W = 600 +VIDEO_H = 400 +WINDOW_W = 1000 +WINDOW_H = 800 + +SCALE = 6.0 # Track scale +TRACK_RAD = 900 / SCALE # Track is heavily morphed circle with this radius +PLAYFIELD = 2000 / SCALE # Game over boundary +FPS = 50 # Frames per second +ZOOM = 2.7 # Camera zoom +ZOOM_FOLLOW = True # Set to False for fixed view (don't use zoom) + + +TRACK_DETAIL_STEP = 21 / SCALE +TRACK_TURN_RATE = 0.31 +TRACK_WIDTH = 40 / SCALE +BORDER = 8 / SCALE +BORDER_MIN_COUNT = 4 + +ROAD_COLOR = [0.4, 0.4, 0.4] + + +class FrictionDetector(contactListener): + def __init__(self, env): + contactListener.__init__(self) + self.env = env + + def BeginContact(self, contact): + self._contact(contact, True) + + def EndContact(self, contact): + self._contact(contact, False) + + def _contact(self, contact, begin): + tile = None + obj = None + u1 = contact.fixtureA.body.userData + u2 = contact.fixtureB.body.userData + index = -1 + if u1 and "tile" in u1: + if "road_friction" in u1["tile"].__dict__: + tile = u1["tile"] + index = u1["index"] + obj = u2 + if u2 and "tile" in u2: + if "road_friction" in u2["tile"].__dict__: + tile = u2["tile"] + index = u2["index"] + obj = u1 + if not tile: + return + + tile.color[0] = ROAD_COLOR[0] + tile.color[1] = ROAD_COLOR[1] + tile.color[2] = ROAD_COLOR[2] + + if not obj or "tiles" not in obj.__dict__: + return + if begin: + obj.tiles.add(tile) + if not tile.road_visited: + tile.road_visited = True + self.env.reward += 1000.0 / len(self.env.track) + self.env.tile_visited_count += 1 + + if self.env.sparse_rewards and index >= 0: + self._eval_tile_index(index) + else: + obj.tiles.remove(tile) + + def _eval_tile_index(self, index): + goal_bin = self.env.goal_bin + track_len = len(self.env.track) + goal_step = track_len / (self.env.num_goal_bins) + + MIN_DISTANCE_TO_GO = 10 + distance = track_len - index + tile_bin = np.floor(distance / goal_step) + + # print('in tile bin, index', tile_bin, index, flush=True) + if goal_bin == 0 and distance < MIN_DISTANCE_TO_GO: + self.env.goal_reached = False + elif goal_bin == self.env.num_goal_bins - 1 and index < MIN_DISTANCE_TO_GO: + self.env.goal_reached = False + elif tile_bin == goal_bin: + self.env.goal_reached = True + # print(f'goal bin {goal_bin} reached!', flush=True) + + +class CarRacingBezier(gym.Env, EzPickle): + metadata = { + "render.modes": ["human", "rgb_array", "state_pixels"], + "video.frames_per_second": FPS, + } + + def __init__( + self, + n_control_points=12, + track_name=None, + bezier=True, + show_borders=True, + show_indicators=True, + birdseye=False, + seed=None, + fixed_environment=False, + animate_zoom=False, + min_rad_ratio=0.333333333, + max_rad_ratio=1.0, + sparse_rewards=False, + clip_reward=None, + num_goal_bins=24, + verbose=0, + ): + EzPickle.__init__(self) + + self.level_seed = seed + self.seed(seed) + + self.n_control_points = n_control_points 
+ self.bezier = bezier + self.fixed_environment = fixed_environment + self.animate_zoom = animate_zoom + self.min_rad_ratio = min_rad_ratio + self.max_rad_ratio = max_rad_ratio + + self.steps = 0 + + self.contactListener_keepref = FrictionDetector(self) + self.world = Box2D.b2World((0, 0), contactListener=self.contactListener_keepref) + self.viewer = None + self.invisible_state_window = None + self.invisible_video_window = None + self.road = None + self.car = None + self.reward = 0.0 + self.prev_reward = 0.0 + + self.preloaded_track = racetracks.get_track(track_name) + self.show_borders = show_borders + self.show_indicators = show_indicators + self.birdseye = birdseye + self.verbose = verbose + + self.track_data = None + self.complexity_info = None + + self.window_h = WINDOW_H + self.window_w = WINDOW_W + self.track_rad = TRACK_RAD + self.track_width = TRACK_WIDTH + if self.preloaded_track: + self.playfield = self.preloaded_track.bounds / SCALE + self.full_zoom = self.preloaded_track.full_zoom + else: + self.playfield = PLAYFIELD + self.full_zoom = 0.25 + + self.fd_tile = fixtureDef( + shape=polygonShape(vertices=[(0, 0), (1, 0), (1, -1), (0, -1)]) + ) + + self.action_space = spaces.Box( + np.array([-1, 0, 0]), np.array([+1, +1, +1]), dtype=np.float32 + ) # steer, gas, brake + + self.observation_space = spaces.Box( + low=0, high=255, shape=(STATE_H, STATE_W, 3), dtype=np.uint8 + ) + + self.clip_reward = clip_reward + + # Create goal for sparse rewards + self.sparse_rewards = sparse_rewards + self.num_goal_bins = num_goal_bins # 0-indexed + self.goal_bin = None + if sparse_rewards: + self.set_goal() + self.accumulated_rewards = 0.0 + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def get_complexity_info(self): + if self.complexity_info is None: + # recompute + points = ((x, y) for _, _, x, y in self.track) + return geo_complexity.complexity(points) + + return self.complexity_info + + def set_goal(self, goal_bin=None): + if goal_bin is None: + goal_bin = self.goal_bin + + if goal_bin is None: + self.goal_bin = self.np_random.randint(1, self.num_goal_bins) + else: + self.goal_bin = goal_bin + + self.goal_reached = False + + def _destroy(self): + if not self.road: + return + + for t in self.road: + t.userData = t.userData["tile"] + self.world.DestroyBody(t) + + self.road = [] + self.car.destroy() + self.car = None + + def _create_track(self, control_points=None, show_borders=None): + if self.bezier: + return self._create_track_bezier( + control_points=control_points, show_borders=show_borders + ) + else: + t = 0 + reset_random = False + while True: + t += 1 + if t > 10: + reset_random = True + break + + success = self._create_track_polar( + control_points=control_points, show_borders=show_borders + ) + if success: + return success + + if reset_random: + t = 0 + while True: + t += 1 + success = self._create_track_polar(show_borders=show_borders) + if success: + return success + + def _create_track_bezier(self, control_points=None, show_borders=None): + if show_borders is None: + show_borders = self.show_borders + else: + show_borders = show_borders + + # Create random bezier curve + track = [] + self.road = [] + + if self.preloaded_track is not None: + points = self.preloaded_track.xy + x, y = zip(*points) + elif control_points is not None: + a = np.array(control_points) + x, y, _ = bezier.get_bezier_curve(a=a, rad=0.2, edgy=0.2, numpoints=40) + self.track_data = a + else: + a = bezier.get_random_points( + n=self.n_control_points, 
scale=self.playfield, np_random=self.np_random + ) + x, y, _ = bezier.get_bezier_curve(a=a, rad=0.2, edgy=0.2, numpoints=40) + self.track_data = a + + min_x, max_x = x[-1], x[-1] + min_y, max_y = y[-1], y[-1] + + points = list(zip(x, y)) + betas = [] + for i, p in enumerate(points[:-1]): + x1, y1 = points[i] + x2, y2 = points[i + 1] + dx = x2 - x1 + dy = y2 - y1 + if dx == dy == 0: + continue + + # alpha = math.atan(dy/(dx+1e-5)) + alpha = np.arctan2(dy, dx) + beta = math.pi / 2 + alpha + + track.append((alpha, beta, x1, y1)) + betas.append(beta) + + min_x = min(x1, min_x) + min_y = min(y1, min_y) + max_x = max(x1, max_x) + max_y = max(y1, max_y) + + x_offset = min_x + (max_x - min_x) / 2 + y_offset = min_y + (max_y - min_y) / 2 + self.x_offset = x_offset + self.y_offset = y_offset + + betas = np.array(betas) + abs_dbeta = abs(betas[1:] - betas[0:-1]) + mean_abs_dbeta = abs_dbeta.mean() + std_abs_dbeta = abs_dbeta.std() + mean_abs_dbeta + std_abs_dbeta / 2 + + # Red-white border on hard turns + border = [False] * len(track) + if show_borders: + for i in range(len(track)): + good = True + oneside = 0 + for neg in range(BORDER_MIN_COUNT): + beta1 = track[i - neg - 0][1] + beta2 = track[i - neg - 1][1] + good &= abs(beta1 - beta2) > mean_abs_dbeta + oneside += np.sign(beta1 - beta2) + good &= abs(oneside) == BORDER_MIN_COUNT + border[i] = good + for i in range(len(track)): + for neg in range(BORDER_MIN_COUNT): + border[i - neg] |= border[i] + + # Create tiles + for i in range(len(track)): + alpha1, beta1, x1, y1 = track[i] + + alpha2, beta2, x2, y2 = track[i - 1] + + road1_l = ( + x1 - TRACK_WIDTH * math.cos(beta1) - x_offset, + y1 - TRACK_WIDTH * math.sin(beta1) - y_offset, + ) + road1_r = ( + x1 + TRACK_WIDTH * math.cos(beta1) - x_offset, + y1 + TRACK_WIDTH * math.sin(beta1) - y_offset, + ) + road2_l = ( + x2 - TRACK_WIDTH * math.cos(beta2) - x_offset, + y2 - TRACK_WIDTH * math.sin(beta2) - y_offset, + ) + road2_r = ( + x2 + TRACK_WIDTH * math.cos(beta2) - x_offset, + y2 + TRACK_WIDTH * math.sin(beta2) - y_offset, + ) + vertices = [road1_l, road1_r, road2_r, road2_l] + + try: + self.fd_tile.shape.vertices = vertices + except: + pass + t = self.world.CreateStaticBody(fixtures=self.fd_tile) + # t.userData = t + t.userData = {"tile": t, "index": i} + c = 0.01 * (i % 3) + t.color = [ROAD_COLOR[0] + c, ROAD_COLOR[1] + c, ROAD_COLOR[2] + c] + t.road_visited = False + t.road_friction = 1.0 + t.fixtures[0].sensor = True + self.road_poly.append(([road1_l, road1_r, road2_r, road2_l], t.color)) + self.road.append(t) + + if self.show_borders and border[i]: + side = np.sign(beta2 - beta1) + b1_l = ( + x1 + side * TRACK_WIDTH * math.cos(beta1) - x_offset, + y1 + side * TRACK_WIDTH * math.sin(beta1) - y_offset, + ) + b1_r = ( + x1 + side * (TRACK_WIDTH + BORDER) * math.cos(beta1) - x_offset, + y1 + side * (TRACK_WIDTH + BORDER) * math.sin(beta1) - y_offset, + ) + b2_l = ( + x2 + side * TRACK_WIDTH * math.cos(beta2) - x_offset, + y2 + side * TRACK_WIDTH * math.sin(beta2) - y_offset, + ) + b2_r = ( + x2 + side * (TRACK_WIDTH + BORDER) * math.cos(beta2) - x_offset, + y2 + side * (TRACK_WIDTH + BORDER) * math.sin(beta2) - y_offset, + ) + self.road_poly.append( + ([b1_l, b1_r, b2_r, b2_l], (1, 1, 1) if i % 2 == 0 else (1, 0, 0)) + ) + self.track = track + + self.complexity_info = geo_complexity.complexity(points) + + return True + + def _create_track_polar(self, control_points=None, show_borders=None): + if show_borders is None: + show_borders = self.show_borders + else: + show_borders = show_borders + + 
CHECKPOINTS = self.n_control_points + + self.x_offset = 0 + self.y_offset = 0 + + min_rad = TRACK_RAD * self.min_rad_ratio + max_rad = TRACK_RAD * self.max_rad_ratio + + # Create checkpoints + if control_points is not None: + checkpoints = control_points + self.start_alpha = 2 * math.pi * (-0.5) / self.n_control_points + else: + checkpoints = [] + for c in range(CHECKPOINTS): + noise = self.np_random.uniform(0, 2 * math.pi * 1 / CHECKPOINTS) + alpha = 2 * math.pi * c / CHECKPOINTS + noise + rad = self.np_random.uniform(min_rad, max_rad) + + if c == 0: + alpha = 0 + rad = 1.5 * TRACK_RAD + if c == CHECKPOINTS - 1: + alpha = 2 * math.pi * c / CHECKPOINTS + self.start_alpha = 2 * math.pi * (-0.5) / CHECKPOINTS + rad = 1.5 * TRACK_RAD + + checkpoints.append( + (alpha, rad * math.cos(alpha), rad * math.sin(alpha)) + ) + + self.track_data = checkpoints + + self.road = [] + + # Go from one checkpoint to another to create track + # x, y, beta = 1.5 * TRACK_RAD, 0, 0 + _, x, y = checkpoints[0] + beta = 0 + dest_i = 0 + laps = 0 + track = [] + no_freeze = 2500 + visited_other_side = False + while True: + alpha = math.atan2(y, x) + if visited_other_side and alpha > 0: + laps += 1 + visited_other_side = False + if alpha < 0: + visited_other_side = True + alpha += 2 * math.pi + + while True: # Find destination from checkpoints + failed = True + + while True: + dest_alpha, dest_x, dest_y = checkpoints[dest_i % len(checkpoints)] + if alpha <= dest_alpha: + failed = False + break + dest_i += 1 + if dest_i % len(checkpoints) == 0: + break + + if not failed: + break + + alpha -= 2 * math.pi + continue + + r1x = math.cos(beta) + r1y = math.sin(beta) + p1x = -r1y + p1y = r1x + dest_dx = dest_x - x # vector towards destination + dest_dy = dest_y - y + # destination vector projected on rad: + proj = r1x * dest_dx + r1y * dest_dy + while beta - alpha > 1.5 * math.pi: + beta -= 2 * math.pi + while beta - alpha < -1.5 * math.pi: + beta += 2 * math.pi + prev_beta = beta + proj *= SCALE + if proj > 0.3: + beta -= min(TRACK_TURN_RATE, abs(0.001 * proj)) + if proj < -0.3: + beta += min(TRACK_TURN_RATE, abs(0.001 * proj)) + x += p1x * TRACK_DETAIL_STEP + y += p1y * TRACK_DETAIL_STEP + track.append((alpha, prev_beta * 0.5 + beta * 0.5, x, y)) + if laps > 4: + break + no_freeze -= 1 + if no_freeze == 0: + break + + # Find closed loop range i1..i2, first loop should be ignored, second is OK + i1, i2 = -1, -1 + i = len(track) + while True: + i -= 1 + if i == 0: + return False # Failed + pass_through_start = ( + track[i][0] > self.start_alpha and track[i - 1][0] <= self.start_alpha + ) + if pass_through_start and i2 == -1: + i2 = i + elif pass_through_start and i1 == -1: + i1 = i + break + assert i1 != -1 + assert i2 != -1 + + track = track[i1 : i2 - 1] + + # Red-white border on hard turns + border = [False] * len(track) + if show_borders: + for i in range(len(track)): + good = True + oneside = 0 + for neg in range(BORDER_MIN_COUNT): + beta1 = track[i - neg - 0][1] + beta2 = track[i - neg - 1][1] + good &= abs(beta1 - beta2) > TRACK_TURN_RATE * 0.2 + oneside += np.sign(beta1 - beta2) + good &= abs(oneside) == BORDER_MIN_COUNT + border[i] = good + for i in range(len(track)): + for neg in range(BORDER_MIN_COUNT): + border[i - neg] |= border[i] + + # Create tiles + for i in range(len(track)): + alpha1, beta1, x1, y1 = track[i] + alpha2, beta2, x2, y2 = track[i - 1] + road1_l = ( + x1 - TRACK_WIDTH * math.cos(beta1), + y1 - TRACK_WIDTH * math.sin(beta1), + ) + road1_r = ( + x1 + TRACK_WIDTH * math.cos(beta1), + y1 + TRACK_WIDTH 
* math.sin(beta1), + ) + road2_l = ( + x2 - TRACK_WIDTH * math.cos(beta2), + y2 - TRACK_WIDTH * math.sin(beta2), + ) + road2_r = ( + x2 + TRACK_WIDTH * math.cos(beta2), + y2 + TRACK_WIDTH * math.sin(beta2), + ) + vertices = [road1_l, road1_r, road2_r, road2_l] + self.fd_tile.shape.vertices = vertices + t = self.world.CreateStaticBody(fixtures=self.fd_tile) + t.userData = t + c = 0.01 * (i % 3) + t.color = [ROAD_COLOR[0] + c, ROAD_COLOR[1] + c, ROAD_COLOR[2] + c] + t.road_visited = False + t.road_friction = 1.0 + t.fixtures[0].sensor = True + self.road_poly.append(([road1_l, road1_r, road2_r, road2_l], t.color)) + self.road.append(t) + if border[i]: + side = np.sign(beta2 - beta1) + b1_l = ( + x1 + side * TRACK_WIDTH * math.cos(beta1), + y1 + side * TRACK_WIDTH * math.sin(beta1), + ) + b1_r = ( + x1 + side * (TRACK_WIDTH + BORDER) * math.cos(beta1), + y1 + side * (TRACK_WIDTH + BORDER) * math.sin(beta1), + ) + b2_l = ( + x2 + side * TRACK_WIDTH * math.cos(beta2), + y2 + side * TRACK_WIDTH * math.sin(beta2), + ) + b2_r = ( + x2 + side * (TRACK_WIDTH + BORDER) * math.cos(beta2), + y2 + side * (TRACK_WIDTH + BORDER) * math.sin(beta2), + ) + self.road_poly.append( + ([b1_l, b1_r, b2_r, b2_l], (1, 1, 1) if i % 2 == 0 else (1, 0, 0)) + ) + self.track = track + + return True + + def reset_sparse_state(self): + if self.sparse_rewards: + self.accumulated_rewards = 0.0 + self.set_goal() + + def reset(self): + if self.fixed_environment: + self.seed(self.level_seed) + + self._destroy() + self.reward = 0.0 + self.prev_reward = 0.0 + self.tile_visited_count = 0 + self.t = 0.0 + self.road_poly = [] + self.track_data = None + + self.steps = 0 + + self._create_track() + + beta0, x0, y0 = self.track[0][1:4] + x0 -= self.x_offset + y0 -= self.y_offset + self.car = Car(self.world, beta0, x0, y0) + + self.goal_bin = None + self.reset_sparse_state() + + return self.step(None)[0] + + def step(self, action): + if action is not None: + self.car.steer(-action[0]) + self.car.gas(action[1]) + self.car.brake(action[2]) + + self.car.step(1.0 / FPS) + self.world.Step(1.0 / FPS, 6 * 30, 2 * 30) + self.t += 1.0 / FPS + + self.steps += 1 + + self.state = self.render("state_pixels") + + step_reward = 0 + done = False + if action is not None: # First step without action, called from reset() + self.reward -= 0.1 + # We actually don't want to count fuel spent, we want car to be faster. 
+ # self.reward -= 10 * self.car.fuel_spent / ENGINE_POWER + self.car.fuel_spent = 0.0 + step_reward = self.reward - self.prev_reward + self.prev_reward = self.reward + if self.tile_visited_count == len(self.track): + done = True + x, y = self.car.hull.position + if abs(x) > self.playfield or abs(y) > self.playfield: + done = True + step_reward = -100 + + if self.sparse_rewards: + self.accumulated_rewards += step_reward + + revealed_reward = 0 + if self.goal_reached: + revealed_reward = self.accumulated_rewards + + self.accumulated_rewards = 0.0 + done = True + else: + revealed_reward = step_reward + + if self.clip_reward: + revealed_reward = min( + max(revealed_reward, -self.clip_reward), self.clip_reward + ) + + return self.state, revealed_reward, done, {} + + def render(self, mode="human"): + assert mode in ["human", "state_pixels", "rgb_array", "level", "sketch"] + if self.viewer is None: + from envs.box2d import rendering + + self.viewer = rendering.Viewer(WINDOW_W, WINDOW_H) + self.score_label = pyglet.text.Label( + "0000", + font_size=36, + x=20, + y=WINDOW_H * 2.5 / 40.00, + anchor_x="left", + anchor_y="center", + color=(255, 255, 255, 255), + ) + self.transform = rendering.Transform() + + if "t" not in self.__dict__: + return # reset() not called yet + + # Animate zoom first second: + if self.birdseye or mode in ["level", "sketch"]: + zoom_coef = self.full_zoom + else: + zoom_coef = ZOOM + if self.animate_zoom: + zoom = 0.1 * SCALE * max(1 - self.t, 0) + zoom_coef * SCALE * min(self.t, 1) + else: + zoom = zoom_coef * SCALE + + scroll_x = self.car.hull.position[0] + scroll_y = self.car.hull.position[1] + angle = -self.car.hull.angle + vel = self.car.hull.linearVelocity + if np.linalg.norm(vel) > 0.5: + angle = math.atan2(vel[0], vel[1]) + self.transform.set_scale(zoom, zoom) + + if self.birdseye or mode in ["level", "sketch"]: + self.transform.set_translation( + WINDOW_W / 2, + WINDOW_H / 2, + ) + self.transform.set_rotation(0) + else: + self.transform.set_translation( + WINDOW_W / 2 + - ( + scroll_x * zoom * math.cos(angle) + - scroll_y * zoom * math.sin(angle) + ), + WINDOW_H / 4 + - ( + scroll_x * zoom * math.sin(angle) + + scroll_y * zoom * math.cos(angle) + ), + ) + self.transform.set_rotation(angle) + + self.car.draw(self.viewer, mode != "state_pixels") + + arr = None + win = self.viewer.window + win.switch_to() + win.dispatch_events() + + win.clear() + t = self.transform + if mode == "rgb_array": + VP_W = VIDEO_W + VP_H = VIDEO_H + elif mode in ["state_pixels", "sketch"]: + VP_W = STATE_W + VP_H = STATE_H + else: + pixel_scale = 1 + if hasattr(win.context, "_nscontext"): + pixel_scale = ( + win.context._nscontext.view().backingScaleFactor() + ) # pylint: disable=protected-access + VP_W = int(pixel_scale * WINDOW_W) + VP_H = int(pixel_scale * WINDOW_H) + + gl.glViewport(0, 0, VP_W, VP_H) + t.enable() + self.render_road() + for geom in self.viewer.onetime_geoms: + geom.render() + self.viewer.onetime_geoms = [] + t.disable() + + if mode not in ["level", "sketch"] and self.show_indicators: + self.render_indicators(WINDOW_W, WINDOW_H) + + if mode == "human": + win.flip() + return self.viewer.isopen + + image_data = ( + pyglet.image.get_buffer_manager().get_color_buffer().get_image_data() + ) + arr = np.fromstring(image_data.get_data(), dtype=np.uint8, sep="") + arr = arr.reshape(VP_H, VP_W, 4) + arr = arr[::-1, :, 0:3] + + return arr + + def close(self): + if self.viewer is not None: + self.viewer.close() + self.viewer = None + + def render_road(self): + colors = [0.4, 0.8, 0.4, 
1.0] * 4 + polygons_ = [ + +self.playfield, + +self.playfield, + 0, + +self.playfield, + -self.playfield, + 0, + -self.playfield, + -self.playfield, + 0, + -self.playfield, + +self.playfield, + 0, + ] + + k = self.playfield / 20.0 + colors.extend([0.4, 0.9, 0.4, 1.0] * 4 * 20 * 20) + for x in range(-20, 20, 2): + for y in range(-20, 20, 2): + polygons_.extend( + [ + k * x + k, + k * y + 0, + 0, + k * x + 0, + k * y + 0, + 0, + k * x + 0, + k * y + k, + 0, + k * x + k, + k * y + k, + 0, + ] + ) + + for poly, color in self.road_poly: + colors.extend([color[0], color[1], color[2], 1] * len(poly)) + for p in poly: + polygons_.extend([p[0], p[1], 0]) + + vl = pyglet.graphics.vertex_list( + len(polygons_) // 3, ("v3f", polygons_), ("c4f", colors) # gl.GL_QUADS, + ) + vl.draw(gl.GL_QUADS) + vl.delete() + + def render_indicators(self, W, H): + s = W / 40.0 + h = H / 40.0 + colors = [0, 0, 0, 1] * 4 + polygons = [W, 0, 0, W, 5 * h, 0, 0, 5 * h, 0, 0, 0, 0] + + def vertical_ind(place, val, color): + colors.extend([color[0], color[1], color[2], 1] * 4) + polygons.extend( + [ + place * s, + h + h * val, + 0, + (place + 1) * s, + h + h * val, + 0, + (place + 1) * s, + h, + 0, + (place + 0) * s, + h, + 0, + ] + ) + + def horiz_ind(place, val, color): + colors.extend([color[0], color[1], color[2], 1] * 4) + polygons.extend( + [ + (place + 0) * s, + 4 * h, + 0, + (place + val) * s, + 4 * h, + 0, + (place + val) * s, + 2 * h, + 0, + (place + 0) * s, + 2 * h, + 0, + ] + ) + + true_speed = np.sqrt( + np.square(self.car.hull.linearVelocity[0]) + + np.square(self.car.hull.linearVelocity[1]) + ) + + vertical_ind(5, 0.02 * true_speed, (1, 1, 1)) + vertical_ind(7, 0.01 * self.car.wheels[0].omega, (0.0, 0, 1)) # ABS sensors + vertical_ind(8, 0.01 * self.car.wheels[1].omega, (0.0, 0, 1)) + vertical_ind(9, 0.01 * self.car.wheels[2].omega, (0.2, 0, 1)) + vertical_ind(10, 0.01 * self.car.wheels[3].omega, (0.2, 0, 1)) + horiz_ind(20, -10.0 * self.car.wheels[0].joint.angle, (0, 1, 0)) + horiz_ind(30, -0.8 * self.car.hull.angularVelocity, (1, 0, 0)) + vl = pyglet.graphics.vertex_list( + len(polygons) // 3, ("v3f", polygons), ("c4f", colors) # gl.GL_QUADS, + ) + vl.draw(gl.GL_QUADS) + vl.delete() + self.score_label.text = "%04i" % self.reward + self.score_label.draw() + + +if hasattr(__loader__, "name"): + module_path = __loader__.name +elif hasattr(__loader__, "fullname"): + module_path = __loader__.fullname + +try: + gym_register(id="CarRacing-Bezier-v0", entry_point=module_path + ":CarRacingBezier") +except: + pass + +if __name__ == "__main__": + from pyglet.window import key + + parser = argparse.ArgumentParser() + parser.add_argument( + "--track_name", type=str, default=None, help="Name of preexisting track." 
+ ) + parser.add_argument( + "--birdseye", + action="store_true", + default=False, + help="Show a fixed birdseye view of track.", + ) + parser.add_argument("--seed", type=int, default=None, help="PRNG seed.") + args = parser.parse_args() + + a = np.array([0.0, 0.0, 0.0]) + + def key_press(k, mod): + global restart + if k == 0xFF0D: + restart = True + if k == key.LEFT: + a[0] = -1.0 + if k == key.RIGHT: + a[0] = +1.0 + if k == key.UP: + a[1] = +1.0 + if k == key.DOWN: + a[2] = +0.8 # set 1.0 for wheels to block to zero rotation + + def key_release(k, mod): + if k == key.LEFT and a[0] == -1.0: + a[0] = 0 + if k == key.RIGHT and a[0] == +1.0: + a[0] = 0 + if k == key.UP: + a[1] = 0 + if k == key.DOWN: + a[2] = 0 + + env = CarRacingBezier( + track_name=args.track_name, birdseye=args.birdseye, seed=args.seed + ) + env.render() + env.viewer.window.on_key_press = key_press + env.viewer.window.on_key_release = key_release + record_video = False + if record_video: + from gym.wrappers.monitor import Monitor + + env = Monitor(env, "videos/", force=True) + isopen = True + while isopen: + env.reset() + total_reward = 0.0 + steps = 0 + restart = False + while True: + s, r, done, info = env.step(a) + total_reward += r + if steps % 200 == 0 or done: + print("\naction " + str(["{:+0.2f}".format(x) for x in a])) + print("step {} total_reward {:+0.2f}".format(steps, total_reward)) + steps += 1 + isopen = env.render() + if done or restart or isopen is False: + break + env.close() diff --git a/syllabus/examples/experimental/multi_car_racing.py b/syllabus/examples/experimental/multi_car_racing.py index 3dc83450..29e83872 100644 --- a/syllabus/examples/experimental/multi_car_racing.py +++ b/syllabus/examples/experimental/multi_car_racing.py @@ -12,7 +12,11 @@ from torch.distributions.normal import Normal from tqdm.auto import tqdm -from syllabus.core import TaskWrapper, make_multiprocessing_curriculum +from syllabus.core import ( + PettingZooMultiProcessingSyncWrapper, + TaskWrapper, + make_multiprocessing_curriculum, +) from syllabus.curricula import DomainRandomization from syllabus.task_space import TaskSpace @@ -217,6 +221,7 @@ def step(self, action: dict[AgentID, ActionType]) -> tuple[ env = MultiCarRacingParallelWrapper(env=env, n_agents=n_agents) curriculum = DomainRandomization(env.task_space) curriculum, task_queue, update_queue = make_multiprocessing_curriculum(curriculum) + env = PettingZooMultiProcessingSyncWrapper(env, task_queue, update_queue) """ LEARNER SETUP """ agent = Agent().to(device) @@ -271,26 +276,6 @@ def step(self, action: dict[AgentID, ActionType]) -> tuple[ # compute episodic return total_episodic_return += rb_rewards[step].cpu().numpy() - # Update curriculum - # TODO: adapt to DR - if global_cycles % num_steps == 0: - update = { - "update_type": "on_demand", - "metrics": { - "action_log_dist": logprobs, - "value": values, - "next_value": ( - agent.get_value(next_obs) - if step == num_steps - 1 - else None - ), - "rew": rb_rewards[step], - "masks": torch.Tensor(1 - np.array(list(dones.values()))), - "tasks": [env.unwrapped.task], - }, - } - curriculum.update_curriculum(update) - # if we reach the end of the episode if any([dones[a] for a in dones]): print(f"Breaking early at step {step} due to done signal.") diff --git a/syllabus/examples/experimental/multi_car_racing_v2.py b/syllabus/examples/experimental/multi_car_racing_v2.py new file mode 100644 index 00000000..e2e5c77d --- /dev/null +++ b/syllabus/examples/experimental/multi_car_racing_v2.py @@ -0,0 +1,432 @@ +# flake8: noqa: 
F401 +import argparse +from typing import TypeVar + +import gym +import gym_multi_car_racing +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from gymnasium import spaces +from torch.distributions import Beta +from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler +from tqdm.auto import tqdm + +from syllabus.core import ( + PettingZooMultiProcessingSyncWrapper, + TaskWrapper, + make_multiprocessing_curriculum, +) +from syllabus.curricula import DomainRandomization +from syllabus.task_space.task_space import TaskSpace + +ObsType = TypeVar("ObsType") +ActionType = TypeVar("ActionType") +AgentID = TypeVar("AgentID") + +parser = argparse.ArgumentParser(description="Train a PPO agent for the CarRacing-v0") +parser.add_argument( + "--gamma", + type=float, + default=0.99, + metavar="G", + help="discount factor (default: 0.99)", +) +parser.add_argument( + "--action-repeat", + type=int, + default=8, + metavar="N", + help="repeat action in N frames (default: 8)", +) +parser.add_argument( + "--img-stack", + type=int, + default=4, + metavar="N", + help="stack N image in a state (default: 4)", +) +parser.add_argument( + "--seed", type=int, default=0, metavar="N", help="random seed (default: 0)" +) +parser.add_argument("--render", action="store_true", help="render the environment") +parser.add_argument("--vis", action="store_true", help="use visdom") +parser.add_argument( + "--log-interval", + type=int, + default=10, + metavar="N", + help="interval between training status logs (default: 10)", +) +args = parser.parse_args() + +use_cuda = torch.cuda.is_available() +device = torch.device("cuda" if use_cuda else "cpu") +torch.manual_seed(args.seed) +if use_cuda: + torch.cuda.manual_seed(args.seed) + +transition = np.dtype( + [ + ("s", np.float64, (args.img_stack, 96, 96)), + ("a", np.float64, (3,)), + ("a_logp", np.float64), + ("r", np.float64), + ("s_", np.float64, (args.img_stack, 96, 96)), + ] +) + + +# class Env: +# """ +# Environment wrapper for CarRacing +# """ + +# def __init__(self): +# self.env = gym.make("CarRacing-v0") +# self.env.seed(args.seed) +# self.reward_threshold = self.env.spec.reward_threshold + +# def reset(self): +# self.counter = 0 +# self.av_r = self.reward_memory() + +# self.die = False +# img_rgb = self.env.reset() +# img_gray = self.rgb2gray(img_rgb) +# self.stack = [img_gray] * args.img_stack # four frames for decision +# return np.array(self.stack) + +# def step(self, action): +# total_reward = 0 +# for i in range(args.action_repeat): +# img_rgb, reward, die, _ = self.env.step(action) +# # don't penalize "die state" +# if die: +# reward += 100 +# # green penalty +# if np.mean(img_rgb[:, :, 1]) > 185.0: +# reward -= 0.05 +# total_reward += reward +# # if no reward recently, end the episode +# done = True if self.av_r(reward) <= -0.1 else False +# if done or die: +# break +# img_gray = self.rgb2gray(img_rgb) +# self.stack.pop(0) +# self.stack.append(img_gray) +# assert len(self.stack) == args.img_stack +# return np.array(self.stack), total_reward, done, die + +# def render(self, *arg): +# self.env.render(*arg) + +# @staticmethod +# def rgb2gray(rgb, norm=True): +# # rgb image -> gray [0, 1] +# gray = np.dot(rgb[..., :], [0.299, 0.587, 0.114]) +# if norm: +# # normalize +# gray = gray / 128.0 - 1.0 +# return gray + +# @staticmethod +# def reward_memory(): +# # record reward for last 100 steps +# count = 0 +# length = 100 +# history = np.zeros(length) + +# def memory(reward): +# nonlocal count +# 
history[count] = reward +# count = (count + 1) % length +# return np.mean(history) + +# return memory + + +class MultiCarRacingParallelWrapper(TaskWrapper): + """ + Wrapper ensuring compatibility with the PettingZoo Parallel API. + + Car Racing Environment: + * Action shape: ``n_agents`` * `Box([-1. 0. 0.], 1.0, (3,), float32)` + * Observation shape: ``n_agents`` * `Box(0, 255, (96, 96, 3), uint8)` + * Done: ``done`` is a single boolean value + * Info: ``info`` is unused and represented as an empty dictionary + """ + + def __init__(self, n_agents, *args, **kwargs): + super().__init__(*args, **kwargs) + self.n_agents = n_agents + self.task = None + self.episode_return = 0 + self.task_space = TaskSpace( + spaces.Box( + low=np.array([-1.0, 0.0, 0.0]), + high=np.array([1.0, 1.0, 1.0]), + shape=(3,), + dtype=np.float32, + ) + ) + self.possible_agents = np.arange( + self.n_agents + ) # TODO: is this the intended use? + + def _actions_pz_to_np(self, action: dict[AgentID, ActionType]) -> np.ndarray: + """ + Converts actions defined in PZ format to a numpy array. + """ + assert action.__len__() == self.n_agents + + action = np.array(list(action.values())) + assert action.shape == (self.n_agents, 3) + return action + + def _np_array_to_pz_dict(self, array: np.ndarray) -> dict[int : np.ndarray]: + """ + Returns a dictionary containing individual observations for each agent. + """ + out = {} + for idx, i in enumerate(array): + out[idx] = i + return out + + def _singleton_to_pz_dict(self, value: bool) -> dict[int:bool]: + """ + Broadcasts the `done` and `trunc` flags to dictionaries keyed by agent id. + """ + return {idx: value for idx in range(self.n_agents)} + + def reset(self) -> tuple[dict[AgentID, ObsType], dict[AgentID, dict]]: + """ + Resets the environment and returns a dictionary of observations + keyed by agent ID. + """ + # TODO: what is the second output (dict[AgentID, dict]])? + obs = self.env.reset() + pz_obs = self._np_array_to_pz_dict(obs) + + return pz_obs + + def step(self, action: dict[AgentID, ActionType]) -> tuple[ + dict[AgentID, ObsType], + dict[AgentID, float], + dict[AgentID, bool], + dict[AgentID, bool], + dict[AgentID, dict], + ]: + """ + Takes inputs in the PettingZoo (PZ) Parallel API format, performs a step and + returns outputs in PZ format. 
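+
+        Returns, in order: observation, reward, termination, truncation and info
+        dictionaries, each keyed by agent id, matching the annotation above.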
+ """ + # convert action to numpy format to perform the env step + np_action = self._actions_pz_to_np(action) + obs, rew, term, info = self.env.step(np_action) + trunc = 0 # there is no `truncated` flag in this environment + self.task_completion = self._task_completion(obs, rew, term, trunc, info) + # convert outputs back to PZ format + obs, rew = tuple(map(self._np_array_to_pz_dict, [obs, rew])) + term, trunc, info = tuple( + map(self._singleton_to_pz_dict, [term, trunc, self.task_completion]) + ) + + return self.observation(obs), rew, term, trunc, info + + +class Net(nn.Module): + """ + Actor-Critic Network for PPO + """ + + def __init__(self): + super(Net, self).__init__() + self.cnn_base = nn.Sequential( # input shape (4, 96, 96) + nn.Conv2d(args.img_stack, 8, kernel_size=4, stride=2), + nn.ReLU(), + nn.Conv2d(8, 16, kernel_size=3, stride=2), # (8, 47, 47) + nn.ReLU(), + nn.Conv2d(16, 32, kernel_size=3, stride=2), # (16, 23, 23) + nn.ReLU(), + nn.Conv2d(32, 64, kernel_size=3, stride=2), # (32, 11, 11) + nn.ReLU(), + nn.Conv2d(64, 128, kernel_size=3, stride=1), # (64, 5, 5) + nn.ReLU(), + nn.Conv2d(128, 256, kernel_size=3, stride=1), # (128, 3, 3) + nn.ReLU(), + ) # output shape (256, 1, 1) + self.v = nn.Sequential(nn.Linear(256, 100), nn.ReLU(), nn.Linear(100, 1)) + self.fc = nn.Sequential(nn.Linear(256, 100), nn.ReLU()) + self.alpha_head = nn.Sequential(nn.Linear(100, 3), nn.Softplus()) + self.beta_head = nn.Sequential(nn.Linear(100, 3), nn.Softplus()) + self.apply(self._weights_init) + + @staticmethod + def _weights_init(m): + if isinstance(m, nn.Conv2d): + nn.init.xavier_uniform_(m.weight, gain=nn.init.calculate_gain("relu")) + nn.init.constant_(m.bias, 0.1) + + def forward(self, x): + x = self.cnn_base(x) + x = x.view(-1, 256) + v = self.v(x) + x = self.fc(x) + alpha = self.alpha_head(x) + 1 + beta = self.beta_head(x) + 1 + + return (alpha, beta), v + + +class Agent: + """ + Agent for training + """ + + max_grad_norm = 0.5 + clip_param = 0.1 # epsilon in clipped loss + ppo_epoch = 10 + buffer_capacity, batch_size = 2000, 128 + + def __init__(self): + self.training_step = 0 + self.net = Net().double().to(device) + self.buffer = np.empty(self.buffer_capacity, dtype=transition) + self.counter = 0 + + self.optimizer = optim.Adam(self.net.parameters(), lr=1e-3) + + def select_action(self, state): + state = torch.from_numpy(state).double().to(device).unsqueeze(0) + with torch.no_grad(): + alpha, beta = self.net(state)[0] + dist = Beta(alpha, beta) + action = dist.sample() + a_logp = dist.log_prob(action).sum(dim=1) + + action = action.squeeze().cpu().numpy() + a_logp = a_logp.item() + return action, a_logp + + def save_param(self): + torch.save(self.net.state_dict(), "param/ppo_net_params.pkl") + + def store(self, transition): + self.buffer[self.counter] = transition + self.counter += 1 + if self.counter == self.buffer_capacity: + self.counter = 0 + return True + else: + return False + + def update(self): + self.training_step += 1 + + s = torch.tensor(self.buffer["s"], dtype=torch.double).to(device) + a = torch.tensor(self.buffer["a"], dtype=torch.double).to(device) + r = torch.tensor(self.buffer["r"], dtype=torch.double).to(device).view(-1, 1) + s_ = torch.tensor(self.buffer["s_"], dtype=torch.double).to(device) + + old_a_logp = ( + torch.tensor(self.buffer["a_logp"], dtype=torch.double) + .to(device) + .view(-1, 1) + ) + + with torch.no_grad(): + target_v = r + args.gamma * self.net(s_)[1] + adv = target_v - self.net(s)[1] + # adv = (adv - adv.mean()) / (adv.std() + 1e-8) + + for _ in 
range(self.ppo_epoch): + for index in BatchSampler( + SubsetRandomSampler(range(self.buffer_capacity)), self.batch_size, False + ): + + alpha, beta = self.net(s[index])[0] + dist = Beta(alpha, beta) + a_logp = dist.log_prob(a[index]).sum(dim=1, keepdim=True) + ratio = torch.exp(a_logp - old_a_logp[index]) + + surr1 = ratio * adv[index] + surr2 = ( + torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) + * adv[index] + ) + action_loss = -torch.min(surr1, surr2).mean() + value_loss = F.smooth_l1_loss(self.net(s[index])[1], target_v[index]) + loss = action_loss + 2.0 * value_loss + + self.optimizer.zero_grad() + loss.backward() + # nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm) + self.optimizer.step() + + +if __name__ == "__main__": + agent = Agent() + """ CURRICULUM SETUP """ + n_agents = 2 + env = gym.make( + "MultiCarRacing-v0", + num_agents=n_agents, + direction="CCW", + use_random_direction=True, + backwards_flag=True, + h_ratio=0.25, + use_ego_color=False, + ) + env = MultiCarRacingParallelWrapper(env=env, n_agents=n_agents) + curriculum = DomainRandomization(env.task_space) + curriculum, task_queue, update_queue = make_multiprocessing_curriculum(curriculum) + env = PettingZooMultiProcessingSyncWrapper( + env, + task_queue, + update_queue, + update_on_step=False, + task_space=env.task_space, + ) + + n_episodes = 10 + n_steps = 100 + + training_records = [] + running_score = 0 + state = env.reset() + for i_ep in tqdm(range(n_episodes)): + score = 0 + state = env.reset() + + for t in tqdm(range(n_steps)): + action, a_logp = agent.select_action(state) + state_, reward, done, die = env.step( + action * np.array([2.0, 1.0, 1.0]) + np.array([-1.0, 0.0, 0.0]) + ) + if args.render: + env.render() + if agent.store((state, action, a_logp, reward, state_)): + print("updating") + agent.update() + score += reward + state = state_ + if done or die: + break + running_score = running_score * 0.99 + score * 0.01 + + if i_ep % args.log_interval == 0: + print( + "Ep {}\tLast score: {:.2f}\tMoving average score: {:.2f}".format( + i_ep, score, running_score + ) + ) + # agent.save_param() + if running_score > env.reward_threshold: + print( + f"""Solved! Running reward is now {running_score} and the last \\ + episode runs to""" + ) + break
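For reviewers, a few self-contained sketches of the patterns the new files rely on. First, the PettingZoo-compatibility shims in MultiCarRacingParallelWrapper reduce to two conversions: a batched (n_agents, ...) array becomes a dict keyed by agent index, and a single done/trunc flag is broadcast to every agent. A minimal standalone version (the function names are illustrative, not the wrapper's API; note the comma in the dict[int, np.ndarray] annotation, which is what the wrapper's slice-style hints intend):

import numpy as np


def array_to_agent_dict(batched: np.ndarray) -> dict[int, np.ndarray]:
    """Split a (n_agents, ...) array into {agent_id: per-agent slice}."""
    return {idx: row for idx, row in enumerate(batched)}


def broadcast_flag(value: bool, n_agents: int) -> dict[int, bool]:
    """Broadcast a single done/trunc flag to a dict keyed by agent id."""
    return {idx: value for idx in range(n_agents)}


obs = np.zeros((2, 96, 96, 3), dtype=np.uint8)  # two agents, image observations
per_agent_obs = array_to_agent_dict(obs)
dones = broadcast_flag(False, n_agents=2)
assert set(per_agent_obs) == {0, 1} and dones == {0: False, 1: False}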
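The + 1 applied to the Softplus heads in Net.forward keeps both Beta parameters above 1, so the action distribution on [0, 1] is unimodal rather than U-shaped. A minimal illustration using only torch.distributions:

import torch
from torch.distributions import Beta

raw = torch.nn.functional.softplus(torch.zeros(3))  # head output, strictly positive
alpha, beta = raw + 1, raw + 1                       # shift both parameters above 1
dist = Beta(alpha, beta)
sample = dist.sample()                               # lies in [0, 1]^3
assert torch.all((sample >= 0) & (sample <= 1))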
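Agent stores rollouts in a structured NumPy array built from the transition dtype and later indexes it by field name. The same pattern with smaller, illustrative shapes:

import numpy as np

# Same idea as the `transition` dtype above, shrunk for illustration.
small_transition = np.dtype(
    [("s", np.float64, (2,)), ("a", np.float64), ("r", np.float64)]
)
buffer = np.empty(4, dtype=small_transition)

buffer[0] = (np.array([0.1, 0.2]), 1.0, -0.5)  # one transition stored as a tuple
assert buffer["s"].shape == (4, 2)             # field access yields a (capacity, ...) view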
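The inner loop of Agent.update is a standard PPO clipped-surrogate step with a smooth-L1 value loss weighted by 2.0. A condensed restatement as one function (the name ppo_losses and the toy tensors are illustrative):

import torch
import torch.nn.functional as F


def ppo_losses(new_logp, old_logp, adv, values, target_v, clip_param=0.1):
    """Clipped surrogate policy loss plus smooth-L1 value loss, mirroring Agent.update."""
    ratio = torch.exp(new_logp - old_logp)
    surr1 = ratio * adv
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    action_loss = -torch.min(surr1, surr2).mean()
    value_loss = F.smooth_l1_loss(values, target_v)
    return action_loss + 2.0 * value_loss


loss = ppo_losses(
    new_logp=torch.zeros(4, 1),
    old_logp=torch.zeros(4, 1),
    adv=torch.ones(4, 1),
    values=torch.zeros(4, 1),
    target_v=torch.ones(4, 1),
)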
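Finally, the training loop rescales each Beta sample from [0, 1]^3 into CarRacing's native action space, steering in [-1, 1] with gas and brake in [0, 1], via action * [2, 1, 1] + [-1, 0, 0]. A minimal sketch of that mapping (the helper name is illustrative):

import numpy as np


def beta_to_car_action(beta_sample: np.ndarray) -> np.ndarray:
    """Rescale a Beta sample in [0, 1]^3 to CarRacing's native action space."""
    # steering: [0, 1] -> [-1, 1]; gas and brake stay in [0, 1]
    return beta_sample * np.array([2.0, 1.0, 1.0]) + np.array([-1.0, 0.0, 0.0])


# A centred sample maps to zero steering, half gas and half brake.
assert np.allclose(beta_to_car_action(np.array([0.5, 0.5, 0.5])), [0.0, 0.5, 0.5])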