From 13b93e10ecbb757e69130a69eb1b82fd6d04982d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ryan=20P=C3=A9goud?= Date: Tue, 27 Feb 2024 11:38:04 +0100 Subject: [PATCH 1/5] working multi car parallel wrapper (fixed circular import, dependency issues) --- Dockerfile | 41 ++++++ multi_car_racing | 1 + requirements.txt | 9 +- syllabus/core/__init__.py | 36 ++++-- syllabus/core/curriculum_sync_wrapper.py | 62 ++++++--- .../examples/experimental/multi_car_racing.py | 119 ++++++++++++++++++ 6 files changed, 233 insertions(+), 35 deletions(-) create mode 100644 Dockerfile create mode 160000 multi_car_racing create mode 100644 syllabus/examples/experimental/multi_car_racing.py diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..80dffacd --- /dev/null +++ b/Dockerfile @@ -0,0 +1,41 @@ +# Use the official Python image +FROM python:3.10-slim + +# Set the working directory inside the container +WORKDIR /usr/src/mylib + + +# Install any dependencies +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt && pip install tqdm + +# You can also install additional tools you might need +RUN apt-get update && apt-get install -y --fix-missing\ + libgeos-dev \ + gcc \ + g++ \ + swig \ + git \ + libgl1-mesa-dev \ + libglu1-mesa-dev \ + freeglut3-dev \ + xvfb \ + && rm -rf /var/lib/apt/lists/* + +# Clone the multi_car_racing repository and install it +RUN git clone https://github.com/igilitschenski/multi_car_racing.git \ + && cd multi_car_racing \ + && pip install -e . + +# Copy the Syllabus directory +COPY . . + +WORKDIR /usr/src/mylib/syllabus/examples/experimental +ENV PYTHONPATH="${PYTHONPATH}:/usr/src/mylib" + +CMD ["/bin/bash"] + +# docker build -t syllabus-image . +# docker run -it --rm -p 4000:4000 syllabus-image +# docker run -it --rm -p 4000:4000 -v ${PWD}:/usr/src/mylib syllabus-image +# xvfb-run -s "-screen 0 1400x900x24" python -u multi_car_racing.py \ No newline at end of file diff --git a/multi_car_racing b/multi_car_racing new file mode 160000 index 00000000..0131904e --- /dev/null +++ b/multi_car_racing @@ -0,0 +1 @@ +Subproject commit 0131904e16cb44892e8b2b1616f0c01c1b44cb7d diff --git a/requirements.txt b/requirements.txt index bdcae95d..fb4925c9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -#pettingzoo==1.19.0 -#supersuit==3.5.0 -gym==0.23.1 +pettingzoo==1.23.0 # upgraded from 1.9 to solve BaseParallelWrapper import (typo) +supersuit==3.5.0 +gym==0.24.1 numpy==1.23.3 wandb>=0.15.3 grpcio<=1.48.2 @@ -9,7 +9,8 @@ torch>=2.0.1 ray==2.2.0 tabulate==0.9.0 tensorflow_probability +gymnasium # documentation -pip install sphinx-tabs +# pip install sphinx-tabs diff --git a/syllabus/core/__init__.py b/syllabus/core/__init__.py index ac9a484b..e816e2e7 100644 --- a/syllabus/core/__init__.py +++ b/syllabus/core/__init__.py @@ -1,14 +1,28 @@ +# flake8: noqa # Environment Code -from .task_interface import TaskWrapper, SubclassTaskWrapper, ReinitTaskWrapper, TaskEnv, PettingZooTaskWrapper, PettingZooTaskEnv - -# Curriculum Code -from .utils import decorate_all_functions, UsageError, enumerate_axes from .curriculum_base import Curriculum -from .curriculum_sync_wrapper import (CurriculumWrapper, - MultiProcessingCurriculumWrapper, - RayCurriculumWrapper, - make_multiprocessing_curriculum, - make_ray_curriculum) - -from .environment_sync_wrapper import MultiProcessingSyncWrapper, RaySyncWrapper, PettingZooMultiProcessingSyncWrapper, PettingZooRaySyncWrapper +from .curriculum_sync_wrapper import ( + CurriculumWrapper, + 
MultiProcessingCurriculumWrapper, + RayCurriculumWrapper, + make_multiprocessing_curriculum, + make_ray_curriculum, +) +from .environment_sync_wrapper import ( + MultiProcessingSyncWrapper, + PettingZooMultiProcessingSyncWrapper, + PettingZooRaySyncWrapper, + RaySyncWrapper, +) from .multivariate_curriculum_wrapper import MultitaskWrapper +from .task_interface import ( + PettingZooTaskEnv, + PettingZooTaskWrapper, + ReinitTaskWrapper, + SubclassTaskWrapper, + TaskEnv, + TaskWrapper, +) + +# Curriculum Code +from .utils import UsageError, decorate_all_functions, enumerate_axes diff --git a/syllabus/core/curriculum_sync_wrapper.py b/syllabus/core/curriculum_sync_wrapper.py index f69034d8..ef1ec0fa 100644 --- a/syllabus/core/curriculum_sync_wrapper.py +++ b/syllabus/core/curriculum_sync_wrapper.py @@ -1,17 +1,19 @@ import threading +import time from functools import wraps from typing import List, Tuple import ray -import time from torch.multiprocessing import SimpleQueue -from syllabus.core import Curriculum, decorate_all_functions +from syllabus.core import Curriculum + +from .utils import decorate_all_functions # fixed circular import class CurriculumWrapper: - """Wrapper class for adding multiprocessing synchronization to a curriculum. - """ + """Wrapper class for adding multiprocessing synchronization to a curriculum.""" + def __init__(self, curriculum: Curriculum) -> None: self.curriculum = curriculum self.task_space = curriculum.task_space @@ -26,7 +28,7 @@ def count_tasks(self, task_space=None): @property def tasks(self): - return self.task_space.tasks + return self.task_space.tasks def get_tasks(self, task_space=None): return self.task_space.get_tasks(gym_space=task_space) @@ -57,13 +59,15 @@ def add_task(self, task): class MultiProcessingCurriculumWrapper(CurriculumWrapper): - """Wrapper which sends tasks and receives updates from environments wrapped in a corresponding MultiprocessingSyncWrapper. - """ - def __init__(self, - curriculum: Curriculum, - task_queue: SimpleQueue, - update_queue: SimpleQueue, - sequential_start: bool = True): + """Wrapper which sends tasks and receives updates from environments wrapped in a corresponding MultiprocessingSyncWrapper.""" + + def __init__( + self, + curriculum: Curriculum, + task_queue: SimpleQueue, + update_queue: SimpleQueue, + sequential_start: bool = True, + ): super().__init__(curriculum) self.task_queue = task_queue self.update_queue = update_queue @@ -78,7 +82,9 @@ def start(self): """ Start the thread that reads the complete_queue and reads the task_queue. 
""" - self.update_thread = threading.Thread(name='update', target=self._update_queues, daemon=True) + self.update_thread = threading.Thread( + name="update", target=self._update_queues, daemon=True + ) self.should_update = True self.update_thread.start() @@ -106,18 +112,27 @@ def _update_queues(self): requested_tasks += 1 # Decode task and task progress if update["update_type"] == "task_progress": - update["metrics"] = (self.task_space.decode(update["metrics"][0]), update["metrics"][1]) + update["metrics"] = ( + self.task_space.decode(update["metrics"][0]), + update["metrics"][1], + ) self.batch_update(batch_updates) # Sample new tasks if requested_tasks > 0: # TODO: Move this to curriculum, not sync wrapper # Sequentially sample task_space before using curriculum method - if (self.sequential_start and - self.task_space.num_tasks is not None and - self.num_assigned_tasks + requested_tasks < self.task_space.num_tasks): + if ( + self.sequential_start + and self.task_space.num_tasks is not None + and self.num_assigned_tasks + requested_tasks + < self.task_space.num_tasks + ): # Sample unseen tasks sequentially before using curriculum method - new_tasks = self.task_space.list_tasks()[self.num_assigned_tasks:self.num_assigned_tasks + requested_tasks] + new_tasks = self.task_space.list_tasks()[ + self.num_assigned_tasks : self.num_assigned_tasks + + requested_tasks + ] else: new_tasks = self.curriculum.sample(k=requested_tasks) for i, task in enumerate(new_tasks): @@ -149,6 +164,7 @@ def remote_call(func): Note that this causes functions to block, and should be only used for operations that do not require parallelization. """ + @wraps(func) def wrapper(self, *args, **kw): f_name = func.__name__ @@ -159,6 +175,7 @@ def wrapper(self, *args, **kw): if child_func == parent_func: curriculum_func = getattr(self.curriculum, f_name) return ray.get(curriculum_func.remote(*args, **kw)) + return wrapper @@ -168,7 +185,9 @@ def make_multiprocessing_curriculum(curriculum, **kwargs): """ task_queue = SimpleQueue() update_queue = SimpleQueue() - mp_curriculum = MultiProcessingCurriculumWrapper(curriculum, task_queue, update_queue, **kwargs) + mp_curriculum = MultiProcessingCurriculumWrapper( + curriculum, task_queue, update_queue, **kwargs + ) mp_curriculum.start() return mp_curriculum, task_queue, update_queue @@ -190,6 +209,7 @@ class RayCurriculumWrapper(CurriculumWrapper): for convenience. 
# TODO: Implement the Curriculum methods explicitly """ + def __init__(self, curriculum, actor_name="curriculum") -> None: super().__init__(curriculum) self.curriculum = RayWrapper.options(name=actor_name).remote(curriculum) @@ -202,7 +222,9 @@ def __init__(self, curriculum, actor_name="curriculum") -> None: def sample(self, k: int = 1): return ray.get(self.curriculum.sample.remote(k=k)) - def update_on_step_batch(self, step_results: List[Tuple[int, int, int, int]]) -> None: + def update_on_step_batch( + self, step_results: List[Tuple[int, int, int, int]] + ) -> None: ray.get(self.curriculum._on_step_batch.remote(step_results)) def add_task(self, task): diff --git a/syllabus/examples/experimental/multi_car_racing.py b/syllabus/examples/experimental/multi_car_racing.py new file mode 100644 index 00000000..609589cb --- /dev/null +++ b/syllabus/examples/experimental/multi_car_racing.py @@ -0,0 +1,119 @@ +# flake8: noqa: F401 +from typing import TypeVar + +import gym +import gym_multi_car_racing +import numpy as np +from gym.spaces import Box +from tqdm.auto import tqdm + +from syllabus.core import TaskWrapper + +ObsType = TypeVar("ObsType") +ActionType = TypeVar("ActionType") +AgentID = TypeVar("AgentID") + + +class MultiCarRacingParallelWrapper(TaskWrapper): + """ + Wrapper ensuring compatibility with the PettingZoo Parallel API. + """ + + def __init__(self, n_agents, *args, **kwargs): + super().__init__(*args, **kwargs) + self.n_agents = n_agents + + def _actions_pz_to_np(self, action: dict[AgentID, ActionType]) -> np.ndarray: + """ + Converts actions defined in PZ format to a numpy array. + """ + assert action.__len__() == self.n_agents + + action = np.array(list(action.values())) + assert action.shape == (self.n_agents, 3) + return action + + def reset(self): + pass + + def step(self, action: dict[AgentID, ActionType]) -> tuple[ + dict[AgentID, ObsType], + dict[AgentID, float], + dict[AgentID, bool], + dict[AgentID, bool], + dict[AgentID, dict], + ]: + """ + Takes inputs in the PettingZoo (PZ) Parallel API format, performs a step and + returns outputs in PZ format. + """ + + def _np_array_to_pz_dict(array: np.ndarray): + """ + Converts an n-dimensional numpy array to a dictionary + with n keys and values. + """ + out = {} + for idx, i in enumerate(array): + out[idx] = i + return out + + def _singleton_to_pz_dict(value: bool): + """ + Converts a boolean flag to a dictionary with ``n_agents`` keys and values. + """ + return {idx: value for idx in range(self.n_agents)} + + # convert action to numpy format to perform the env step + np_action = self._actions_pz_to_np(action) + obs, rew, term, info = self.env.step(np_action) + trunc = 0 # there is no `truncated` flag in this environment + self.task_completion = self._task_completion(obs, rew, term, trunc, info) + info["task_completion"] = self.task_completion + # convert outputs back to PZ format + obs, rew = tuple(map(_np_array_to_pz_dict, [obs, rew])) + # TODO: are boolean flags replicated `n_agent` times in PZ format ? 
+ term, trunc = tuple(map(_singleton_to_pz_dict, [term, trunc])) + + return self.observation(obs), rew, term, trunc, info + + +if __name__ == "__main__": + n_agents = 2 + env = gym.make( + "MultiCarRacing-v0", + num_agents=n_agents, + direction="CCW", + use_random_direction=True, + backwards_flag=True, + h_ratio=0.25, + use_ego_color=False, + ) + + obs = env.reset() + env = MultiCarRacingParallelWrapper(env=env, n_agents=n_agents) + done = {i: False for i in range(n_agents)} + total_reward = {i: 0 for i in range(n_agents)} + np.random.seed(0) + + # while not all(done.values()): + for steps in tqdm(range(100)): + # The actions have to be of the format (num_agents,3) + # The action format for each car is as in the CarRacing-v0 environment + # i.e. (`Box([-1. 0. 0.], 1.0, (3,), float32)`) + action = np.random.normal(0, 1, (2, 3)) + + assert action.shape == (n_agents, 3) + pz_action = {i: action[i] for i in range(n_agents)} + # Similarly, the structure of this is the same as in CarRacing-v0 with an + # additional dimension for the different agents, i.e. + # obs is of shape (num_agents, 96, 96, 3) + # reward is of shape (num_agents,) + # done is a bool and info is not used (an empty dict). + obs, reward, done, trunc, info = env.step(pz_action) + for agent in range(n_agents): + total_reward[agent] += reward[agent] + env.render() + + print("individual scores:", total_reward) + print(reward, done, trunc, info) From e915118a58e82614c2a8d7bbfdc9c991a6482e29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ryan=20P=C3=A9goud?= Date: Wed, 28 Feb 2024 08:42:16 +0100 Subject: [PATCH 2/5] added wrapped reset function --- .../examples/experimental/multi_car_racing.py | 96 ++++++++++--------- 1 file changed, 53 insertions(+), 43 deletions(-) diff --git a/syllabus/examples/experimental/multi_car_racing.py b/syllabus/examples/experimental/multi_car_racing.py index 609589cb..84b3bc4a 100644 --- a/syllabus/examples/experimental/multi_car_racing.py +++ b/syllabus/examples/experimental/multi_car_racing.py @@ -4,10 +4,12 @@ import gym import gym_multi_car_racing import numpy as np -from gym.spaces import Box +from gymnasium import spaces from tqdm.auto import tqdm -from syllabus.core import TaskWrapper +from syllabus.core import TaskWrapper, make_multiprocessing_curriculum +from syllabus.curricula import DomainRandomization +from syllabus.task_space import TaskSpace ObsType = TypeVar("ObsType") ActionType = TypeVar("ActionType") @@ -17,6 +19,12 @@ class MultiCarRacingParallelWrapper(TaskWrapper): """ Wrapper ensuring compatibility with the PettingZoo Parallel API. + + Car Racing Environment: + * Action shape: ``n_agents`` * `Box([-1. 0. 0.], 1.0, (3,), float32)` + * Observation shape: ``n_agents`` * `Box(0, 255, (96, 96, 3), uint8)` + * Done: ``done`` is a single boolean value + * Info: ``info`` is unused and represented as an empty dictionary """ def __init__(self, n_agents, *args, **kwargs): @@ -33,8 +41,31 @@ def _actions_pz_to_np(self, action: dict[AgentID, ActionType]) -> np.ndarray: assert action.shape == (self.n_agents, 3) return action - def reset(self): - pass + def _np_array_to_pz_dict(self, array: np.ndarray) -> dict[int : np.ndarray]: + """ + Returns a dictionary containing individual observations for each agent. + """ + out = {} + for idx, i in enumerate(array): + out[idx] = i + return out + + def _singleton_to_pz_dict(self, value: bool) -> dict[int:bool]: + """ + Broadcasts the `done` and `trunc` flags to dictionaries keyed by agent id. 
+ """ + return {idx: value for idx in range(self.n_agents)} + + def reset(self) -> tuple[dict[AgentID, ObsType], dict[AgentID, dict]]: + """ + Resets the environment and returns a dictionary of observations + keyed by agent ID. + """ + # TODO: what is the second output (dict[AgentID, dict]])? + obs = self.env.reset() + pz_obs = self._np_array_to_pz_dict(obs) + + return pz_obs def step(self, action: dict[AgentID, ActionType]) -> tuple[ dict[AgentID, ObsType], @@ -47,33 +78,16 @@ def step(self, action: dict[AgentID, ActionType]) -> tuple[ Takes inputs in the PettingZoo (PZ) Parallel API format, performs a step and returns outputs in PZ format. """ - - def _np_array_to_pz_dict(array: np.ndarray): - """ - Converts an n-dimensional numpy array to a dictionary - with n keys and values. - """ - out = {} - for idx, i in enumerate(array): - out[idx] = i - return out - - def _singleton_to_pz_dict(value: bool): - """ - Converts a boolean flag to a dictionary with ``n_agents`` keys and values. - """ - return {idx: value for idx in range(self.n_agents)} - # convert action to numpy format to perform the env step np_action = self._actions_pz_to_np(action) obs, rew, term, info = self.env.step(np_action) trunc = 0 # there is no `truncated` flag in this environment self.task_completion = self._task_completion(obs, rew, term, trunc, info) - info["task_completion"] = self.task_completion # convert outputs back to PZ format - obs, rew = tuple(map(_np_array_to_pz_dict, [obs, rew])) - # TODO: are boolean flags replicated `n_agent` times in PZ format ? - term, trunc = tuple(map(_singleton_to_pz_dict, [term, trunc])) + obs, rew = tuple(map(self._np_array_to_pz_dict, [obs, rew])) + term, trunc, info = tuple( + map(self._singleton_to_pz_dict, [term, trunc, self.task_completion]) + ) return self.observation(obs), rew, term, trunc, info @@ -90,30 +104,26 @@ def _singleton_to_pz_dict(value: bool): use_ego_color=False, ) - obs = env.reset() env = MultiCarRacingParallelWrapper(env=env, n_agents=n_agents) + # curriculum = DomainRandomization(env) + # curriculum, task_queue, update_queue = make_multiprocessing_curriculum( + # curriculum, + # ) + done = {i: False for i in range(n_agents)} total_reward = {i: 0 for i in range(n_agents)} np.random.seed(0) # while not all(done.values()): - for steps in tqdm(range(100)): - # The actions have to be of the format (num_agents,3) - # The action format for each car is as in the CarRacing-v0 environment - # i.e. (`Box([-1. 0. 0.], 1.0, (3,), float32)`) - action = np.random.normal(0, 1, (2, 3)) - - assert action.shape == (n_agents, 3) - pz_action = {i: action[i] for i in range(n_agents)} - # Similarly, the structure of this is the same as in CarRacing-v0 with an - # additional dimension for the different agents, i.e. - # obs is of shape (num_agents, 96, 96, 3) - # reward is of shape (num_agents,) - # done is a bool and info is not used (an empty dict). 
- obs, reward, done, trunc, info = env.step(pz_action) - for agent in range(n_agents): - total_reward[agent] += reward[agent] - env.render() + for episodes in tqdm(range(5)): # testing with 5 truncated episodes + obs = env.reset() + for steps in range(100): + action = np.random.normal(0, 1, (2, 3)) + pz_action = {i: action[i] for i in range(n_agents)} + obs, reward, done, trunc, info = env.step(pz_action) + for agent in range(n_agents): + total_reward[agent] += reward[agent] + env.render() print("individual scores:", total_reward) print(reward, done, trunc, info) From 82130d1d030de11fb40e9388f76da86d6fc2fa82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ryan=20P=C3=A9goud?= Date: Wed, 28 Feb 2024 10:08:59 +0100 Subject: [PATCH 3/5] added domain randomization wrapper, started implementing continuous ppo --- .../examples/experimental/multi_car_racing.py | 115 +++++++++++++++++- 1 file changed, 111 insertions(+), 4 deletions(-) diff --git a/syllabus/examples/experimental/multi_car_racing.py b/syllabus/examples/experimental/multi_car_racing.py index 84b3bc4a..c44c185f 100644 --- a/syllabus/examples/experimental/multi_car_racing.py +++ b/syllabus/examples/experimental/multi_car_racing.py @@ -4,7 +4,11 @@ import gym import gym_multi_car_racing import numpy as np +import torch +import torch.nn as nn +import torch.optim as optim from gymnasium import spaces +from torch.distributions.normal import Normal from tqdm.auto import tqdm from syllabus.core import TaskWrapper, make_multiprocessing_curriculum @@ -16,6 +20,83 @@ AgentID = TypeVar("AgentID") +def layer_init(layer, std=np.sqrt(2), bias_const=0.0): + torch.nn.init.orthogonal_(layer.weight, std) + torch.nn.init.constant_(layer.bias, bias_const) + return layer + + +class Agent(nn.Module): + def __init__(self, envs): + super().__init__() + self.critic = nn.Sequential( + layer_init( + nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64) + ), + nn.Tanh(), + layer_init(nn.Linear(64, 64)), + nn.Tanh(), + layer_init(nn.Linear(64, 1), std=1.0), + ) + self.actor_mean = nn.Sequential( + layer_init( + nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64) + ), + nn.Tanh(), + layer_init(nn.Linear(64, 64)), + nn.Tanh(), + layer_init( + nn.Linear(64, np.prod(envs.single_action_space.shape)), std=0.01 + ), + ) + self.actor_logstd = nn.Parameter( + torch.zeros(1, np.prod(envs.single_action_space.shape)) + ) + + def get_value(self, x): + return self.critic(x) + + def get_action_and_value(self, x, action=None): + action_mean = self.actor_mean(x) + action_logstd = self.actor_logstd.expand_as(action_mean) + action_std = torch.exp(action_logstd) + probs = Normal(action_mean, action_std) + if action is None: + action = probs.sample() + return ( + action, + probs.log_prob(action).sum(1), + probs.entropy().sum(1), + self.critic(x), + ) + + +def batchify_obs(obs, device): + """Converts PZ style observations to batch of torch arrays.""" + obs = np.stack([obs[a] for a in obs], axis=0) + # transpose to be (batch, channel, height, width) + obs = obs.transpose(0, -1, 1, 2) + obs = torch.tensor(obs).to(device) + + return obs + + +def batchify(x, device): + """Converts PZ style returns to batch of torch arrays.""" + x = np.stack([x[a] for a in x], axis=0) + x = torch.tensor(x).to(device) + + return x + + +def unbatchify(x, env): + """Converts np array to PZ style arguments.""" + x = x.cpu().numpy() + x = {a: x[i] for i, a in enumerate(env.possible_agents)} + + return x + + class MultiCarRacingParallelWrapper(TaskWrapper): """ Wrapper ensuring 
compatibility with the PettingZoo Parallel API. @@ -30,6 +111,16 @@ class MultiCarRacingParallelWrapper(TaskWrapper): def __init__(self, n_agents, *args, **kwargs): super().__init__(*args, **kwargs) self.n_agents = n_agents + self.task = None + self.episode_return = 0 + self.task_space = TaskSpace( + spaces.Box( + low=np.array([-1.0, 0.0, 0.0]), + high=np.array([1.0, 1.0, 1.0]), + shape=(3,), + dtype=np.float32, + ) + ) def _actions_pz_to_np(self, action: dict[AgentID, ActionType]) -> np.ndarray: """ @@ -93,6 +184,18 @@ def step(self, action: dict[AgentID, ActionType]) -> tuple[ if __name__ == "__main__": + """ALGO PARAMS""" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + ent_coef = 0.1 + vf_coef = 0.1 + clip_coef = 0.1 + gamma = 0.99 + batch_size = 32 + stack_size = 4 + frame_size = (96, 96) + max_cycles = 125 + total_episodes = 100 + n_agents = 2 env = gym.make( "MultiCarRacing-v0", @@ -104,11 +207,15 @@ def step(self, action: dict[AgentID, ActionType]) -> tuple[ use_ego_color=False, ) + """ CURRICULUM SETUP """ env = MultiCarRacingParallelWrapper(env=env, n_agents=n_agents) - # curriculum = DomainRandomization(env) - # curriculum, task_queue, update_queue = make_multiprocessing_curriculum( - # curriculum, - # ) + curriculum = DomainRandomization(env.task_space) + curriculum, task_queue, update_queue = make_multiprocessing_curriculum(curriculum) + + # TODO: clarify how to setup continuous PPO + # """ LEARNER SETUP """ + # agent = Agent(envs=envs).to(device) + # optimizer = optim.Adam(agent.parameters(), lr=0.001, eps=1e-5) done = {i: False for i in range(n_agents)} total_reward = {i: 0 for i in range(n_agents)} From a00a1436025d78d81999e5fe9c291c7c5b3decff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ryan=20P=C3=A9goud?= Date: Thu, 29 Feb 2024 17:42:33 +0100 Subject: [PATCH 4/5] Added continuous PPO training loop (one bug remaining, see TODOs) --- requirements.txt | 4 +- .../examples/experimental/multi_car_racing.py | 235 +++++++++++++++--- 2 files changed, 208 insertions(+), 31 deletions(-) diff --git a/requirements.txt b/requirements.txt index fb4925c9..8864feee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ pettingzoo==1.23.0 # upgraded from 1.9 to solve BaseParallelWrapper import (typo) -supersuit==3.5.0 +# upgraded from 3.5.0 to 3.7.2 (as pettingZoo < 1.23 is a dependency, +# the typo error is also present here) +supersuit==3.7.2 gym==0.24.1 numpy==1.23.3 wandb>=0.15.3 diff --git a/syllabus/examples/experimental/multi_car_racing.py b/syllabus/examples/experimental/multi_car_racing.py index c44c185f..3dc83450 100644 --- a/syllabus/examples/experimental/multi_car_racing.py +++ b/syllabus/examples/experimental/multi_car_racing.py @@ -8,6 +8,7 @@ import torch.nn as nn import torch.optim as optim from gymnasium import spaces +from supersuit import color_reduction_v0, frame_stack_v1, resize_v1 from torch.distributions.normal import Normal from tqdm.auto import tqdm @@ -27,36 +28,33 @@ def layer_init(layer, std=np.sqrt(2), bias_const=0.0): class Agent(nn.Module): - def __init__(self, envs): + def __init__(self): super().__init__() + self.observation_shape = (3, 96, 96) + self.action_shape = (3,) + self.critic = nn.Sequential( - layer_init( - nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64) - ), + layer_init(nn.Linear(np.array(self.observation_shape).prod(), 64)), nn.Tanh(), layer_init(nn.Linear(64, 64)), nn.Tanh(), layer_init(nn.Linear(64, 1), std=1.0), ) self.actor_mean = nn.Sequential( - layer_init( - 
nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64) - ), + layer_init(nn.Linear(np.array(self.observation_shape).prod(), 64)), nn.Tanh(), layer_init(nn.Linear(64, 64)), nn.Tanh(), - layer_init( - nn.Linear(64, np.prod(envs.single_action_space.shape)), std=0.01 - ), - ) - self.actor_logstd = nn.Parameter( - torch.zeros(1, np.prod(envs.single_action_space.shape)) + layer_init(nn.Linear(64, np.prod(self.action_shape)), std=0.01), ) + self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(self.action_shape))) def get_value(self, x): return self.critic(x) def get_action_and_value(self, x, action=None): + x = x.float().reshape(x.size(0), -1) + action_mean = self.actor_mean(x) action_logstd = self.actor_logstd.expand_as(action_mean) action_std = torch.exp(action_logstd) @@ -121,6 +119,9 @@ def __init__(self, n_agents, *args, **kwargs): dtype=np.float32, ) ) + self.possible_agents = np.arange( + self.n_agents + ) # TODO: is this the intended use? def _actions_pz_to_np(self, action: dict[AgentID, ActionType]) -> np.ndarray: """ @@ -191,11 +192,16 @@ def step(self, action: dict[AgentID, ActionType]) -> tuple[ clip_coef = 0.1 gamma = 0.99 batch_size = 32 - stack_size = 4 + stack_size = 3 frame_size = (96, 96) + action_size = 3 max_cycles = 125 total_episodes = 100 + # PLR Params + num_steps = 128 + + """ ENV SETUP """ n_agents = 2 env = gym.make( "MultiCarRacing-v0", @@ -212,25 +218,194 @@ def step(self, action: dict[AgentID, ActionType]) -> tuple[ curriculum = DomainRandomization(env.task_space) curriculum, task_queue, update_queue = make_multiprocessing_curriculum(curriculum) - # TODO: clarify how to setup continuous PPO - # """ LEARNER SETUP """ - # agent = Agent(envs=envs).to(device) - # optimizer = optim.Adam(agent.parameters(), lr=0.001, eps=1e-5) + """ LEARNER SETUP """ + agent = Agent().to(device) + optimizer = optim.Adam(agent.parameters(), lr=0.001, eps=1e-5) + + """ ALGO LOGIC: EPISODE STORAGE""" + end_step = 0 + total_episodic_return = 0 + rb_obs = torch.zeros((max_cycles, n_agents, stack_size, *frame_size)).to(device) + rb_actions = torch.zeros((max_cycles, n_agents, action_size)).to(device) + rb_logprobs = torch.zeros((max_cycles, n_agents)).to(device) + rb_rewards = torch.zeros((max_cycles, n_agents)).to(device) + rb_dones = torch.zeros((max_cycles, n_agents)).to(device) + rb_values = torch.zeros((max_cycles, n_agents)).to(device) done = {i: False for i in range(n_agents)} total_reward = {i: 0 for i in range(n_agents)} np.random.seed(0) + """ TRAINING LOGIC """ + # train for n number of episodes + global_cycles = 0 + for episode in tqdm(range(total_episodes)): + # collect an episode + with torch.no_grad(): + # collect observations and convert to batch of torch tensors + next_obs = env.reset() + # reset the episodic return + total_episodic_return = 0 + + # each episode has num_steps + for step in range(0, max_cycles): + global_cycles += 1 + # rollover the observation + obs = batchify_obs(next_obs, device) + # get action from the agent + actions, logprobs, entropy, values = agent.get_action_and_value(obs) + + # execute the environment and log data + next_obs, rewards, trunc, dones, infos = env.step( + unbatchify(actions, env) + ) + + # add to episode storage + rb_obs[step] = obs + rb_rewards[step] = batchify(rewards, device) + rb_dones[step] = batchify(dones, device) + rb_actions[step] = actions + rb_logprobs[step] = logprobs + rb_values[step] = values.flatten() + + # compute episodic return + total_episodic_return += rb_rewards[step].cpu().numpy() + + # Update 
curriculum + # TODO: adapt to DR + if global_cycles % num_steps == 0: + update = { + "update_type": "on_demand", + "metrics": { + "action_log_dist": logprobs, + "value": values, + "next_value": ( + agent.get_value(next_obs) + if step == num_steps - 1 + else None + ), + "rew": rb_rewards[step], + "masks": torch.Tensor(1 - np.array(list(dones.values()))), + "tasks": [env.unwrapped.task], + }, + } + curriculum.update_curriculum(update) + + # if we reach the end of the episode + if any([dones[a] for a in dones]): + print(f"Breaking early at step {step} due to done signal.") + end_step = step + break + + # bootstrap value if not done + with torch.no_grad(): + rb_advantages = torch.zeros_like(rb_rewards).to(device) + for t in reversed(range(end_step)): + delta = ( + rb_rewards[t] + + gamma * rb_values[t + 1] * rb_dones[t + 1] + - rb_values[t] + ) + rb_advantages[t] = delta + gamma * gamma * rb_advantages[t + 1] + rb_returns = rb_advantages + rb_values + + # TODO: at this step, `end_step` is still 0, causing b_obs to be empty + # and the `repeat` loop not running as range(0, len(b_obs), batch_size) = 0 + # maybe set end_step to the current number of steps in the episode even if there + # is no positive done flag ? + + # convert our episodes to batch of individual transitions + b_obs = torch.flatten(rb_obs[:end_step], start_dim=0, end_dim=1) + b_logprobs = torch.flatten(rb_logprobs[:end_step], start_dim=0, end_dim=1) + b_actions = torch.flatten(rb_actions[:end_step], start_dim=0, end_dim=1) + b_returns = torch.flatten(rb_returns[:end_step], start_dim=0, end_dim=1) + b_values = torch.flatten(rb_values[:end_step], start_dim=0, end_dim=1) + b_advantages = torch.flatten(rb_advantages[:end_step], start_dim=0, end_dim=1) + + # Optimizing the policy and value network + b_index = np.arange(len(b_obs)) + clip_fracs = [] + for repeat in range(3): + # shuffle the indices we use to access the data + np.random.shuffle(b_index) + for start in range(0, len(b_obs), batch_size): + print(start) + # select the indices we want to train on + end = start + batch_size + batch_index = b_index[start:end] + + _, newlogprob, entropy, value = agent.get_action_and_value( + b_obs[batch_index], b_actions.long()[batch_index] + ) + logratio = newlogprob - b_logprobs[batch_index] + ratio = logratio.exp() + + with torch.no_grad(): + # calculate approx_kl http://joschu.net/blog/kl-approx.html + old_approx_kl = (-logratio).mean() + approx_kl = ((ratio - 1) - logratio).mean() + clip_fracs += [ + ((ratio - 1.0).abs() > clip_coef).float().mean().item() + ] + + # normalize advantaegs + advantages = b_advantages[batch_index] + advantages = (advantages - advantages.mean()) / ( + advantages.std() + 1e-8 + ) + + # Policy loss + pg_loss1 = -b_advantages[batch_index] * ratio + pg_loss2 = -b_advantages[batch_index] * torch.clamp( + ratio, 1 - clip_coef, 1 + clip_coef + ) + pg_loss = torch.max(pg_loss1, pg_loss2).mean() + + # Value loss + value = value.flatten() + v_loss_unclipped = (value - b_returns[batch_index]) ** 2 + v_clipped = b_values[batch_index] + torch.clamp( + value - b_values[batch_index], + -clip_coef, + clip_coef, + ) + v_loss_clipped = (v_clipped - b_returns[batch_index]) ** 2 + v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) + v_loss = 0.5 * v_loss_max.mean() + entropy_loss = entropy.mean() + loss = pg_loss - ent_coef * entropy_loss + v_loss * vf_coef + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy() + var_y = np.var(y_true) + 
explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y + + print(f"Training episode {episode}") + print(f"Episodic Return: {np.mean(total_episodic_return)}") + print(f"Episode Length: {end_step}") + print("") + print(f"Value Loss: {v_loss.item()}") + print(f"Policy Loss: {pg_loss.item()}") + print(f"Old Approx KL: {old_approx_kl.item()}") + print(f"Approx KL: {approx_kl.item()}") + print(f"Clip Fraction: {np.mean(clip_fracs)}") + print(f"Explained Variance: {explained_var.item()}") + print("\n-------------------------------------------\n") + + # ----- Dummy test loop ----- # while not all(done.values()): - for episodes in tqdm(range(5)): # testing with 5 truncated episodes - obs = env.reset() - for steps in range(100): - action = np.random.normal(0, 1, (2, 3)) - pz_action = {i: action[i] for i in range(n_agents)} - obs, reward, done, trunc, info = env.step(pz_action) - for agent in range(n_agents): - total_reward[agent] += reward[agent] - env.render() - - print("individual scores:", total_reward) - print(reward, done, trunc, info) + # for episodes in tqdm(range(1)): # testing with 5 truncated episodes + # obs = env.reset() + # for steps in range(100): + # action = np.random.normal(0, 1, (2, 3)) + # pz_action = {i: action[i] for i in range(n_agents)} + # obs, reward, done, trunc, info = env.step(pz_action) + # for agent in range(n_agents): + # total_reward[agent] += reward[agent] + # env.render() + + # print("individual scores:", total_reward) + # print(reward, done, trunc, info) From 5b88b0cec7ebc9937aa0442b3313a5ac8b78d49d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ryan=20P=C3=A9goud?= Date: Fri, 22 Mar 2024 15:39:47 +0100 Subject: [PATCH 5/5] . --- Dockerfile | 3 +- .../experimental/bezier_car_racing.py | 999 ++++++++++++++++++ .../examples/experimental/multi_car_racing.py | 27 +- .../experimental/multi_car_racing_v2.py | 432 ++++++++ 4 files changed, 1439 insertions(+), 22 deletions(-) create mode 100644 syllabus/examples/experimental/bezier_car_racing.py create mode 100644 syllabus/examples/experimental/multi_car_racing_v2.py diff --git a/Dockerfile b/Dockerfile index 80dffacd..e53b7f0d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,4 +38,5 @@ CMD ["/bin/bash"] # docker build -t syllabus-image . # docker run -it --rm -p 4000:4000 syllabus-image # docker run -it --rm -p 4000:4000 -v ${PWD}:/usr/src/mylib syllabus-image -# xvfb-run -s "-screen 0 1400x900x24" python -u multi_car_racing.py \ No newline at end of file +# xvfb-run -s "-screen 0 1400x900x24" python -u multi_car_racing.py +# xvfb-run -s "-screen 0 1400x900x24" python -u multi_car_racing_v2.py \ No newline at end of file diff --git a/syllabus/examples/experimental/bezier_car_racing.py b/syllabus/examples/experimental/bezier_car_racing.py new file mode 100644 index 00000000..3be4ec44 --- /dev/null +++ b/syllabus/examples/experimental/bezier_car_racing.py @@ -0,0 +1,999 @@ +# Copyright (c) OpenAI +# +# Licensed under the MIT License; +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://opensource.org/licenses/MIT +# +# This file is an extended version of +# https://github.com/openai/gym/blob/master/gym/envs/box2d/car_racing.py + +import argparse +import math + +import Box2D +import gym +import numpy as np +import pyglet +from Box2D.b2 import contactListener, fixtureDef, polygonShape +from envs.registration import register as gym_register +from gym import spaces +from gym.envs.box2d.car_dynamics import Car +from gym.utils import EzPickle, seeding +from util import geo_complexity + +pyglet.options["debug_gl"] = False +from pyglet import gl + +from . import bezier, racetracks + +STATE_W = 96 +STATE_H = 96 +VIDEO_W = 600 +VIDEO_H = 400 +WINDOW_W = 1000 +WINDOW_H = 800 + +SCALE = 6.0 # Track scale +TRACK_RAD = 900 / SCALE # Track is heavily morphed circle with this radius +PLAYFIELD = 2000 / SCALE # Game over boundary +FPS = 50 # Frames per second +ZOOM = 2.7 # Camera zoom +ZOOM_FOLLOW = True # Set to False for fixed view (don't use zoom) + + +TRACK_DETAIL_STEP = 21 / SCALE +TRACK_TURN_RATE = 0.31 +TRACK_WIDTH = 40 / SCALE +BORDER = 8 / SCALE +BORDER_MIN_COUNT = 4 + +ROAD_COLOR = [0.4, 0.4, 0.4] + + +class FrictionDetector(contactListener): + def __init__(self, env): + contactListener.__init__(self) + self.env = env + + def BeginContact(self, contact): + self._contact(contact, True) + + def EndContact(self, contact): + self._contact(contact, False) + + def _contact(self, contact, begin): + tile = None + obj = None + u1 = contact.fixtureA.body.userData + u2 = contact.fixtureB.body.userData + index = -1 + if u1 and "tile" in u1: + if "road_friction" in u1["tile"].__dict__: + tile = u1["tile"] + index = u1["index"] + obj = u2 + if u2 and "tile" in u2: + if "road_friction" in u2["tile"].__dict__: + tile = u2["tile"] + index = u2["index"] + obj = u1 + if not tile: + return + + tile.color[0] = ROAD_COLOR[0] + tile.color[1] = ROAD_COLOR[1] + tile.color[2] = ROAD_COLOR[2] + + if not obj or "tiles" not in obj.__dict__: + return + if begin: + obj.tiles.add(tile) + if not tile.road_visited: + tile.road_visited = True + self.env.reward += 1000.0 / len(self.env.track) + self.env.tile_visited_count += 1 + + if self.env.sparse_rewards and index >= 0: + self._eval_tile_index(index) + else: + obj.tiles.remove(tile) + + def _eval_tile_index(self, index): + goal_bin = self.env.goal_bin + track_len = len(self.env.track) + goal_step = track_len / (self.env.num_goal_bins) + + MIN_DISTANCE_TO_GO = 10 + distance = track_len - index + tile_bin = np.floor(distance / goal_step) + + # print('in tile bin, index', tile_bin, index, flush=True) + if goal_bin == 0 and distance < MIN_DISTANCE_TO_GO: + self.env.goal_reached = False + elif goal_bin == self.env.num_goal_bins - 1 and index < MIN_DISTANCE_TO_GO: + self.env.goal_reached = False + elif tile_bin == goal_bin: + self.env.goal_reached = True + # print(f'goal bin {goal_bin} reached!', flush=True) + + +class CarRacingBezier(gym.Env, EzPickle): + metadata = { + "render.modes": ["human", "rgb_array", "state_pixels"], + "video.frames_per_second": FPS, + } + + def __init__( + self, + n_control_points=12, + track_name=None, + bezier=True, + show_borders=True, + show_indicators=True, + birdseye=False, + seed=None, + fixed_environment=False, + animate_zoom=False, + min_rad_ratio=0.333333333, + max_rad_ratio=1.0, + sparse_rewards=False, + clip_reward=None, + num_goal_bins=24, + verbose=0, + ): + EzPickle.__init__(self) + + self.level_seed = seed + self.seed(seed) + + self.n_control_points = n_control_points 
+ self.bezier = bezier + self.fixed_environment = fixed_environment + self.animate_zoom = animate_zoom + self.min_rad_ratio = min_rad_ratio + self.max_rad_ratio = max_rad_ratio + + self.steps = 0 + + self.contactListener_keepref = FrictionDetector(self) + self.world = Box2D.b2World((0, 0), contactListener=self.contactListener_keepref) + self.viewer = None + self.invisible_state_window = None + self.invisible_video_window = None + self.road = None + self.car = None + self.reward = 0.0 + self.prev_reward = 0.0 + + self.preloaded_track = racetracks.get_track(track_name) + self.show_borders = show_borders + self.show_indicators = show_indicators + self.birdseye = birdseye + self.verbose = verbose + + self.track_data = None + self.complexity_info = None + + self.window_h = WINDOW_H + self.window_w = WINDOW_W + self.track_rad = TRACK_RAD + self.track_width = TRACK_WIDTH + if self.preloaded_track: + self.playfield = self.preloaded_track.bounds / SCALE + self.full_zoom = self.preloaded_track.full_zoom + else: + self.playfield = PLAYFIELD + self.full_zoom = 0.25 + + self.fd_tile = fixtureDef( + shape=polygonShape(vertices=[(0, 0), (1, 0), (1, -1), (0, -1)]) + ) + + self.action_space = spaces.Box( + np.array([-1, 0, 0]), np.array([+1, +1, +1]), dtype=np.float32 + ) # steer, gas, brake + + self.observation_space = spaces.Box( + low=0, high=255, shape=(STATE_H, STATE_W, 3), dtype=np.uint8 + ) + + self.clip_reward = clip_reward + + # Create goal for sparse rewards + self.sparse_rewards = sparse_rewards + self.num_goal_bins = num_goal_bins # 0-indexed + self.goal_bin = None + if sparse_rewards: + self.set_goal() + self.accumulated_rewards = 0.0 + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def get_complexity_info(self): + if self.complexity_info is None: + # recompute + points = ((x, y) for _, _, x, y in self.track) + return geo_complexity.complexity(points) + + return self.complexity_info + + def set_goal(self, goal_bin=None): + if goal_bin is None: + goal_bin = self.goal_bin + + if goal_bin is None: + self.goal_bin = self.np_random.randint(1, self.num_goal_bins) + else: + self.goal_bin = goal_bin + + self.goal_reached = False + + def _destroy(self): + if not self.road: + return + + for t in self.road: + t.userData = t.userData["tile"] + self.world.DestroyBody(t) + + self.road = [] + self.car.destroy() + self.car = None + + def _create_track(self, control_points=None, show_borders=None): + if self.bezier: + return self._create_track_bezier( + control_points=control_points, show_borders=show_borders + ) + else: + t = 0 + reset_random = False + while True: + t += 1 + if t > 10: + reset_random = True + break + + success = self._create_track_polar( + control_points=control_points, show_borders=show_borders + ) + if success: + return success + + if reset_random: + t = 0 + while True: + t += 1 + success = self._create_track_polar(show_borders=show_borders) + if success: + return success + + def _create_track_bezier(self, control_points=None, show_borders=None): + if show_borders is None: + show_borders = self.show_borders + else: + show_borders = show_borders + + # Create random bezier curve + track = [] + self.road = [] + + if self.preloaded_track is not None: + points = self.preloaded_track.xy + x, y = zip(*points) + elif control_points is not None: + a = np.array(control_points) + x, y, _ = bezier.get_bezier_curve(a=a, rad=0.2, edgy=0.2, numpoints=40) + self.track_data = a + else: + a = bezier.get_random_points( + n=self.n_control_points, 
scale=self.playfield, np_random=self.np_random + ) + x, y, _ = bezier.get_bezier_curve(a=a, rad=0.2, edgy=0.2, numpoints=40) + self.track_data = a + + min_x, max_x = x[-1], x[-1] + min_y, max_y = y[-1], y[-1] + + points = list(zip(x, y)) + betas = [] + for i, p in enumerate(points[:-1]): + x1, y1 = points[i] + x2, y2 = points[i + 1] + dx = x2 - x1 + dy = y2 - y1 + if dx == dy == 0: + continue + + # alpha = math.atan(dy/(dx+1e-5)) + alpha = np.arctan2(dy, dx) + beta = math.pi / 2 + alpha + + track.append((alpha, beta, x1, y1)) + betas.append(beta) + + min_x = min(x1, min_x) + min_y = min(y1, min_y) + max_x = max(x1, max_x) + max_y = max(y1, max_y) + + x_offset = min_x + (max_x - min_x) / 2 + y_offset = min_y + (max_y - min_y) / 2 + self.x_offset = x_offset + self.y_offset = y_offset + + betas = np.array(betas) + abs_dbeta = abs(betas[1:] - betas[0:-1]) + mean_abs_dbeta = abs_dbeta.mean() + std_abs_dbeta = abs_dbeta.std() + mean_abs_dbeta + std_abs_dbeta / 2 + + # Red-white border on hard turns + border = [False] * len(track) + if show_borders: + for i in range(len(track)): + good = True + oneside = 0 + for neg in range(BORDER_MIN_COUNT): + beta1 = track[i - neg - 0][1] + beta2 = track[i - neg - 1][1] + good &= abs(beta1 - beta2) > mean_abs_dbeta + oneside += np.sign(beta1 - beta2) + good &= abs(oneside) == BORDER_MIN_COUNT + border[i] = good + for i in range(len(track)): + for neg in range(BORDER_MIN_COUNT): + border[i - neg] |= border[i] + + # Create tiles + for i in range(len(track)): + alpha1, beta1, x1, y1 = track[i] + + alpha2, beta2, x2, y2 = track[i - 1] + + road1_l = ( + x1 - TRACK_WIDTH * math.cos(beta1) - x_offset, + y1 - TRACK_WIDTH * math.sin(beta1) - y_offset, + ) + road1_r = ( + x1 + TRACK_WIDTH * math.cos(beta1) - x_offset, + y1 + TRACK_WIDTH * math.sin(beta1) - y_offset, + ) + road2_l = ( + x2 - TRACK_WIDTH * math.cos(beta2) - x_offset, + y2 - TRACK_WIDTH * math.sin(beta2) - y_offset, + ) + road2_r = ( + x2 + TRACK_WIDTH * math.cos(beta2) - x_offset, + y2 + TRACK_WIDTH * math.sin(beta2) - y_offset, + ) + vertices = [road1_l, road1_r, road2_r, road2_l] + + try: + self.fd_tile.shape.vertices = vertices + except: + pass + t = self.world.CreateStaticBody(fixtures=self.fd_tile) + # t.userData = t + t.userData = {"tile": t, "index": i} + c = 0.01 * (i % 3) + t.color = [ROAD_COLOR[0] + c, ROAD_COLOR[1] + c, ROAD_COLOR[2] + c] + t.road_visited = False + t.road_friction = 1.0 + t.fixtures[0].sensor = True + self.road_poly.append(([road1_l, road1_r, road2_r, road2_l], t.color)) + self.road.append(t) + + if self.show_borders and border[i]: + side = np.sign(beta2 - beta1) + b1_l = ( + x1 + side * TRACK_WIDTH * math.cos(beta1) - x_offset, + y1 + side * TRACK_WIDTH * math.sin(beta1) - y_offset, + ) + b1_r = ( + x1 + side * (TRACK_WIDTH + BORDER) * math.cos(beta1) - x_offset, + y1 + side * (TRACK_WIDTH + BORDER) * math.sin(beta1) - y_offset, + ) + b2_l = ( + x2 + side * TRACK_WIDTH * math.cos(beta2) - x_offset, + y2 + side * TRACK_WIDTH * math.sin(beta2) - y_offset, + ) + b2_r = ( + x2 + side * (TRACK_WIDTH + BORDER) * math.cos(beta2) - x_offset, + y2 + side * (TRACK_WIDTH + BORDER) * math.sin(beta2) - y_offset, + ) + self.road_poly.append( + ([b1_l, b1_r, b2_r, b2_l], (1, 1, 1) if i % 2 == 0 else (1, 0, 0)) + ) + self.track = track + + self.complexity_info = geo_complexity.complexity(points) + + return True + + def _create_track_polar(self, control_points=None, show_borders=None): + if show_borders is None: + show_borders = self.show_borders + else: + show_borders = show_borders + + 
CHECKPOINTS = self.n_control_points + + self.x_offset = 0 + self.y_offset = 0 + + min_rad = TRACK_RAD * self.min_rad_ratio + max_rad = TRACK_RAD * self.max_rad_ratio + + # Create checkpoints + if control_points is not None: + checkpoints = control_points + self.start_alpha = 2 * math.pi * (-0.5) / self.n_control_points + else: + checkpoints = [] + for c in range(CHECKPOINTS): + noise = self.np_random.uniform(0, 2 * math.pi * 1 / CHECKPOINTS) + alpha = 2 * math.pi * c / CHECKPOINTS + noise + rad = self.np_random.uniform(min_rad, max_rad) + + if c == 0: + alpha = 0 + rad = 1.5 * TRACK_RAD + if c == CHECKPOINTS - 1: + alpha = 2 * math.pi * c / CHECKPOINTS + self.start_alpha = 2 * math.pi * (-0.5) / CHECKPOINTS + rad = 1.5 * TRACK_RAD + + checkpoints.append( + (alpha, rad * math.cos(alpha), rad * math.sin(alpha)) + ) + + self.track_data = checkpoints + + self.road = [] + + # Go from one checkpoint to another to create track + # x, y, beta = 1.5 * TRACK_RAD, 0, 0 + _, x, y = checkpoints[0] + beta = 0 + dest_i = 0 + laps = 0 + track = [] + no_freeze = 2500 + visited_other_side = False + while True: + alpha = math.atan2(y, x) + if visited_other_side and alpha > 0: + laps += 1 + visited_other_side = False + if alpha < 0: + visited_other_side = True + alpha += 2 * math.pi + + while True: # Find destination from checkpoints + failed = True + + while True: + dest_alpha, dest_x, dest_y = checkpoints[dest_i % len(checkpoints)] + if alpha <= dest_alpha: + failed = False + break + dest_i += 1 + if dest_i % len(checkpoints) == 0: + break + + if not failed: + break + + alpha -= 2 * math.pi + continue + + r1x = math.cos(beta) + r1y = math.sin(beta) + p1x = -r1y + p1y = r1x + dest_dx = dest_x - x # vector towards destination + dest_dy = dest_y - y + # destination vector projected on rad: + proj = r1x * dest_dx + r1y * dest_dy + while beta - alpha > 1.5 * math.pi: + beta -= 2 * math.pi + while beta - alpha < -1.5 * math.pi: + beta += 2 * math.pi + prev_beta = beta + proj *= SCALE + if proj > 0.3: + beta -= min(TRACK_TURN_RATE, abs(0.001 * proj)) + if proj < -0.3: + beta += min(TRACK_TURN_RATE, abs(0.001 * proj)) + x += p1x * TRACK_DETAIL_STEP + y += p1y * TRACK_DETAIL_STEP + track.append((alpha, prev_beta * 0.5 + beta * 0.5, x, y)) + if laps > 4: + break + no_freeze -= 1 + if no_freeze == 0: + break + + # Find closed loop range i1..i2, first loop should be ignored, second is OK + i1, i2 = -1, -1 + i = len(track) + while True: + i -= 1 + if i == 0: + return False # Failed + pass_through_start = ( + track[i][0] > self.start_alpha and track[i - 1][0] <= self.start_alpha + ) + if pass_through_start and i2 == -1: + i2 = i + elif pass_through_start and i1 == -1: + i1 = i + break + assert i1 != -1 + assert i2 != -1 + + track = track[i1 : i2 - 1] + + # Red-white border on hard turns + border = [False] * len(track) + if show_borders: + for i in range(len(track)): + good = True + oneside = 0 + for neg in range(BORDER_MIN_COUNT): + beta1 = track[i - neg - 0][1] + beta2 = track[i - neg - 1][1] + good &= abs(beta1 - beta2) > TRACK_TURN_RATE * 0.2 + oneside += np.sign(beta1 - beta2) + good &= abs(oneside) == BORDER_MIN_COUNT + border[i] = good + for i in range(len(track)): + for neg in range(BORDER_MIN_COUNT): + border[i - neg] |= border[i] + + # Create tiles + for i in range(len(track)): + alpha1, beta1, x1, y1 = track[i] + alpha2, beta2, x2, y2 = track[i - 1] + road1_l = ( + x1 - TRACK_WIDTH * math.cos(beta1), + y1 - TRACK_WIDTH * math.sin(beta1), + ) + road1_r = ( + x1 + TRACK_WIDTH * math.cos(beta1), + y1 + TRACK_WIDTH 
* math.sin(beta1), + ) + road2_l = ( + x2 - TRACK_WIDTH * math.cos(beta2), + y2 - TRACK_WIDTH * math.sin(beta2), + ) + road2_r = ( + x2 + TRACK_WIDTH * math.cos(beta2), + y2 + TRACK_WIDTH * math.sin(beta2), + ) + vertices = [road1_l, road1_r, road2_r, road2_l] + self.fd_tile.shape.vertices = vertices + t = self.world.CreateStaticBody(fixtures=self.fd_tile) + t.userData = t + c = 0.01 * (i % 3) + t.color = [ROAD_COLOR[0] + c, ROAD_COLOR[1] + c, ROAD_COLOR[2] + c] + t.road_visited = False + t.road_friction = 1.0 + t.fixtures[0].sensor = True + self.road_poly.append(([road1_l, road1_r, road2_r, road2_l], t.color)) + self.road.append(t) + if border[i]: + side = np.sign(beta2 - beta1) + b1_l = ( + x1 + side * TRACK_WIDTH * math.cos(beta1), + y1 + side * TRACK_WIDTH * math.sin(beta1), + ) + b1_r = ( + x1 + side * (TRACK_WIDTH + BORDER) * math.cos(beta1), + y1 + side * (TRACK_WIDTH + BORDER) * math.sin(beta1), + ) + b2_l = ( + x2 + side * TRACK_WIDTH * math.cos(beta2), + y2 + side * TRACK_WIDTH * math.sin(beta2), + ) + b2_r = ( + x2 + side * (TRACK_WIDTH + BORDER) * math.cos(beta2), + y2 + side * (TRACK_WIDTH + BORDER) * math.sin(beta2), + ) + self.road_poly.append( + ([b1_l, b1_r, b2_r, b2_l], (1, 1, 1) if i % 2 == 0 else (1, 0, 0)) + ) + self.track = track + + return True + + def reset_sparse_state(self): + if self.sparse_rewards: + self.accumulated_rewards = 0.0 + self.set_goal() + + def reset(self): + if self.fixed_environment: + self.seed(self.level_seed) + + self._destroy() + self.reward = 0.0 + self.prev_reward = 0.0 + self.tile_visited_count = 0 + self.t = 0.0 + self.road_poly = [] + self.track_data = None + + self.steps = 0 + + self._create_track() + + beta0, x0, y0 = self.track[0][1:4] + x0 -= self.x_offset + y0 -= self.y_offset + self.car = Car(self.world, beta0, x0, y0) + + self.goal_bin = None + self.reset_sparse_state() + + return self.step(None)[0] + + def step(self, action): + if action is not None: + self.car.steer(-action[0]) + self.car.gas(action[1]) + self.car.brake(action[2]) + + self.car.step(1.0 / FPS) + self.world.Step(1.0 / FPS, 6 * 30, 2 * 30) + self.t += 1.0 / FPS + + self.steps += 1 + + self.state = self.render("state_pixels") + + step_reward = 0 + done = False + if action is not None: # First step without action, called from reset() + self.reward -= 0.1 + # We actually don't want to count fuel spent, we want car to be faster. 
+ # self.reward -= 10 * self.car.fuel_spent / ENGINE_POWER + self.car.fuel_spent = 0.0 + step_reward = self.reward - self.prev_reward + self.prev_reward = self.reward + if self.tile_visited_count == len(self.track): + done = True + x, y = self.car.hull.position + if abs(x) > self.playfield or abs(y) > self.playfield: + done = True + step_reward = -100 + + if self.sparse_rewards: + self.accumulated_rewards += step_reward + + revealed_reward = 0 + if self.goal_reached: + revealed_reward = self.accumulated_rewards + + self.accumulated_rewards = 0.0 + done = True + else: + revealed_reward = step_reward + + if self.clip_reward: + revealed_reward = min( + max(revealed_reward, -self.clip_reward), self.clip_reward + ) + + return self.state, revealed_reward, done, {} + + def render(self, mode="human"): + assert mode in ["human", "state_pixels", "rgb_array", "level", "sketch"] + if self.viewer is None: + from envs.box2d import rendering + + self.viewer = rendering.Viewer(WINDOW_W, WINDOW_H) + self.score_label = pyglet.text.Label( + "0000", + font_size=36, + x=20, + y=WINDOW_H * 2.5 / 40.00, + anchor_x="left", + anchor_y="center", + color=(255, 255, 255, 255), + ) + self.transform = rendering.Transform() + + if "t" not in self.__dict__: + return # reset() not called yet + + # Animate zoom first second: + if self.birdseye or mode in ["level", "sketch"]: + zoom_coef = self.full_zoom + else: + zoom_coef = ZOOM + if self.animate_zoom: + zoom = 0.1 * SCALE * max(1 - self.t, 0) + zoom_coef * SCALE * min(self.t, 1) + else: + zoom = zoom_coef * SCALE + + scroll_x = self.car.hull.position[0] + scroll_y = self.car.hull.position[1] + angle = -self.car.hull.angle + vel = self.car.hull.linearVelocity + if np.linalg.norm(vel) > 0.5: + angle = math.atan2(vel[0], vel[1]) + self.transform.set_scale(zoom, zoom) + + if self.birdseye or mode in ["level", "sketch"]: + self.transform.set_translation( + WINDOW_W / 2, + WINDOW_H / 2, + ) + self.transform.set_rotation(0) + else: + self.transform.set_translation( + WINDOW_W / 2 + - ( + scroll_x * zoom * math.cos(angle) + - scroll_y * zoom * math.sin(angle) + ), + WINDOW_H / 4 + - ( + scroll_x * zoom * math.sin(angle) + + scroll_y * zoom * math.cos(angle) + ), + ) + self.transform.set_rotation(angle) + + self.car.draw(self.viewer, mode != "state_pixels") + + arr = None + win = self.viewer.window + win.switch_to() + win.dispatch_events() + + win.clear() + t = self.transform + if mode == "rgb_array": + VP_W = VIDEO_W + VP_H = VIDEO_H + elif mode in ["state_pixels", "sketch"]: + VP_W = STATE_W + VP_H = STATE_H + else: + pixel_scale = 1 + if hasattr(win.context, "_nscontext"): + pixel_scale = ( + win.context._nscontext.view().backingScaleFactor() + ) # pylint: disable=protected-access + VP_W = int(pixel_scale * WINDOW_W) + VP_H = int(pixel_scale * WINDOW_H) + + gl.glViewport(0, 0, VP_W, VP_H) + t.enable() + self.render_road() + for geom in self.viewer.onetime_geoms: + geom.render() + self.viewer.onetime_geoms = [] + t.disable() + + if mode not in ["level", "sketch"] and self.show_indicators: + self.render_indicators(WINDOW_W, WINDOW_H) + + if mode == "human": + win.flip() + return self.viewer.isopen + + image_data = ( + pyglet.image.get_buffer_manager().get_color_buffer().get_image_data() + ) + arr = np.fromstring(image_data.get_data(), dtype=np.uint8, sep="") + arr = arr.reshape(VP_H, VP_W, 4) + arr = arr[::-1, :, 0:3] + + return arr + + def close(self): + if self.viewer is not None: + self.viewer.close() + self.viewer = None + + def render_road(self): + colors = [0.4, 0.8, 0.4, 
1.0] * 4 + polygons_ = [ + +self.playfield, + +self.playfield, + 0, + +self.playfield, + -self.playfield, + 0, + -self.playfield, + -self.playfield, + 0, + -self.playfield, + +self.playfield, + 0, + ] + + k = self.playfield / 20.0 + colors.extend([0.4, 0.9, 0.4, 1.0] * 4 * 20 * 20) + for x in range(-20, 20, 2): + for y in range(-20, 20, 2): + polygons_.extend( + [ + k * x + k, + k * y + 0, + 0, + k * x + 0, + k * y + 0, + 0, + k * x + 0, + k * y + k, + 0, + k * x + k, + k * y + k, + 0, + ] + ) + + for poly, color in self.road_poly: + colors.extend([color[0], color[1], color[2], 1] * len(poly)) + for p in poly: + polygons_.extend([p[0], p[1], 0]) + + vl = pyglet.graphics.vertex_list( + len(polygons_) // 3, ("v3f", polygons_), ("c4f", colors) # gl.GL_QUADS, + ) + vl.draw(gl.GL_QUADS) + vl.delete() + + def render_indicators(self, W, H): + s = W / 40.0 + h = H / 40.0 + colors = [0, 0, 0, 1] * 4 + polygons = [W, 0, 0, W, 5 * h, 0, 0, 5 * h, 0, 0, 0, 0] + + def vertical_ind(place, val, color): + colors.extend([color[0], color[1], color[2], 1] * 4) + polygons.extend( + [ + place * s, + h + h * val, + 0, + (place + 1) * s, + h + h * val, + 0, + (place + 1) * s, + h, + 0, + (place + 0) * s, + h, + 0, + ] + ) + + def horiz_ind(place, val, color): + colors.extend([color[0], color[1], color[2], 1] * 4) + polygons.extend( + [ + (place + 0) * s, + 4 * h, + 0, + (place + val) * s, + 4 * h, + 0, + (place + val) * s, + 2 * h, + 0, + (place + 0) * s, + 2 * h, + 0, + ] + ) + + true_speed = np.sqrt( + np.square(self.car.hull.linearVelocity[0]) + + np.square(self.car.hull.linearVelocity[1]) + ) + + vertical_ind(5, 0.02 * true_speed, (1, 1, 1)) + vertical_ind(7, 0.01 * self.car.wheels[0].omega, (0.0, 0, 1)) # ABS sensors + vertical_ind(8, 0.01 * self.car.wheels[1].omega, (0.0, 0, 1)) + vertical_ind(9, 0.01 * self.car.wheels[2].omega, (0.2, 0, 1)) + vertical_ind(10, 0.01 * self.car.wheels[3].omega, (0.2, 0, 1)) + horiz_ind(20, -10.0 * self.car.wheels[0].joint.angle, (0, 1, 0)) + horiz_ind(30, -0.8 * self.car.hull.angularVelocity, (1, 0, 0)) + vl = pyglet.graphics.vertex_list( + len(polygons) // 3, ("v3f", polygons), ("c4f", colors) # gl.GL_QUADS, + ) + vl.draw(gl.GL_QUADS) + vl.delete() + self.score_label.text = "%04i" % self.reward + self.score_label.draw() + + +if hasattr(__loader__, "name"): + module_path = __loader__.name +elif hasattr(__loader__, "fullname"): + module_path = __loader__.fullname + +try: + gym_register(id="CarRacing-Bezier-v0", entry_point=module_path + ":CarRacingBezier") +except: + pass + +if __name__ == "__main__": + from pyglet.window import key + + parser = argparse.ArgumentParser() + parser.add_argument( + "--track_name", type=str, default=None, help="Name of preexisting track." 
+ ) + parser.add_argument( + "--birdseye", + action="store_true", + default=False, + help="Show a fixed birdseye view of track.", + ) + parser.add_argument("--seed", type=int, default=None, help="PRNG seed.") + args = parser.parse_args() + + a = np.array([0.0, 0.0, 0.0]) + + def key_press(k, mod): + global restart + if k == 0xFF0D: + restart = True + if k == key.LEFT: + a[0] = -1.0 + if k == key.RIGHT: + a[0] = +1.0 + if k == key.UP: + a[1] = +1.0 + if k == key.DOWN: + a[2] = +0.8 # set 1.0 for wheels to block to zero rotation + + def key_release(k, mod): + if k == key.LEFT and a[0] == -1.0: + a[0] = 0 + if k == key.RIGHT and a[0] == +1.0: + a[0] = 0 + if k == key.UP: + a[1] = 0 + if k == key.DOWN: + a[2] = 0 + + env = CarRacingBezier( + track_name=args.track_name, birdseye=args.birdseye, seed=args.seed + ) + env.render() + env.viewer.window.on_key_press = key_press + env.viewer.window.on_key_release = key_release + record_video = False + if record_video: + from gym.wrappers.monitor import Monitor + + env = Monitor(env, "videos/", force=True) + isopen = True + while isopen: + env.reset() + total_reward = 0.0 + steps = 0 + restart = False + while True: + s, r, done, info = env.step(a) + total_reward += r + if steps % 200 == 0 or done: + print("\naction " + str(["{:+0.2f}".format(x) for x in a])) + print("step {} total_reward {:+0.2f}".format(steps, total_reward)) + steps += 1 + isopen = env.render() + if done or restart or isopen is False: + break + env.close() diff --git a/syllabus/examples/experimental/multi_car_racing.py b/syllabus/examples/experimental/multi_car_racing.py index 3dc83450..29e83872 100644 --- a/syllabus/examples/experimental/multi_car_racing.py +++ b/syllabus/examples/experimental/multi_car_racing.py @@ -12,7 +12,11 @@ from torch.distributions.normal import Normal from tqdm.auto import tqdm -from syllabus.core import TaskWrapper, make_multiprocessing_curriculum +from syllabus.core import ( + PettingZooMultiProcessingSyncWrapper, + TaskWrapper, + make_multiprocessing_curriculum, +) from syllabus.curricula import DomainRandomization from syllabus.task_space import TaskSpace @@ -217,6 +221,7 @@ def step(self, action: dict[AgentID, ActionType]) -> tuple[ env = MultiCarRacingParallelWrapper(env=env, n_agents=n_agents) curriculum = DomainRandomization(env.task_space) curriculum, task_queue, update_queue = make_multiprocessing_curriculum(curriculum) + env = PettingZooMultiProcessingSyncWrapper(env, task_queue, update_queue) """ LEARNER SETUP """ agent = Agent().to(device) @@ -271,26 +276,6 @@ def step(self, action: dict[AgentID, ActionType]) -> tuple[ # compute episodic return total_episodic_return += rb_rewards[step].cpu().numpy() - # Update curriculum - # TODO: adapt to DR - if global_cycles % num_steps == 0: - update = { - "update_type": "on_demand", - "metrics": { - "action_log_dist": logprobs, - "value": values, - "next_value": ( - agent.get_value(next_obs) - if step == num_steps - 1 - else None - ), - "rew": rb_rewards[step], - "masks": torch.Tensor(1 - np.array(list(dones.values()))), - "tasks": [env.unwrapped.task], - }, - } - curriculum.update_curriculum(update) - # if we reach the end of the episode if any([dones[a] for a in dones]): print(f"Breaking early at step {step} due to done signal.") diff --git a/syllabus/examples/experimental/multi_car_racing_v2.py b/syllabus/examples/experimental/multi_car_racing_v2.py new file mode 100644 index 00000000..e2e5c77d --- /dev/null +++ b/syllabus/examples/experimental/multi_car_racing_v2.py @@ -0,0 +1,432 @@ +# flake8: noqa: 
F401 +import argparse +from typing import TypeVar + +import gym +import gym_multi_car_racing +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from gymnasium import spaces +from torch.distributions import Beta +from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler +from tqdm.auto import tqdm + +from syllabus.core import ( + PettingZooMultiProcessingSyncWrapper, + TaskWrapper, + make_multiprocessing_curriculum, +) +from syllabus.curricula import DomainRandomization +from syllabus.task_space.task_space import TaskSpace + +ObsType = TypeVar("ObsType") +ActionType = TypeVar("ActionType") +AgentID = TypeVar("AgentID") + +parser = argparse.ArgumentParser(description="Train a PPO agent for the CarRacing-v0") +parser.add_argument( + "--gamma", + type=float, + default=0.99, + metavar="G", + help="discount factor (default: 0.99)", +) +parser.add_argument( + "--action-repeat", + type=int, + default=8, + metavar="N", + help="repeat action in N frames (default: 8)", +) +parser.add_argument( + "--img-stack", + type=int, + default=4, + metavar="N", + help="stack N image in a state (default: 4)", +) +parser.add_argument( + "--seed", type=int, default=0, metavar="N", help="random seed (default: 0)" +) +parser.add_argument("--render", action="store_true", help="render the environment") +parser.add_argument("--vis", action="store_true", help="use visdom") +parser.add_argument( + "--log-interval", + type=int, + default=10, + metavar="N", + help="interval between training status logs (default: 10)", +) +args = parser.parse_args() + +use_cuda = torch.cuda.is_available() +device = torch.device("cuda" if use_cuda else "cpu") +torch.manual_seed(args.seed) +if use_cuda: + torch.cuda.manual_seed(args.seed) + +transition = np.dtype( + [ + ("s", np.float64, (args.img_stack, 96, 96)), + ("a", np.float64, (3,)), + ("a_logp", np.float64), + ("r", np.float64), + ("s_", np.float64, (args.img_stack, 96, 96)), + ] +) + + +# class Env: +# """ +# Environment wrapper for CarRacing +# """ + +# def __init__(self): +# self.env = gym.make("CarRacing-v0") +# self.env.seed(args.seed) +# self.reward_threshold = self.env.spec.reward_threshold + +# def reset(self): +# self.counter = 0 +# self.av_r = self.reward_memory() + +# self.die = False +# img_rgb = self.env.reset() +# img_gray = self.rgb2gray(img_rgb) +# self.stack = [img_gray] * args.img_stack # four frames for decision +# return np.array(self.stack) + +# def step(self, action): +# total_reward = 0 +# for i in range(args.action_repeat): +# img_rgb, reward, die, _ = self.env.step(action) +# # don't penalize "die state" +# if die: +# reward += 100 +# # green penalty +# if np.mean(img_rgb[:, :, 1]) > 185.0: +# reward -= 0.05 +# total_reward += reward +# # if no reward recently, end the episode +# done = True if self.av_r(reward) <= -0.1 else False +# if done or die: +# break +# img_gray = self.rgb2gray(img_rgb) +# self.stack.pop(0) +# self.stack.append(img_gray) +# assert len(self.stack) == args.img_stack +# return np.array(self.stack), total_reward, done, die + +# def render(self, *arg): +# self.env.render(*arg) + +# @staticmethod +# def rgb2gray(rgb, norm=True): +# # rgb image -> gray [0, 1] +# gray = np.dot(rgb[..., :], [0.299, 0.587, 0.114]) +# if norm: +# # normalize +# gray = gray / 128.0 - 1.0 +# return gray + +# @staticmethod +# def reward_memory(): +# # record reward for last 100 steps +# count = 0 +# length = 100 +# history = np.zeros(length) + +# def memory(reward): +# nonlocal count +# 
history[count] = reward +# count = (count + 1) % length +# return np.mean(history) + +# return memory + + +class MultiCarRacingParallelWrapper(TaskWrapper): + """ + Wrapper ensuring compatibility with the PettingZoo Parallel API. + + Car Racing Environment: + * Action shape: ``n_agents`` * `Box([-1. 0. 0.], 1.0, (3,), float32)` + * Observation shape: ``n_agents`` * `Box(0, 255, (96, 96, 3), uint8)` + * Done: ``done`` is a single boolean value + * Info: ``info`` is unused and represented as an empty dictionary + """ + + def __init__(self, n_agents, *args, **kwargs): + super().__init__(*args, **kwargs) + self.n_agents = n_agents + self.task = None + self.episode_return = 0 + self.task_space = TaskSpace( + spaces.Box( + low=np.array([-1.0, 0.0, 0.0]), + high=np.array([1.0, 1.0, 1.0]), + shape=(3,), + dtype=np.float32, + ) + ) + self.possible_agents = np.arange( + self.n_agents + ) # TODO: is this the intended use? + + def _actions_pz_to_np(self, action: dict[AgentID, ActionType]) -> np.ndarray: + """ + Converts actions defined in PZ format to a numpy array. + """ + assert action.__len__() == self.n_agents + + action = np.array(list(action.values())) + assert action.shape == (self.n_agents, 3) + return action + + def _np_array_to_pz_dict(self, array: np.ndarray) -> dict[int : np.ndarray]: + """ + Returns a dictionary containing individual observations for each agent. + """ + out = {} + for idx, i in enumerate(array): + out[idx] = i + return out + + def _singleton_to_pz_dict(self, value: bool) -> dict[int:bool]: + """ + Broadcasts the `done` and `trunc` flags to dictionaries keyed by agent id. + """ + return {idx: value for idx in range(self.n_agents)} + + def reset(self) -> tuple[dict[AgentID, ObsType], dict[AgentID, dict]]: + """ + Resets the environment and returns a dictionary of observations + keyed by agent ID. + """ + # TODO: what is the second output (dict[AgentID, dict]])? + obs = self.env.reset() + pz_obs = self._np_array_to_pz_dict(obs) + + return pz_obs + + def step(self, action: dict[AgentID, ActionType]) -> tuple[ + dict[AgentID, ObsType], + dict[AgentID, float], + dict[AgentID, bool], + dict[AgentID, bool], + dict[AgentID, dict], + ]: + """ + Takes inputs in the PettingZoo (PZ) Parallel API format, performs a step and + returns outputs in PZ format. 
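+
+        Returns, in order: observation, reward, termination, truncation and info
+        dictionaries, each keyed by agent id, matching the annotation above.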
+ """ + # convert action to numpy format to perform the env step + np_action = self._actions_pz_to_np(action) + obs, rew, term, info = self.env.step(np_action) + trunc = 0 # there is no `truncated` flag in this environment + self.task_completion = self._task_completion(obs, rew, term, trunc, info) + # convert outputs back to PZ format + obs, rew = tuple(map(self._np_array_to_pz_dict, [obs, rew])) + term, trunc, info = tuple( + map(self._singleton_to_pz_dict, [term, trunc, self.task_completion]) + ) + + return self.observation(obs), rew, term, trunc, info + + +class Net(nn.Module): + """ + Actor-Critic Network for PPO + """ + + def __init__(self): + super(Net, self).__init__() + self.cnn_base = nn.Sequential( # input shape (4, 96, 96) + nn.Conv2d(args.img_stack, 8, kernel_size=4, stride=2), + nn.ReLU(), + nn.Conv2d(8, 16, kernel_size=3, stride=2), # (8, 47, 47) + nn.ReLU(), + nn.Conv2d(16, 32, kernel_size=3, stride=2), # (16, 23, 23) + nn.ReLU(), + nn.Conv2d(32, 64, kernel_size=3, stride=2), # (32, 11, 11) + nn.ReLU(), + nn.Conv2d(64, 128, kernel_size=3, stride=1), # (64, 5, 5) + nn.ReLU(), + nn.Conv2d(128, 256, kernel_size=3, stride=1), # (128, 3, 3) + nn.ReLU(), + ) # output shape (256, 1, 1) + self.v = nn.Sequential(nn.Linear(256, 100), nn.ReLU(), nn.Linear(100, 1)) + self.fc = nn.Sequential(nn.Linear(256, 100), nn.ReLU()) + self.alpha_head = nn.Sequential(nn.Linear(100, 3), nn.Softplus()) + self.beta_head = nn.Sequential(nn.Linear(100, 3), nn.Softplus()) + self.apply(self._weights_init) + + @staticmethod + def _weights_init(m): + if isinstance(m, nn.Conv2d): + nn.init.xavier_uniform_(m.weight, gain=nn.init.calculate_gain("relu")) + nn.init.constant_(m.bias, 0.1) + + def forward(self, x): + x = self.cnn_base(x) + x = x.view(-1, 256) + v = self.v(x) + x = self.fc(x) + alpha = self.alpha_head(x) + 1 + beta = self.beta_head(x) + 1 + + return (alpha, beta), v + + +class Agent: + """ + Agent for training + """ + + max_grad_norm = 0.5 + clip_param = 0.1 # epsilon in clipped loss + ppo_epoch = 10 + buffer_capacity, batch_size = 2000, 128 + + def __init__(self): + self.training_step = 0 + self.net = Net().double().to(device) + self.buffer = np.empty(self.buffer_capacity, dtype=transition) + self.counter = 0 + + self.optimizer = optim.Adam(self.net.parameters(), lr=1e-3) + + def select_action(self, state): + state = torch.from_numpy(state).double().to(device).unsqueeze(0) + with torch.no_grad(): + alpha, beta = self.net(state)[0] + dist = Beta(alpha, beta) + action = dist.sample() + a_logp = dist.log_prob(action).sum(dim=1) + + action = action.squeeze().cpu().numpy() + a_logp = a_logp.item() + return action, a_logp + + def save_param(self): + torch.save(self.net.state_dict(), "param/ppo_net_params.pkl") + + def store(self, transition): + self.buffer[self.counter] = transition + self.counter += 1 + if self.counter == self.buffer_capacity: + self.counter = 0 + return True + else: + return False + + def update(self): + self.training_step += 1 + + s = torch.tensor(self.buffer["s"], dtype=torch.double).to(device) + a = torch.tensor(self.buffer["a"], dtype=torch.double).to(device) + r = torch.tensor(self.buffer["r"], dtype=torch.double).to(device).view(-1, 1) + s_ = torch.tensor(self.buffer["s_"], dtype=torch.double).to(device) + + old_a_logp = ( + torch.tensor(self.buffer["a_logp"], dtype=torch.double) + .to(device) + .view(-1, 1) + ) + + with torch.no_grad(): + target_v = r + args.gamma * self.net(s_)[1] + adv = target_v - self.net(s)[1] + # adv = (adv - adv.mean()) / (adv.std() + 1e-8) + + for _ in 
range(self.ppo_epoch): + for index in BatchSampler( + SubsetRandomSampler(range(self.buffer_capacity)), self.batch_size, False + ): + + alpha, beta = self.net(s[index])[0] + dist = Beta(alpha, beta) + a_logp = dist.log_prob(a[index]).sum(dim=1, keepdim=True) + ratio = torch.exp(a_logp - old_a_logp[index]) + + surr1 = ratio * adv[index] + surr2 = ( + torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) + * adv[index] + ) + action_loss = -torch.min(surr1, surr2).mean() + value_loss = F.smooth_l1_loss(self.net(s[index])[1], target_v[index]) + loss = action_loss + 2.0 * value_loss + + self.optimizer.zero_grad() + loss.backward() + # nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm) + self.optimizer.step() + + +if __name__ == "__main__": + agent = Agent() + """ CURRICULUM SETUP """ + n_agents = 2 + env = gym.make( + "MultiCarRacing-v0", + num_agents=n_agents, + direction="CCW", + use_random_direction=True, + backwards_flag=True, + h_ratio=0.25, + use_ego_color=False, + ) + env = MultiCarRacingParallelWrapper(env=env, n_agents=n_agents) + curriculum = DomainRandomization(env.task_space) + curriculum, task_queue, update_queue = make_multiprocessing_curriculum(curriculum) + env = PettingZooMultiProcessingSyncWrapper( + env, + task_queue, + update_queue, + update_on_step=False, + task_space=env.task_space, + ) + + n_episodes = 10 + n_steps = 100 + + training_records = [] + running_score = 0 + state = env.reset() + for i_ep in tqdm(range(n_episodes)): + score = 0 + state = env.reset() + + for t in tqdm(range(n_steps)): + action, a_logp = agent.select_action(state) + state_, reward, done, die = env.step( + action * np.array([2.0, 1.0, 1.0]) + np.array([-1.0, 0.0, 0.0]) + ) + if args.render: + env.render() + if agent.store((state, action, a_logp, reward, state_)): + print("updating") + agent.update() + score += reward + state = state_ + if done or die: + break + running_score = running_score * 0.99 + score * 0.01 + + if i_ep % args.log_interval == 0: + print( + "Ep {}\tLast score: {:.2f}\tMoving average score: {:.2f}".format( + i_ep, score, running_score + ) + ) + # agent.save_param() + if running_score > env.reward_threshold: + print( + f"""Solved! Running reward is now {running_score} and the last \\ + episode runs to""" + ) + break
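For reviewers, a few self-contained sketches of the patterns the new files rely on. First, the PettingZoo-compatibility shims in MultiCarRacingParallelWrapper reduce to two conversions: a batched (n_agents, ...) array becomes a dict keyed by agent index, and a single done/trunc flag is broadcast to every agent. A minimal standalone version (the function names are illustrative, not the wrapper's API; note the comma in the dict[int, np.ndarray] annotation, which is what the wrapper's slice-style hints intend):

import numpy as np


def array_to_agent_dict(batched: np.ndarray) -> dict[int, np.ndarray]:
    """Split a (n_agents, ...) array into {agent_id: per-agent slice}."""
    return {idx: row for idx, row in enumerate(batched)}


def broadcast_flag(value: bool, n_agents: int) -> dict[int, bool]:
    """Broadcast a single done/trunc flag to a dict keyed by agent id."""
    return {idx: value for idx in range(n_agents)}


obs = np.zeros((2, 96, 96, 3), dtype=np.uint8)  # two agents, image observations
per_agent_obs = array_to_agent_dict(obs)
dones = broadcast_flag(False, n_agents=2)
assert set(per_agent_obs) == {0, 1} and dones == {0: False, 1: False}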
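The + 1 applied to the Softplus heads in Net.forward keeps both Beta parameters above 1, so the action distribution on [0, 1] is unimodal rather than U-shaped. A minimal illustration using only torch.distributions:

import torch
from torch.distributions import Beta

raw = torch.nn.functional.softplus(torch.zeros(3))  # head output, strictly positive
alpha, beta = raw + 1, raw + 1                       # shift both parameters above 1
dist = Beta(alpha, beta)
sample = dist.sample()                               # lies in [0, 1]^3
assert torch.all((sample >= 0) & (sample <= 1))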
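Agent stores rollouts in a structured NumPy array built from the transition dtype and later indexes it by field name. The same pattern with smaller, illustrative shapes:

import numpy as np

# Same idea as the `transition` dtype above, shrunk for illustration.
small_transition = np.dtype(
    [("s", np.float64, (2,)), ("a", np.float64), ("r", np.float64)]
)
buffer = np.empty(4, dtype=small_transition)

buffer[0] = (np.array([0.1, 0.2]), 1.0, -0.5)  # one transition stored as a tuple
assert buffer["s"].shape == (4, 2)             # field access yields a (capacity, ...) view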
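The inner loop of Agent.update is a standard PPO clipped-surrogate step with a smooth-L1 value loss weighted by 2.0. A condensed restatement as one function (the name ppo_losses and the toy tensors are illustrative):

import torch
import torch.nn.functional as F


def ppo_losses(new_logp, old_logp, adv, values, target_v, clip_param=0.1):
    """Clipped surrogate policy loss plus smooth-L1 value loss, mirroring Agent.update."""
    ratio = torch.exp(new_logp - old_logp)
    surr1 = ratio * adv
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    action_loss = -torch.min(surr1, surr2).mean()
    value_loss = F.smooth_l1_loss(values, target_v)
    return action_loss + 2.0 * value_loss


loss = ppo_losses(
    new_logp=torch.zeros(4, 1),
    old_logp=torch.zeros(4, 1),
    adv=torch.ones(4, 1),
    values=torch.zeros(4, 1),
    target_v=torch.ones(4, 1),
)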
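Finally, the training loop rescales each Beta sample from [0, 1]^3 into CarRacing's native action space, steering in [-1, 1] with gas and brake in [0, 1], via action * [2, 1, 1] + [-1, 0, 0]. A minimal sketch of that mapping (the helper name is illustrative):

import numpy as np


def beta_to_car_action(beta_sample: np.ndarray) -> np.ndarray:
    """Rescale a Beta sample in [0, 1]^3 to CarRacing's native action space."""
    # steering: [0, 1] -> [-1, 1]; gas and brake stay in [0, 1]
    return beta_sample * np.array([2.0, 1.0, 1.0]) + np.array([-1.0, 0.0, 0.0])


# A centred sample maps to zero steering, half gas and half brake.
assert np.allclose(beta_to_car_action(np.array([0.5, 0.5, 0.5])), [0.0, 0.5, 0.5])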