From f0a433df01373e66888151a1773a90e651fb1c82 Mon Sep 17 00:00:00 2001
From: Mannat Singh <mannatsingh@fb.com>
Date: Fri, 17 Apr 2020 13:31:27 -0700
Subject: [PATCH 1/5] Remove redundant passing of head.unique_id

Summary:
When setting heads, we do the following currently -
```
model.set_heads({"block3-2": {head.unique_id: head}})
```
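This structure is redundant, since the unique_id is already available as `head.unique_id`. `set_heads()` now takes a list of heads per block (and `get_heads()` correspondingly returns a list of heads per block instead of a dict), which gives the cleaner API I want to show in a tutorial I am writing -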
```
model.set_heads({"block3-2": [head]})
```

Differential Revision: D21096621

fbshipit-source-id: 65c1eb5caacf2970d60eb050061aeadf69db94d9
---
 classy_vision/models/__init__.py     |  4 ++--
 classy_vision/models/classy_model.py | 17 +++++++++++------
 tutorials/fine_tuning.ipynb          |  4 ++--
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/classy_vision/models/__init__.py b/classy_vision/models/__init__.py
index 97c841fded..ecaa728cc7 100644
--- a/classy_vision/models/__init__.py
+++ b/classy_vision/models/__init__.py
@@ -69,7 +69,7 @@ def build_model(config):
     assert config["name"] in MODEL_REGISTRY, "unknown model"
     model = MODEL_REGISTRY[config["name"]].from_config(config)
     if "heads" in config:
-        heads = defaultdict(dict)
+        heads = defaultdict(list)
         for head_config in config["heads"]:
             assert "fork_block" in head_config, "Expect fork_block in config"
             fork_block = head_config["fork_block"]
@@ -77,7 +77,7 @@ def build_model(config):
             del updated_config["fork_block"]
 
             head = build_head(updated_config)
-            heads[fork_block][head.unique_id] = head
+            heads[fork_block].append(head)
         model.set_heads(heads)
     return model
 
diff --git a/classy_vision/models/classy_model.py b/classy_vision/models/classy_model.py
index de925cfd59..d5547a9ac8 100644
--- a/classy_vision/models/classy_model.py
+++ b/classy_vision/models/classy_model.py
@@ -342,7 +342,7 @@ def _make_module_attachable(self, module, module_name):
             found = found or found_in_child
         return found
 
-    def set_heads(self, heads: Dict[str, Dict[str, ClassyHead]]):
+    def set_heads(self, heads: Dict[str, List[ClassyHead]]):
         """Attach all the heads to corresponding blocks.
 
         A head is expected to be a ClassyHead object. For more
@@ -350,7 +350,7 @@ def set_heads(self, heads: Dict[str, Dict[str, ClassyHead]]):
 
         Args:
             heads (Dict): a mapping between attachable block name
-                and a dictionary of heads attached to that block. For
+                and a list of heads attached to that block. For
                 example, if you have two different teams that want to
                 attach two different heads for downstream classifiers to
                 the 15th block, then they would use:
@@ -358,7 +358,7 @@ def set_heads(self, heads: Dict[str, Dict[str, ClassyHead]]):
                 .. code-block:: python
 
                   heads = {"block15":
-                      {"team1": classifier_head1, "team2": classifier_head2}
+                      [classifier_head1, classifier_head2]
                   }
         """
         self.clear_heads()
@@ -367,11 +367,13 @@ def set_heads(self, heads: Dict[str, Dict[str, ClassyHead]]):
         for block_name, block_heads in heads.items():
             if not self._make_module_attachable(self, block_name):
                 raise KeyError(f"{block_name} not found in the model")
-            for head in block_heads.values():
+            for head in block_heads:
                 if head.unique_id in head_ids:
                     raise ValueError("head id {} already exists".format(head.unique_id))
                 head_ids.add(head.unique_id)
-            self._heads[block_name] = nn.ModuleDict(block_heads)
+            self._heads[block_name] = nn.ModuleDict(
+                {head.unique_id: head for head in block_heads}
+            )
 
     def get_heads(self):
         """Returns the heads on the model
@@ -381,7 +383,10 @@ def get_heads(self):
         attached to that block.
 
         """
-        return {block_name: dict(heads) for block_name, heads in self._heads.items()}
+        return {
+            block_name: list(heads.values())
+            for block_name, heads in self._heads.items()
+        }
 
     @property
     def head_outputs(self):
diff --git a/tutorials/fine_tuning.ipynb b/tutorials/fine_tuning.ipynb
index 6a2f85e0b7..0ab3a450a9 100644
--- a/tutorials/fine_tuning.ipynb
+++ b/tutorials/fine_tuning.ipynb
@@ -149,7 +149,7 @@
    },
    "outputs": [],
    "source": [
-    "model.set_heads({\"block3-2\": {head.unique_id: head}})"
+    "model.set_heads({\"block3-2\": [head]})"
    ]
   },
   {
@@ -443,7 +443,7 @@
    },
    "outputs": [],
    "source": [
-    "model.set_heads({\"block3-2\": {head.unique_id: head}})"
+    "model.set_heads({\"block3-2\": [head]})"
    ]
   },
   {

From 45bf0b4e6aac82e23a427b4eb9292c8d8c6455a6 Mon Sep 17 00:00:00 2001
From: Mannat Singh <mannatsingh@fb.com>
Date: Fri, 17 Apr 2020 20:14:34 -0700
Subject: [PATCH 2/5] Remove redundant passing of head.unique_id (#484)

Summary:
Pull Request resolved: https://github.com/facebookresearch/ClassyVision/pull/484

When setting heads, we do the following currently -
```
model.set_heads({"block3-2": {head.unique_id: head}})
```
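This structure is redundant, since the unique_id is already available as `head.unique_id`. `set_heads()` now takes a list of heads per block (and `get_heads()` correspondingly returns a list of heads per block instead of a dict, so callers iterating over `.values()` / `.items()` of the per-block heads are updated), which gives the cleaner API I want to show in a tutorial I am writing -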
```
model.set_heads({"block3-2": [head]})
```

Differential Revision: D21096621

fbshipit-source-id: 4dfc87e172390c43e9cd7fce98ad987099542ca2
---
 classy_vision/models/__init__.py              |  4 ++--
 classy_vision/models/classy_model.py          | 19 ++++++++++++-------
 classy_vision/tasks/fine_tuning_task.py       |  4 ++--
 .../manual/models_classy_vision_model_test.py |  4 ++--
 test/models_classy_block_test.py              | 17 +++++++----------
 test/models_classy_model_test.py              |  2 +-
 tutorials/fine_tuning.ipynb                   |  4 ++--
 7 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/classy_vision/models/__init__.py b/classy_vision/models/__init__.py
index 97c841fded..ecaa728cc7 100644
--- a/classy_vision/models/__init__.py
+++ b/classy_vision/models/__init__.py
@@ -69,7 +69,7 @@ def build_model(config):
     assert config["name"] in MODEL_REGISTRY, "unknown model"
     model = MODEL_REGISTRY[config["name"]].from_config(config)
     if "heads" in config:
-        heads = defaultdict(dict)
+        heads = defaultdict(list)
         for head_config in config["heads"]:
             assert "fork_block" in head_config, "Expect fork_block in config"
             fork_block = head_config["fork_block"]
@@ -77,7 +77,7 @@ def build_model(config):
             del updated_config["fork_block"]
 
             head = build_head(updated_config)
-            heads[fork_block][head.unique_id] = head
+            heads[fork_block].append(head)
         model.set_heads(heads)
     return model
 
diff --git a/classy_vision/models/classy_model.py b/classy_vision/models/classy_model.py
index de925cfd59..1cc65839d1 100644
--- a/classy_vision/models/classy_model.py
+++ b/classy_vision/models/classy_model.py
@@ -234,7 +234,7 @@ def get_classy_state(self, deep_copy=False):
         head_state_dict = {}
         for block, heads in attached_heads.items():
             head_state_dict[block] = {
-                head_name: head.state_dict() for head_name, head in heads.items()
+                head.unique_id: head.state_dict() for head in heads
             }
         model_state_dict = {
             "model": {"trunk": trunk_state_dict, "heads": head_state_dict}
@@ -342,7 +342,7 @@ def _make_module_attachable(self, module, module_name):
             found = found or found_in_child
         return found
 
-    def set_heads(self, heads: Dict[str, Dict[str, ClassyHead]]):
+    def set_heads(self, heads: Dict[str, List[ClassyHead]]):
         """Attach all the heads to corresponding blocks.
 
         A head is expected to be a ClassyHead object. For more
@@ -350,7 +350,7 @@ def set_heads(self, heads: Dict[str, Dict[str, ClassyHead]]):
 
         Args:
             heads (Dict): a mapping between attachable block name
-                and a dictionary of heads attached to that block. For
+                and a list of heads attached to that block. For
                 example, if you have two different teams that want to
                 attach two different heads for downstream classifiers to
                 the 15th block, then they would use:
@@ -358,7 +358,7 @@ def set_heads(self, heads: Dict[str, Dict[str, ClassyHead]]):
                 .. code-block:: python
 
                   heads = {"block15":
-                      {"team1": classifier_head1, "team2": classifier_head2}
+                      [classifier_head1, classifier_head2]
                   }
         """
         self.clear_heads()
@@ -367,11 +367,13 @@ def set_heads(self, heads: Dict[str, Dict[str, ClassyHead]]):
         for block_name, block_heads in heads.items():
             if not self._make_module_attachable(self, block_name):
                 raise KeyError(f"{block_name} not found in the model")
-            for head in block_heads.values():
+            for head in block_heads:
                 if head.unique_id in head_ids:
                     raise ValueError("head id {} already exists".format(head.unique_id))
                 head_ids.add(head.unique_id)
-            self._heads[block_name] = nn.ModuleDict(block_heads)
+            self._heads[block_name] = nn.ModuleDict(
+                {head.unique_id: head for head in block_heads}
+            )
 
     def get_heads(self):
         """Returns the heads on the model
@@ -381,7 +383,10 @@ def get_heads(self):
         attached to that block.
 
         """
-        return {block_name: dict(heads) for block_name, heads in self._heads.items()}
+        return {
+            block_name: list(heads.values())
+            for block_name, heads in self._heads.items()
+        }
 
     @property
     def head_outputs(self):
diff --git a/classy_vision/tasks/fine_tuning_task.py b/classy_vision/tasks/fine_tuning_task.py
index 5da2b382ce..3cac44d50d 100644
--- a/classy_vision/tasks/fine_tuning_task.py
+++ b/classy_vision/tasks/fine_tuning_task.py
@@ -61,7 +61,7 @@ def _set_model_train_mode(self):
             # convert all the sub-modules to the eval mode, except the heads
             self.base_model.eval()
             for heads in self.base_model.get_heads().values():
-                for h in heads.values():
+                for h in heads:
                     h.train(phase["train"])
         else:
             self.base_model.train(phase["train"])
@@ -91,7 +91,7 @@ def prepare(
             for param in self.base_model.parameters():
                 param.requires_grad = False
             for heads in self.base_model.get_heads().values():
-                for h in heads.values():
+                for h in heads:
                     for param in h.parameters():
                         param.requires_grad = True
             # re-create ddp model
diff --git a/test/manual/models_classy_vision_model_test.py b/test/manual/models_classy_vision_model_test.py
index effbf3dc9d..ea89912607 100644
--- a/test/manual/models_classy_vision_model_test.py
+++ b/test/manual/models_classy_vision_model_test.py
@@ -94,10 +94,10 @@ def test_get_set_head_states(self):
         model = build_model(config["model"])
         trunk_state = model.get_classy_state()
 
-        heads = defaultdict(dict)
+        heads = defaultdict(list)
         for head_config in head_configs:
             head = build_head(head_config)
-            heads[head_config["fork_block"]][head.unique_id] = head
+            heads[head_config["fork_block"]].append(head)
         model.set_heads(heads)
         model_state = model.get_classy_state()
 
diff --git a/test/models_classy_block_test.py b/test/models_classy_block_test.py
index 7040def911..a187466f25 100644
--- a/test/models_classy_block_test.py
+++ b/test/models_classy_block_test.py
@@ -55,7 +55,7 @@ def test_head_execution(self):
             self.DummyTestModel.wrapper_cls = wrapper_class
             model = self.DummyTestModel()
             head = self.DummyTestHead()
-            model.set_heads({"dummy_block2": {head.unique_id: head}})
+            model.set_heads({"dummy_block2": [head]})
             input = torch.randn(1, 2)
             output = model(input)
             head_output = model.execute_heads()
@@ -66,7 +66,7 @@ def test_head_execution(self):
         self.DummyTestModel.wrapper_cls = ClassyModelHeadExecutorWrapper
         model = self.DummyTestModel()
         head = self.DummyTestHead()
-        model.set_heads({"dummy_block2": {head.unique_id: head}})
+        model.set_heads({"dummy_block2": [head]})
         input = torch.randn(1, 2)
         output = model(input)
         head_output = model.execute_heads()
@@ -79,10 +79,7 @@ def test_duplicated_head_ids(self):
         model = self.DummyTestModel()
         head1 = self.DummyTestHead()
         head2 = self.DummyTestHead()
-        heads = {
-            "dummy_block": {head1.unique_id: head1},
-            "dummy_block2": {head2.unique_id: head2},
-        }
+        heads = {"dummy_block": [head1], "dummy_block2": [head2]}
         with self.assertRaises(ValueError):
             model.set_heads(heads)
 
@@ -92,13 +89,13 @@ def test_duplicated_head_ids(self):
     def test_duplicated_block_names(self):
         model = self.DummyTestModelDuplicatedBlockNames()
         head = self.DummyTestHead()
-        heads = {"dummy_block2": {head.unique_id: head}}
+        heads = {"dummy_block2": [head]}
         with self.assertRaises(Exception):
             # there are two modules with the name "dummy_block2"
             # which is not supported
             model.set_heads(heads)
         # can still attach to a module with a unique id
-        heads = {"features": {head.unique_id: head}}
+        heads = {"features": [head]}
         model.set_heads(heads)
 
     def test_set_heads(self):
@@ -107,7 +104,7 @@ def test_set_heads(self):
         self.assertEqual(
             len(model.get_heads()), 0, "heads should be empty before set_heads"
         )
-        model.set_heads({"dummy_block2": {head.unique_id: head}})
+        model.set_heads({"dummy_block2": [head]})
         input = torch.randn(1, 2)
         model(input)
         head_outputs = model.execute_heads()
@@ -119,4 +116,4 @@ def test_set_heads(self):
 
         # try a non-existing module
         with self.assertRaises(Exception):
-            model.set_heads({"unknown_block": {head.unique_id: head}})
+            model.set_heads({"unknown_block": [head]})
diff --git a/test/models_classy_model_test.py b/test/models_classy_model_test.py
index 40f0d79541..3f305dbcdb 100644
--- a/test/models_classy_model_test.py
+++ b/test/models_classy_model_test.py
@@ -250,6 +250,6 @@ def test_heads(self):
         head = FullyConnectedHead(
             unique_id="default", in_plane=2048, num_classes=num_classes
         )
-        classy_model.set_heads({"layer4": {head.unique_id: head}})
+        classy_model.set_heads({"layer4": [head]})
         input = torch.ones((1, 3, 224, 224))
         self.assertEqual(classy_model(input).shape, (1, num_classes))
diff --git a/tutorials/fine_tuning.ipynb b/tutorials/fine_tuning.ipynb
index 6a2f85e0b7..0ab3a450a9 100644
--- a/tutorials/fine_tuning.ipynb
+++ b/tutorials/fine_tuning.ipynb
@@ -149,7 +149,7 @@
    },
    "outputs": [],
    "source": [
-    "model.set_heads({\"block3-2\": {head.unique_id: head}})"
+    "model.set_heads({\"block3-2\": [head]})"
    ]
   },
   {
@@ -443,7 +443,7 @@
    },
    "outputs": [],
    "source": [
-    "model.set_heads({\"block3-2\": {head.unique_id: head}})"
+    "model.set_heads({\"block3-2\": [head]})"
    ]
   },
   {

From f855b6f8390f55d9e1887487997a58456fa01570 Mon Sep 17 00:00:00 2001
From: Mannat Singh <mannatsingh@fb.com>
Date: Fri, 17 Apr 2020 23:37:00 -0400
Subject: [PATCH 3/5] Add a tutorial which gives an overview of classy models

---
 tutorials/classy_model_overview.ipynb | 345 ++++++++++++++++++++++++++
 website/tutorials.json                |   3 +
 2 files changed, 348 insertions(+)
 create mode 100644 tutorials/classy_model_overview.ipynb

diff --git a/tutorials/classy_model_overview.ipynb b/tutorials/classy_model_overview.ipynb
new file mode 100644
index 0000000000..c9c6188bde
--- /dev/null
+++ b/tutorials/classy_model_overview.ipynb
@@ -0,0 +1,345 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#  Classy Model Overview\n",
+    "\n",
+    "Before reading this, please go over the [Getting Started tutorial](https://classyvision.ai/tutorials/getting_started).\n",
+    "\n",
+    "Working with Classy Vision requires models to be instances of `ClassyModel`. A `ClassyModel` is an instance of `torch.nn.Module`, but packed with a lot of extra features! \n",
+    "\n",
+    "If your model isn't implemented as a `ClassyModel`, don't fret - you can easily convert it to one in one line.\n",
+    "\n",
+    "In this tutorial, we will cover:\n",
+    "1. Using Classy Models\n",
+    "1. Getting and setting the state of a model\n",
+    "1. Heads: Introduction and using Classy Heads\n",
+    "1. Creating your own Classy Model\n",
+    "1. Converting any PyTorch model to a Classy Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using Classy Models\n",
+    "As `Classy Model`s are also instances of `nn.Module`, they can be treated as any normal PyTorch model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from classy_vision.models import build_model\n",
+    "\n",
+    "\n",
+    "model = build_model({\"name\": \"resnet50\"})\n",
+    "input = torch.ones(10, 3, 224, 224)  # a batch of 10 images with 3 channels with dimensions of 224 x 224\n",
+    "output = model(input)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Getting and setting the state of a model\n",
+    "\n",
+    "Classy Vision provides the functions `get_classy_state()` and `set_classy_state()` to fetch and save the state of the models. These are considered drop-in replacements for the [`torch.nn.Module.state_dict`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module.state_dict) and [`torch.nn.Module.load_state_dict()`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module.load_state_dict) functions and work similary. For more information, refer to the docs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "state = model.get_classy_state()\n",
+    "\n",
+    "model.set_classy_state(state)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Heads: Introduction and using Classy Heads\n",
+    "\n",
+    "A lot of work in Computer Vision utilizes the concept of re-using a trunk model, like a ResNet 50, and using it for various tasks. This is accomplished by attaching different \"heads\" to the end of the trunk. \n",
+    "\n",
+    "Some use cases involve re-training a model trained with a certain head by removing the old head and attaching a new one. This is a special case of fine tuning. If you are interested in fine tuning your models, there's a [tutorial for that as well](https://classyvision.ai/tutorials/fine_tuning)! But first, let's understand the basics.\n",
+    "\n",
+    "Normally, attaching heads or changing them requires users to write code and update their model implementations. Classy Vision does all of this work for you - all of this happens under the hood, with no work required by users!\n",
+    "\n",
+    "All you need to do is decide which `ClassyHead` you want to attach to your model and where. We will use a simple fully connected head in our example, and attach it to the output of the `block3-2` module of our model. Note that a head can be attached to any module, as long as the name of the module is unique.\n",
+    "\n",
+    "Classy Vision supports attaching multiple heads at once as well, but that is an advanced concept which this tutorial does not cover."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from classy_vision.heads import FullyConnectedHead\n",
+    "\n",
+    "\n",
+    "# a resnet 50 model's trunk outputs a tensor of 2048 dimension, which will be the\n",
+    "# in_plane of our head\n",
+    "#\n",
+    "# let's say we want a 100 dimensional output\n",
+    "#\n",
+    "# Tip: you can use build_head() as well to create a head instead of initializing the\n",
+    "# class directly\n",
+    "head = FullyConnectedHead(unique_id=\"default\", num_classes=100, in_plane=2048)\n",
+    "\n",
+    "# let's attach this head to our model\n",
+    "model.set_heads({\"block3-2\": [head]})\n",
+    "\n",
+    "output = model(input)\n",
+    "assert output.shape == (10, 100)\n",
+    "\n",
+    "# let's change the head one more time\n",
+    "head = FullyConnectedHead(unique_id=\"default\", num_classes=10, in_plane=2048)\n",
+    "\n",
+    "model.set_heads({\"block3-2\": [head]})\n",
+    "\n",
+    "output = model(input)\n",
+    "assert output.shape == (10, 10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Creating your own Classy Model\n",
+    "\n",
+    "Please refer to the [dedicated tutorial for this section](https://classyvision.ai/tutorials/classy_model)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Converting any PyTorch model to a Classy Model\n",
+    "\n",
+    "Any model can be converted to a Classy Model with a simple function call - `ClassyModel.from_model()`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torchvision.models import resnet18\n",
+    "from classy_vision.models import ClassyModel\n",
+    "\n",
+    "\n",
+    "model = resnet18()\n",
+    "classy_model = ClassyModel.from_model(model)\n",
+    "output = classy_model(input)\n",
+    "assert output.shape == (10, 1000)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In fact, as soon as a model becomes a Classy Model, it gains all its abilities as well, including the ability to attach heads! Let us inspect the original model to see the modules it comprises."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ResNet(\n",
+       "  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)\n",
+       "  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "  (relu): ReLU(inplace=True)\n",
+       "  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)\n",
+       "  (layer1): Sequential(\n",
+       "    (0): BasicBlock(\n",
+       "      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (relu): ReLU(inplace=True)\n",
+       "      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "    )\n",
+       "    (1): BasicBlock(\n",
+       "      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (relu): ReLU(inplace=True)\n",
+       "      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "    )\n",
+       "  )\n",
+       "  (layer2): Sequential(\n",
+       "    (0): BasicBlock(\n",
+       "      (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
+       "      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (relu): ReLU(inplace=True)\n",
+       "      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (downsample): Sequential(\n",
+       "        (0): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
+       "        (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      )\n",
+       "    )\n",
+       "    (1): BasicBlock(\n",
+       "      (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (relu): ReLU(inplace=True)\n",
+       "      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "    )\n",
+       "  )\n",
+       "  (layer3): Sequential(\n",
+       "    (0): BasicBlock(\n",
+       "      (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
+       "      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (relu): ReLU(inplace=True)\n",
+       "      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (downsample): Sequential(\n",
+       "        (0): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
+       "        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      )\n",
+       "    )\n",
+       "    (1): BasicBlock(\n",
+       "      (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (relu): ReLU(inplace=True)\n",
+       "      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "    )\n",
+       "  )\n",
+       "  (layer4): Sequential(\n",
+       "    (0): BasicBlock(\n",
+       "      (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
+       "      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (relu): ReLU(inplace=True)\n",
+       "      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (downsample): Sequential(\n",
+       "        (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
+       "        (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      )\n",
+       "    )\n",
+       "    (1): BasicBlock(\n",
+       "      (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (relu): ReLU(inplace=True)\n",
+       "      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "    )\n",
+       "  )\n",
+       "  (avgpool): AdaptiveAvgPool2d(output_size=(1, 1))\n",
+       "  (fc): Linear(in_features=512, out_features=1000, bias=True)\n",
+       ")"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "It seems that the final trunk layer of this model is called `layer4`. Let's try to attach heads here."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# the output of layer4 is 512 dimensional\n",
+    "head = FullyConnectedHead(unique_id=\"default\", num_classes=10, in_plane=512)\n",
+    "\n",
+    "classy_model.set_heads({\"layer4\": [head]})\n",
+    "\n",
+    "output = classy_model(input)\n",
+    "assert output.shape == (10, 10)  # it works!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You might be wondering how to figure out the `in_plane` for any module. A simple trick is to attach a head with an arbitrary `in_plane`, run the model, and read the `Exception` that is raised if there is a size mismatch!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "size mismatch, m1: [10 x 512], m2: [1234 x 10] at ../aten/src/TH/generic/THTensorMath.cpp:136\n"
+     ]
+    }
+   ],
+   "source": [
+    "try:\n",
+    "    head = FullyConnectedHead(unique_id=\"default\", num_classes=10, in_plane=1234)\n",
+    "\n",
+    "    classy_model.set_heads({\"layer4\": [head]})\n",
+    "\n",
+    "    output = classy_model(input)\n",
+    "\n",
+    "except Exception as e:\n",
+    "    print(e)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The error tells us that the size should be 512.\n",
+    "\n",
+    "That's all for this tutorial. For more information, refer to our [API docs](https://classyvision.ai/api/)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/website/tutorials.json b/website/tutorials.json
index 8d98c0e203..9dbcf9e720 100644
--- a/website/tutorials.json
+++ b/website/tutorials.json
@@ -6,6 +6,9 @@
      },{
       "id": "ray_aws",
       "title": "Distributed training on AWS"
+     },{
+      "id": "classy_model_overview",
+      "title": "Classy model overview"
      },{
       "id": "classy_dataset",
       "title": "Creating a custom dataset"

From 14c9f815dc7d588010bc84e606b55469c60410bf Mon Sep 17 00:00:00 2001
From: Mannat Singh <mannatsingh@fb.com>
Date: Mon, 20 Apr 2020 22:49:02 -0400
Subject: [PATCH 4/5] Incorporate review suggestions

---
 tutorials/classy_model_overview.ipynb | 58 ++++++++++++++++++++-------
 1 file changed, 44 insertions(+), 14 deletions(-)

diff --git a/tutorials/classy_model_overview.ipynb b/tutorials/classy_model_overview.ipynb
index c9c6188bde..cebfa3d26b 100644
--- a/tutorials/classy_model_overview.ipynb
+++ b/tutorials/classy_model_overview.ipynb
@@ -25,12 +25,12 @@
    "metadata": {},
    "source": [
     "## Using Classy Models\n",
-    "As `Classy Model`s are also instances of `nn.Module`, they can be treated as any normal PyTorch model."
+    "As `ClassyModel`s are also instances of `nn.Module`, they can be treated as any normal PyTorch model."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -49,12 +49,12 @@
    "source": [
     "## Getting and setting the state of a model\n",
     "\n",
-    "Classy Vision provides the functions `get_classy_state()` and `set_classy_state()` to fetch and save the state of the models. These are considered drop-in replacements for the [`torch.nn.Module.state_dict`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module.state_dict) and [`torch.nn.Module.load_state_dict()`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module.load_state_dict) functions and work similary. For more information, refer to the docs."
+    "Classy Vision provides the functions `get_classy_state()` and `set_classy_state()` to fetch and save the state of the models. These are considered drop-in replacements for the [`torch.nn.Module.state_dict`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module.state_dict) and [`torch.nn.Module.load_state_dict()`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module.load_state_dict) functions and work similarly. For more information, refer to the [docs](https://classyvision.ai/api/models.html#classy_vision.models.ClassyModel)."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -75,14 +75,12 @@
     "\n",
     "Normally, attaching heads or changing them requires users to write code and update their model implementations. Classy Vision does all of this work for you - all of this happens under the hood, with no work required by users!\n",
     "\n",
-    "All you need to do is decide which `ClassyHead` you want to attach to your model and where. We will use a simple fully connected head in our example, and attach it to the output of the `block3-2` module of our model. Note that a head can be attached to any module, as long as the name of the module is unique.\n",
-    "\n",
-    "Classy Vision supports attaching multiple heads at once as well, but that is an advanced concept which this tutorial does not cover."
+    "All you need to do is decide which `ClassyHead` you want to attach to your model and where. We will use a simple fully connected head in our example, and attach it to the output of the `block3-2` module of our model. Note that a head can be attached to any module, as long as the name of the module is unique."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -113,6 +111,29 @@
     "assert output.shape == (10, 10)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Classy Vision supports attaching multiple heads to one or more blocks as well, but that is an advanced concept which this tutorial does not cover. For inquisitive users, here is an example -"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "head_1_1 = FullyConnectedHead(unique_id=\"1_1\", num_classes=10, in_plane=1024)\n",
+    "head_1_2 = FullyConnectedHead(unique_id=\"1_2\", num_classes=20, in_plane=1024)\n",
+    "head_2 = FullyConnectedHead(unique_id=\"2\", num_classes=100, in_plane=2048)\n",
+    "\n",
+    "# we can attach these heads to our model at different blocks\n",
+    "model.set_heads({\"block2-2\": [head_1_1, head_1_2], \"block3-2\": [head_2]})\n",
+    "\n",
+    "output = model(input)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -133,7 +154,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -156,7 +177,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -248,7 +269,7 @@
        ")"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -266,7 +287,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -315,9 +336,18 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The error tells us that the size should be 512.\n",
+    "The error tells us that the size should be 512."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Conclusion\n",
+    "\n",
+    "In this tutorial, we covered how to use Classy Models, how to get and set their state, and how to create our own models (through another tutorial). We also got familiarized with the concept of heads and how they work with Classy Vision. Lastly, we learned how we can easily convert any PyTorch models to Classy Models and unlock all the features they provide.\n",
     "\n",
-    "That's all for this tutorial. For more information, refer to our [API docs](https://classyvision.ai/api/)."
+    "For more information, refer to our [API docs](https://classyvision.ai/api/)."
    ]
   }
  ],

From c1b884da9b3f1f7ace3636974bdcd3478d2a5709 Mon Sep 17 00:00:00 2001
From: Mannat Singh <mannatsingh@fb.com>
Date: Wed, 22 Apr 2020 11:38:03 -0400
Subject: [PATCH 5/5] Merge the two classy model tutorials

---
 scripts/publish_website.sh            |   0
 tutorials/classy_model.ipynb          | 367 +++++++++++++++++++++++--
 tutorials/classy_model_overview.ipynb | 375 --------------------------
 tutorials/getting_started.ipynb       |   2 +-
 website/tutorials.json                |   5 +-
 5 files changed, 349 insertions(+), 400 deletions(-)
 mode change 100644 => 100755 scripts/publish_website.sh
 delete mode 100644 tutorials/classy_model_overview.ipynb

diff --git a/scripts/publish_website.sh b/scripts/publish_website.sh
old mode 100644
new mode 100755
diff --git a/tutorials/classy_model.ipynb b/tutorials/classy_model.ipynb
index bdef9bef82..b13b488464 100644
--- a/tutorials/classy_model.ipynb
+++ b/tutorials/classy_model.ipynb
@@ -4,23 +4,152 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Creating a custom model"
+    "# Classy Models\n",
+    "\n",
+    "Before reading this, please go over the [Getting Started tutorial](https://classyvision.ai/tutorials/getting_started).\n",
+    "\n",
+    "Working with Classy Vision requires models to be instances of `ClassyModel`. A `ClassyModel` is an instance of `torch.nn.Module`, but packed with a lot of extra features! \n",
+    "\n",
+    "If your model isn't implemented as a `ClassyModel`, don't fret - you can easily convert it to one in one line.\n",
+    "\n",
+    "In this tutorial, we will cover:\n",
+    "1. Using Classy Models\n",
+    "1. Getting and setting the state of a model\n",
+    "1. Heads: Introduction & Using Classy Heads\n",
+    "1. Creating a custom Classy Model\n",
+    "1. Converting any PyTorch model to a Classy Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using Classy Models\n",
+    "As `ClassyModel`s are also instances of `nn.Module`, they can be treated as any normal PyTorch model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from classy_vision.models import build_model\n",
+    "\n",
+    "\n",
+    "model = build_model({\"name\": \"resnet50\"})\n",
+    "input = torch.ones(10, 3, 224, 224)  # a batch of 10 images with 3 channels with dimensions of 224 x 224\n",
+    "output = model(input)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Getting and setting the state of a model\n",
+    "\n",
+    "Classy Vision provides the functions `get_classy_state()` and `set_classy_state()` to fetch and save the state of the models. These are considered drop-in replacements for the [`torch.nn.Module.state_dict`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module.state_dict) and [`torch.nn.Module.load_state_dict()`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module.load_state_dict) functions and work similarly. For more information, refer to the [docs](https://classyvision.ai/api/models.html#classy_vision.models.ClassyModel)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "state = model.get_classy_state()\n",
+    "\n",
+    "model.set_classy_state(state)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Heads: Introduction & Using Classy Heads\n",
+    "\n",
+    "A lot of work in Computer Vision utilizes the concept of re-using a trunk model, like a ResNet 50, and using it for various tasks. This is accomplished by attaching different \"heads\" to the end of the trunk. \n",
+    "\n",
+    "Some use cases involve re-training a model trained with a certain head by removing the old head and attaching a new one. This is a special case of fine tuning. If you are interested in fine tuning your models, there's a [tutorial for that as well](https://classyvision.ai/tutorials/fine_tuning)! But first, let's understand the basics.\n",
+    "\n",
+    "Normally, attaching heads or changing them requires users to write code and update their model implementations. Classy Vision does all of this work for you - all of this happens under the hood, with no work required by users!\n",
+    "\n",
+    "All you need to do is decide which `ClassyHead` you want to attach to your model and where. We will use a simple fully connected head in our example, and attach it to the output of the `block3-2` module of our model. Note that a head can be attached to any module, as long as the name of the module is unique."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from classy_vision.heads import FullyConnectedHead\n",
+    "\n",
+    "\n",
+    "# a resnet 50 model's trunk outputs a 2048-dimensional tensor, which will be the\n",
+    "# in_plane of our head\n",
+    "#\n",
+    "# let's say we want a 100 dimensional output\n",
+    "#\n",
+    "# Tip: you can use build_head() as well to create a head instead of initializing the\n",
+    "# class directly\n",
+    "head = FullyConnectedHead(unique_id=\"default\", num_classes=100, in_plane=2048)\n",
+    "\n",
+    "# let's attach this head to our model\n",
+    "model.set_heads({\"block3-2\": [head]})\n",
+    "\n",
+    "output = model(input)\n",
+    "assert output.shape == (10, 100)\n",
+    "\n",
+    "# let's change the head one more time\n",
+    "head = FullyConnectedHead(unique_id=\"default\", num_classes=10, in_plane=2048)\n",
+    "\n",
+    "model.set_heads({\"block3-2\": [head]})\n",
+    "\n",
+    "output = model(input)\n",
+    "assert output.shape == (10, 10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Classy Vision supports attaching multiple heads to one or more blocks as well, but that is an advanced concept which this tutorial does not cover. For inquisitive users, here is an example -"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "head_1_1 = FullyConnectedHead(unique_id=\"1_1\", num_classes=10, in_plane=1024)\n",
+    "head_1_2 = FullyConnectedHead(unique_id=\"1_2\", num_classes=20, in_plane=1024)\n",
+    "head_2 = FullyConnectedHead(unique_id=\"2\", num_classes=100, in_plane=2048)\n",
+    "\n",
+    "# we can attach these heads to our model at different blocks\n",
+    "model.set_heads({\"block2-2\": [head_1_1, head_1_2], \"block3-2\": [head_2]})\n",
+    "\n",
+    "output = model(input)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "This tutorial will demonstrate: (1) how to create a custom model within Classy Vision; (2) how to integrate your model with Classy Vision's configuration system; (3) how to use the model for training and inference;\n",
+    "## Creating a custom Classy Model\n",
     "\n",
-    "## 1. Defining a model\n",
+    "This section will demonstrate: (1) how to create a custom model within Classy Vision; (2) how to integrate your model with Classy Vision's configuration system; and (3) how to use the model for training and inference.\n",
+    "\n",
+    "### 1. Defining a model\n",
     "\n",
     "Creating a new model in Classy Vision is the simple as creating one within PyTorch. The model needs to derive from `ClassyModel` and implement a `forward` method to perform inference. `ClassyModel` inherits from [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#module), so it works exactly as you would expect."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -57,7 +186,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -71,7 +200,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 2. Integrate it with the configuration system\n",
+    "### 2. Integrating it with the configuration system\n",
     "\n",
     "Classy Vision is also able to read a configuration file and instantiate the model. This is useful to keep your experiments organized and reproducible. For that, you have to:\n",
     "\n",
@@ -81,7 +210,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -126,9 +255,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensor([[-0.1739, -0.7974, -0.0818]])\n"
+     ]
+    }
+   ],
    "source": [
     "from classy_vision.models import build_model\n",
     "import torch\n",
@@ -157,21 +294,211 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 3. Conclusion\n",
+    "## Converting any PyTorch model to a Classy Model\n",
+    "\n",
+    "Any model can be converted to a Classy Model with a simple function call - `ClassyModel.from_model()`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torchvision.models import resnet18\n",
+    "from classy_vision.models import ClassyModel\n",
+    "\n",
+    "\n",
+    "model = resnet18()\n",
+    "classy_model = ClassyModel.from_model(model)\n",
+    "output = classy_model(input)\n",
+    "assert output.shape == (10, 1000)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In fact, as soon as a model becomes a Classy Model, it gains all its abilities as well, including the ability to attach heads! Let us inspect the original model to see the modules it comprises."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ResNet(\n",
+       "  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)\n",
+       "  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "  (relu): ReLU(inplace=True)\n",
+       "  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)\n",
+       "  (layer1): Sequential(\n",
+       "    (0): BasicBlock(\n",
+       "      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (relu): ReLU(inplace=True)\n",
+       "      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "    )\n",
+       "    (1): BasicBlock(\n",
+       "      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (relu): ReLU(inplace=True)\n",
+       "      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "    )\n",
+       "  )\n",
+       "  (layer2): Sequential(\n",
+       "    (0): BasicBlock(\n",
+       "      (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
+       "      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (relu): ReLU(inplace=True)\n",
+       "      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (downsample): Sequential(\n",
+       "        (0): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
+       "        (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      )\n",
+       "    )\n",
+       "    (1): BasicBlock(\n",
+       "      (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (relu): ReLU(inplace=True)\n",
+       "      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "    )\n",
+       "  )\n",
+       "  (layer3): Sequential(\n",
+       "    (0): BasicBlock(\n",
+       "      (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
+       "      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (relu): ReLU(inplace=True)\n",
+       "      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (downsample): Sequential(\n",
+       "        (0): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
+       "        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      )\n",
+       "    )\n",
+       "    (1): BasicBlock(\n",
+       "      (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (relu): ReLU(inplace=True)\n",
+       "      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "    )\n",
+       "  )\n",
+       "  (layer4): Sequential(\n",
+       "    (0): BasicBlock(\n",
+       "      (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
+       "      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (relu): ReLU(inplace=True)\n",
+       "      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (downsample): Sequential(\n",
+       "        (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
+       "        (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      )\n",
+       "    )\n",
+       "    (1): BasicBlock(\n",
+       "      (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "      (relu): ReLU(inplace=True)\n",
+       "      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+       "      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "    )\n",
+       "  )\n",
+       "  (avgpool): AdaptiveAvgPool2d(output_size=(1, 1))\n",
+       "  (fc): Linear(in_features=512, out_features=1000, bias=True)\n",
+       ")"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "It seems that the final trunk layer of this model is called `layer4`. Let's try to attach heads here."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# the output of layer4 is 512 dimensional\n",
+    "head = FullyConnectedHead(unique_id=\"default\", num_classes=10, in_plane=512)\n",
+    "\n",
+    "classy_model.set_heads({\"layer4\": [head]})\n",
+    "\n",
+    "output = classy_model(input)\n",
+    "assert output.shape == (10, 10)  # it works!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You might be wondering how to figure out the `in_plane` for any module. A simple trick is to try attaching any head and noticing the `Exception` if there is a size mismatch!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "size mismatch, m1: [10 x 512], m2: [1234 x 10] at ../aten/src/TH/generic/THTensorMath.cpp:136\n"
+     ]
+    }
+   ],
+   "source": [
+    "try:\n",
+    "    head = FullyConnectedHead(unique_id=\"default\", num_classes=10, in_plane=1234)\n",
+    "\n",
+    "    classy_model.set_heads({\"layer4\": [head]})\n",
+    "\n",
+    "    output = classy_model(input)\n",
+    "\n",
+    "except Exception as e:\n",
+    "    print(e)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The error tells us that the size should be 512."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Conclusion\n",
     "\n",
-    "In this tutorial, we learned how to make your model compatible with Classy Vision and how to integrate it with the configuration system. Refer to our documentation to learn more about [ClassyModel](https://classyvision.ai/api/models.html)."
+    "In this tutorial, we covered how to use Classy Models, how to get and set their state, and how to create our own models and integrate them with the configuration system. We also familiarized ourselves with the concept of heads and how they work with Classy Vision. Lastly, we learned how we can easily convert any PyTorch model to a Classy Model and unlock all the features it provides.\n",
+    "\n",
+    "For more information, refer to our [API docs](https://classyvision.ai/api/)."
    ]
   }
  ],
  "metadata": {
-  "bento_stylesheets": {
-   "bento/extensions/flow/main.css": true,
-   "bento/extensions/kernel_selector/main.css": true,
-   "bento/extensions/kernel_ui/main.css": true,
-   "bento/extensions/new_kernel/main.css": true,
-   "bento/extensions/system_usage/main.css": true,
-   "bento/extensions/theme/main.css": true
-  },
   "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
@@ -187,7 +514,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.7"
+   "version": "3.6.9"
   }
  },
  "nbformat": 4,
diff --git a/tutorials/classy_model_overview.ipynb b/tutorials/classy_model_overview.ipynb
deleted file mode 100644
index cebfa3d26b..0000000000
--- a/tutorials/classy_model_overview.ipynb
+++ /dev/null
@@ -1,375 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#  Classy Model Overview\n",
-    "\n",
-    "Before reading this, please go over the [Getting Started tutorial](https://classyvision.ai/tutorials/getting_started).\n",
-    "\n",
-    "Working with Classy Vision requires models to be instances of `ClassyModel`. A `ClassyModel` is an instance of `torch.nn.Module`, but packed with a lot of extra features! \n",
-    "\n",
-    "If your model isn't implemented as a `ClassyModel`, don't fret - you can easily convert it to one in one line.\n",
-    "\n",
-    "In this tutorial, we will cover:\n",
-    "1. Using Classy Models\n",
-    "1. Getting and setting the state of a model\n",
-    "1. Heads: Introduction and using Classy Heads\n",
-    "1. Creating your own Classy Model\n",
-    "1. Converting any PyTorch model to a Classy Model"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Using Classy Models\n",
-    "As `ClassyModel`s are also instances of `nn.Module`, they can be treated as any normal PyTorch model."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import torch\n",
-    "from classy_vision.models import build_model\n",
-    "\n",
-    "\n",
-    "model = build_model({\"name\": \"resnet50\"})\n",
-    "input = torch.ones(10, 3, 224, 224)  # a batch of 10 images with 3 channels with dimensions of 224 x 224\n",
-    "output = model(input)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Getting and setting the state of a model\n",
-    "\n",
-    "Classy Vision provides the functions `get_classy_state()` and `set_classy_state()` to fetch and save the state of the models. These are considered drop-in replacements for the [`torch.nn.Module.state_dict`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module.state_dict) and [`torch.nn.Module.load_state_dict()`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module.load_state_dict) functions and work similarly. For more information, refer to the [docs](https://classyvision.ai/api/models.html#classy_vision.models.ClassyModel)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "state = model.get_classy_state()\n",
-    "\n",
-    "model.set_classy_state(state)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Heads: Introduction and using Classy Heads\n",
-    "\n",
-    "A lot of work in Computer Vision utilizes the concept of re-using a trunk model, like a ResNet 50, and using it for various tasks. This is accomplished by attaching different \"heads\" to the end of the trunk. \n",
-    "\n",
-    "Some use cases involve re-training a model trained with a certain head by removing the old head and attaching a new one. This is a special case of fine tuning. If you are interested in fine tuning your models, there's a [tutorial for that as well](https://classyvision.ai/tutorials/fine_tuning)! But first, let's understand the basics.\n",
-    "\n",
-    "Normally, attaching heads or changing them requires users to write code and update their model implementations. Classy Vision does all of this work for you - all of this happens under the hood, with no work required by users!\n",
-    "\n",
-    "All you need to do is decide which `ClassyHead` you want to attach to your model and where. We will use a simple fully connected head in our example, and attach it to the output of the `block3-2` module of our model. Note that a head can be attached to any module, as long as the name of the module is unique."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from classy_vision.heads import FullyConnectedHead\n",
-    "\n",
-    "\n",
-    "# a resnet 50 model's trunk outputs a tensor of 2048 dimension, which will be the\n",
-    "# in_plane of out head\n",
-    "#\n",
-    "# let's say we want a 100 dimensional output\n",
-    "#\n",
-    "# Tip: you can use build_head() as well to create a head instead of initializing the\n",
-    "# class directly\n",
-    "head = FullyConnectedHead(unique_id=\"default\", num_classes=100, in_plane=2048)\n",
-    "\n",
-    "# let's attach this head to our model\n",
-    "model.set_heads({\"block3-2\": [head]})\n",
-    "\n",
-    "output = model(input)\n",
-    "assert output.shape == (10, 100)\n",
-    "\n",
-    "# let's change the head one more time\n",
-    "head = FullyConnectedHead(unique_id=\"default\", num_classes=10, in_plane=2048)\n",
-    "\n",
-    "model.set_heads({\"block3-2\": [head]})\n",
-    "\n",
-    "output = model(input)\n",
-    "assert output.shape == (10, 10)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Classy Vision supports attaching multiple heads to one or more blocks as well, but that is an advanced concept which this tutorial does not cover. For inquisitive users, here is an example -"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "head_1_1 = FullyConnectedHead(unique_id=\"1_1\", num_classes=10, in_plane=1024)\n",
-    "head_1_2 = FullyConnectedHead(unique_id=\"1_2\", num_classes=20, in_plane=1024)\n",
-    "head_2 = FullyConnectedHead(unique_id=\"2\", num_classes=100, in_plane=2048)\n",
-    "\n",
-    "# we can attach these heads to our model at different blocks\n",
-    "model.set_heads({\"block2-2\": [head_1_1, head_1_2], \"block3-2\": [head_2]})\n",
-    "\n",
-    "output = model(input)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Creating your own Classy Model\n",
-    "\n",
-    "Please refer to the [dedicated tutorial for this section](https://classyvision.ai/tutorials/classy_model)."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Converting any PyTorch model to a Classy Model\n",
-    "\n",
-    "Any model can be converted to a Classy Model with a simple function call - `ClassyModel.from_model()`"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from torchvision.models import resnet18\n",
-    "from classy_vision.models import ClassyModel\n",
-    "\n",
-    "\n",
-    "model = resnet18()\n",
-    "classy_model = ClassyModel.from_model(model)\n",
-    "output = classy_model(input)\n",
-    "assert output.shape == (10, 1000)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "In fact, as soon as a model becomes a Classy Model, it gains all its abilities as well, including the ability to attach heads! Let us inspect the original model to see the modules it comprises."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "ResNet(\n",
-       "  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)\n",
-       "  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "  (relu): ReLU(inplace=True)\n",
-       "  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)\n",
-       "  (layer1): Sequential(\n",
-       "    (0): BasicBlock(\n",
-       "      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "      (relu): ReLU(inplace=True)\n",
-       "      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "    )\n",
-       "    (1): BasicBlock(\n",
-       "      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "      (relu): ReLU(inplace=True)\n",
-       "      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "    )\n",
-       "  )\n",
-       "  (layer2): Sequential(\n",
-       "    (0): BasicBlock(\n",
-       "      (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
-       "      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "      (relu): ReLU(inplace=True)\n",
-       "      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "      (downsample): Sequential(\n",
-       "        (0): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
-       "        (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "      )\n",
-       "    )\n",
-       "    (1): BasicBlock(\n",
-       "      (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "      (relu): ReLU(inplace=True)\n",
-       "      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "    )\n",
-       "  )\n",
-       "  (layer3): Sequential(\n",
-       "    (0): BasicBlock(\n",
-       "      (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
-       "      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "      (relu): ReLU(inplace=True)\n",
-       "      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "      (downsample): Sequential(\n",
-       "        (0): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
-       "        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "      )\n",
-       "    )\n",
-       "    (1): BasicBlock(\n",
-       "      (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "      (relu): ReLU(inplace=True)\n",
-       "      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "    )\n",
-       "  )\n",
-       "  (layer4): Sequential(\n",
-       "    (0): BasicBlock(\n",
-       "      (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
-       "      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "      (relu): ReLU(inplace=True)\n",
-       "      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "      (downsample): Sequential(\n",
-       "        (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
-       "        (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "      )\n",
-       "    )\n",
-       "    (1): BasicBlock(\n",
-       "      (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "      (relu): ReLU(inplace=True)\n",
-       "      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
-       "    )\n",
-       "  )\n",
-       "  (avgpool): AdaptiveAvgPool2d(output_size=(1, 1))\n",
-       "  (fc): Linear(in_features=512, out_features=1000, bias=True)\n",
-       ")"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "model"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "It seems that the final trunk layer of this model is called `layer4`. Let's try to attach heads here."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# the output of layer4 is 512 dimensional\n",
-    "head = FullyConnectedHead(unique_id=\"default\", num_classes=10, in_plane=512)\n",
-    "\n",
-    "classy_model.set_heads({\"layer4\": [head]})\n",
-    "\n",
-    "output = classy_model(input)\n",
-    "assert output.shape == (10, 10)  # it works!"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "You might be wondering how to figure out the `in_plane` for any module. A simple trick is to try attaching any head and noticing the `Exception` if there is a size mismatch!"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "size mismatch, m1: [10 x 512], m2: [1234 x 10] at ../aten/src/TH/generic/THTensorMath.cpp:136\n"
-     ]
-    }
-   ],
-   "source": [
-    "try:\n",
-    "    head = FullyConnectedHead(unique_id=\"default\", num_classes=10, in_plane=1234)\n",
-    "\n",
-    "    classy_model.set_heads({\"layer4\": [head]})\n",
-    "\n",
-    "    output = classy_model(input)\n",
-    "\n",
-    "except Exception as e:\n",
-    "    print(e)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The error tells us that the size should be 512."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Conclusion\n",
-    "\n",
-    "In this tutorial, we covered how to use Classy Models, how to get and set their state, and how to create our own models (through another tutorial). We also got familiarized with the concept of heads and how they work with Classy Vision. Lastly, we learned how we can easily convert any PyTorch models to Classy Models and unlock all the features they provide.\n",
-    "\n",
-    "For more information, refer to our [API docs](https://classyvision.ai/api/)."
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.9"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/tutorials/getting_started.ipynb b/tutorials/getting_started.ipynb
index 8b12c7e9e0..019483d4c3 100644
--- a/tutorials/getting_started.ipynb
+++ b/tutorials/getting_started.ipynb
@@ -552,7 +552,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.6.9"
   }
  },
  "nbformat": 4,
diff --git a/website/tutorials.json b/website/tutorials.json
index 9dbcf9e720..1b8ac7a513 100644
--- a/website/tutorials.json
+++ b/website/tutorials.json
@@ -6,15 +6,12 @@
      },{
       "id": "ray_aws",
       "title": "Distributed training on AWS"
-     },{
-      "id": "classy_model_overview",
-      "title": "Classy model overview"
      },{
       "id": "classy_dataset",
       "title": "Creating a custom dataset"
      },{
       "id": "classy_model",
-      "title": "Creating a custom model"
+      "title": "Creating and using Classy Models"
      },{
       "id": "classy_loss",
       "title": "Creating a custom loss"