diff --git a/docs/_config.yml b/docs/_config.yml
index 53fc21f3798f..1e85e57f232e 100644
--- a/docs/_config.yml
+++ b/docs/_config.yml
@@ -37,10 +37,11 @@ defaults:
values:
layout: single
author_profile: false
- read_time: true
+ read_time: false
comments: false
share: true
related: false
+ sneak_preview: false
# _tutorials
- scope:
path: ""
diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml
index e50c552240d4..1b48a31279b8 100644
--- a/docs/_data/navigation.yml
+++ b/docs/_data/navigation.yml
@@ -1,17 +1,17 @@
main:
- title: "Getting Started"
url: /getting-started/
- - title: "Blog"
- url: /blog/
+ - title: "News"
+ url: /news/
- title: "Tutorials"
url: /tutorials/
- title: "Documentation"
- url: https://ghpages-test.readthedocs.io/
+ url: https://deepspeed.readthedocs.io/
- title: "GitHub"
url: https://github.com/microsoft/DeepSpeed
lnav:
- - title: "This is a floating nav bar."
+ - title: "DeepSpeed Documentation"
- title: "Getting Started"
url: /getting-started/
children:
@@ -19,3 +19,5 @@ lnav:
url: /getting-started/#installation
- title: "Configuration"
url: /getting-started/#deepspeed-configuration
+ - title: "DeepSpeed Features"
+ url: /features/
diff --git a/docs/_layouts/news-home.html b/docs/_layouts/news-home.html
new file mode 100644
index 000000000000..3ac509f06ef7
--- /dev/null
+++ b/docs/_layouts/news-home.html
@@ -0,0 +1,27 @@
+---
+layout: archive
+---
+
+{{ content }}
+
+
+{% if paginator %}
+ {% assign posts = paginator.posts %}
+{% else %}
+ {% assign posts = site.posts %}
+{% endif %}
+
+
+<h3 class="archive__subtitle">Features Coming Soon</h3>
+{% assign soon = posts | where_exp: "post", "post.sneak_preview == true" %}
+{% for post in soon %}
+ {% include archive-single.html %}
+{% endfor %}
+
+<h3 class="archive__subtitle">{{ site.data.ui-text[site.locale].recent_posts | default: "Recent Posts" }}</h3>
+{% assign news = posts | where_exp: "post", "post.sneak_preview != true" %}
+{% for post in news %}
+ {% include archive-single.html %}
+{% endfor %}
+
+{% include paginator.html %}
diff --git a/docs/_posts/2020-03-17-reduce-scatter.md b/docs/_posts/2020-03-17-reduce-scatter.md
index b7e9b8fe176d..1753a22e3aa7 100644
--- a/docs/_posts/2020-03-17-reduce-scatter.md
+++ b/docs/_posts/2020-03-17-reduce-scatter.md
@@ -1,12 +1,11 @@
---
title: "ZeRO stage 1 with reduced communication"
-date: 2020-03-13
+sneak_preview: true
excerpt: "Partition-aware ZeRO with up to 2x reduction in communication time!"
---
-# ZeRO stage 1 with reduced communication
* Partition-aware approach instead of initial implementation that used a global collective (all-reduce)
* Total communication volume reduction 1.5x -> 1x of data parallelism
* Up to 2x reduction in communication time compared to all-reduce
-# Further updates coming soon!
+## Further updates coming soon!
diff --git a/docs/_posts/2020-03-17-zero-stage2.md b/docs/_posts/2020-03-17-zero-stage2.md
index 682dea687baf..18a1974cd956 100644
--- a/docs/_posts/2020-03-17-zero-stage2.md
+++ b/docs/_posts/2020-03-17-zero-stage2.md
@@ -1,12 +1,10 @@
---
title: "ZeRO stage 2"
-date: 2020-03-13
+sneak_preview: true
excerpt: "Reduce memory footprint to enable training 10B models without model parallelism!"
---
-
-# Zero Stage 2
* Reduce memory footprint of gradients
* Train larger models: e.g., 10B parameters on 32GPUs without model parallelism
* Train larger batch sizes
-# Further updates coming soon!
+## Further updates coming soon!
diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md
index 26875f50a726..c6be8a05638d 100644
--- a/docs/_tutorials/getting-started.md
+++ b/docs/_tutorials/getting-started.md
@@ -213,7 +213,7 @@ deepspeed --include="worker-2:0,1" \
\
--deepspeed --deepspeed_config ds_config.json
```
-
+This is a floating nav bar.
### MPI Compatibility
As described above, DeepSpeed provides its own parallel launcher to help launch
multi-node/multi-gpu training jobs. If you prefer to launch your training job
diff --git a/docs/assets/css/main.scss b/docs/assets/css/main.scss
new file mode 100644
index 000000000000..c2583467e4b7
--- /dev/null
+++ b/docs/assets/css/main.scss
@@ -0,0 +1,48 @@
+---
+# Only the main Sass file needs front matter (the dashes are enough)
+---
+
+@charset "utf-8";
+
+@import "minimal-mistakes/skins/{{ site.minimal_mistakes_skin | default: 'default' }}"; // skin
+@import "minimal-mistakes"; // main partials
+
+//
+// DeepSpeed customizations
+//
+
+
+.site-title {
+ display: -webkit-box;
+ display: -ms-flexbox;
+ display: flex;
+ -ms-flex-item-align: center;
+ align-self: center;
+ font-weight: bold;
+ font-size: $type-size-2; // DeepSpeed: increase size
+}
+
+
+.toc {
+ font-family: $sans-serif-narrow;
+ color: $gray;
+ background-color: $background-color;
+ border: 1px solid $border-color;
+ border-radius: $border-radius;
+ -webkit-box-shadow: $box-shadow;
+ box-shadow: $box-shadow;
+ position: fixed;
+
+ .nav__title {
+ color: #fff;
+ font-size: $type-size-6;
+ background: $primary-color;
+ border-top-left-radius: $border-radius;
+ border-top-right-radius: $border-radius;
+ }
+
+ // Scrollspy marks toc items as .active when they are in focus
+ .active a {
+ @include yiq-contrasted($active-color);
+ }
+}
diff --git a/docs/blog/index.html b/docs/blog/index.html
deleted file mode 100644
index e4d427d215f4..000000000000
--- a/docs/blog/index.html
+++ /dev/null
@@ -1,3 +0,0 @@
----
-layout: home
----
diff --git a/docs/features.md b/docs/features.md
index e3f1d0beb4ff..f28efdae1062 100644
--- a/docs/features.md
+++ b/docs/features.md
@@ -1,6 +1,7 @@
---
title: "Feature Overview"
layout: single
+permalink: /features/
toc: true
toc_label: "Contents"
---
diff --git a/docs/index.md b/docs/index.md
index 9412da692981..a7a7e0e428b4 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -18,6 +18,16 @@ a language model (LM) with over 17B parameters called
[Turing-NLG](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft),
establishing a new SOTA in the LM category.
+# What's New?
+{% assign news = site.posts | where_exp: "post", "post.sneak_preview != true" %}
+{% for post in news limit:5 %}
+ {% if post.link %}
+ * [{{ post.title }}]({{ post.link }})
+ {% else %}
+ * [{{ post.title }}]({{ post.url }})
+ {% endif %}
+{% endfor %}
+
# Why DeepSpeed?
Training advanced deep learning models is challenging. Beyond model design,
@@ -56,8 +66,6 @@ optimizations on advanced hyperparameter tuning and optimizers. For example:
| 256 V100 GPUs | NVIDIA | 3.9 |
| 256 V100 GPUs | DeepSpeed | **3.7** |
-
-
*BERT Tutorial*: Coming Soon
* DeepSpeed trains GPT2 (1.5 billion parameters) 3.75x faster than state-of-art, NVIDIA
@@ -157,247 +165,6 @@ overview](features) for descriptions and usage.
* [Performance Analysis and Debugging](features.md#performance-analysis-and-debugging)
-# Getting Started
-
-
-## Installation
-
-* Please see our [Azure tutorial](docs/azure.md) to get started with DeepSpeed on Azure!
-* If you're not on Azure, we recommend using our docker image via `docker pull deepspeed/deepspeed:latest` which contains a pre-installed version of DeepSpeed and all the necessary dependencies.
-* If you want to install DeepSpeed manually, we provide an install script [install.sh](install.sh) to help install on a local machine or across an entire cluster.
-
-## Writing DeepSpeed Models
-DeepSpeed model training is accomplished using the DeepSpeed engine. The engine
-can wrap any arbitrary model of type `torch.nn.module` and has a minimal set of APIs
-for training and checkpointing the model. Please see the tutorials for detailed
-examples.
-
-To initialize the DeepSpeed engine:
-```python
-model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args,
- model=model,
- model_parameters=params)
-```
-
-`deepspeed.inialize` ensures that all of the necessary setup required for
-distributed data parallel or mixed precision training are done
-appropriately under the hood. In addition to wrapping the model, DeepSpeed can
-construct and manage the training optimizer, data loader, and the learning rate
-scheduler based on the parameters passed to `deepspeed.initialze` and the
-DeepSpeed [configuration file](#deepspeed-configuration).
-
-
-### Training
-
-Once the DeepSpeed engine has been initialized, it can be used to train the
-model using three simple APIs for forward propagation (`()`), backward
-propagation (`backward`), and weight updates (`step`).
-
-```python
-for step, batch in enumerate(data_loader):
- #forward() method
- loss = model_engine(batch)
-
- #runs backpropagation
- model_engine.backward(loss)
-
- #weight update
- model_engine.step()
-```
-
-
-Under the hood, DeepSpeed automatically performs the necessary operations
-required for distributed data parallel training, in mixed precision, with a
-pre-defined learning rate schedule:
-
-* **Gradient Averaging**: in distributed data parallel training, `backward`
- ensures that gradients are averaged across data parallel processes after
- training on an `train_batch_size`.
-
-* **Loss Scaling**: in FP16/mixed precision training, the DeepSpeed
- engine automatically handles scaling the loss to avoid precision loss in the
- gradients.
-
-* **Learning Rate Schedule**: if using DeepSpeed's learning rate
- schedule, then DeepSpeed automatically handles any updates to the learning
- rate when `step` is executed.
-
-
-
-### Model Checkpointing
-Saving and loading the training state is handled via the `save_checkpoint` and
-`load_checkpoint` API in DeepSpeed which takes two arguments to uniquely
-identify a checkpoint:
- * `ckpt_dir`: the directory where checkpoints will be saved.
- * `ckpt_id`: an identifier that uniquely identifies a checkpoint in the directory.
- In the following code snippet, we use the loss value as the checkpoint identifier.
-
-```python
-#load checkpoint
-_, client_sd = model_engine.load_checkpoint(args.load_dir, args.ckpt_id)
-step = client_sd['step']
-
-#advance data loader to ckpt step
-dataloader_to_step(data_loader, step + 1)
-
-for step, batch in enumerate(data_loader):
-
- #forward() method
- loss = model_engine(batch)
-
- #runs backpropagation
- model_engine.backward(loss)
-
- #weight update
- model_engine.step()
-
- #save checkpoint
- if step % args.save_interval:
- client_sd['step'] = step
- ckpt_id = loss.item()
- model_engine.save_checkpoint(args.save_dir, ckpt_id, client_sd = client_sd)
-```
-
-DeepSpeed can automatically save and restore the model, optimizer, and the
-learning rate scheduler states while hiding away these details from the user.
-However, the user may want to save other data in addition to these that are
-unique to a given model training. To support these items, `save_checkpoint`
-accepts a client state dictionary `client_sd` for saving. These items can be
-retrieved from `load_checkpoint` as a return argument. In the example above,
-the `step` value is stored as part of the `client_sd`.
-
-
-## DeepSpeed Configuration
-DeepSpeed features can be enabled, disabled, or configured using a config JSON
-file that should be specified as `args.deepspeed_config`. A sample config file
-is shown below. For a full set of features see [core API
-doc](https://microsoft.github.io/DeepSpeed/docs/htmlfiles/api/full/index.html).
-
-```json
-{
- "train_batch_size": 8,
- "gradient_accumulation_steps": 1,
- "optimizer": {
- "type": "Adam",
- "params": {
- "lr": 0.00015
- }
- },
- "fp16": {
- "enabled": true
- },
- "zero_optimization": true
-}
-```
-
-## Multi-Node Environment Variables
-
-When training across multiple nodes we have found it useful to support
-propagating user-defined environment variables. By default DeepSpeed will
-propagate all NCCL and PYTHON related environment variables that are set. If
-you would like to propagate additional variables you can specify them in a
-dot-file named `.deepspeed_env` that contains a new-line separated list of
-`VAR=VAL` entries. The DeepSpeed launcher will look in the local path you are
-executing from and also in your home directory (`~/`).
-
-As a concrete example, some clusters require special NCCL variables to set
-prior to training. The user can simply add these variables to a
-`.deepspeed_env` file in their home directory that looks like this:
-```
-NCCL_IB_DISABLE=1
-NCCL_SOCKET_IFNAME=eth0
-```
-DeepSpeed will then make sure that these environment variables are set when
-launching each process on every node across their training job.
-
-# Launching DeepSpeed Training
-DeepSpeed installs the entry point `deepspeed` to launch distributed training.
-We illustrate an example usage of DeepSpeed with the following assumptions:
-
-1. You have already integrated DeepSpeed into your model
-2. `client_entry.py` is the entry script for your model
-3. `client args` is the `argparse` command line arguments
-4. `ds_config.json` is the configuration file for DeepSpeed
-
-
-## Resource Configuration (multi-node)
-DeepSpeed configures multi-node compute resources with hostfiles that are compatible with
-[OpenMPI](https://www.open-mpi.org/) and [Horovod](https://github.com/horovod/horovod).
-A hostfile is a list of *hostnames* (or SSH aliases), which are machines accessible via passwordless
-SSH, and *slot counts*, which specify the number of GPUs available on the system. For
-example,
-```
-worker-1 slots=4
-worker-2 slots=4
-```
-specifies that two machines named *worker-1* and *worker-2* each have four GPUs to use
-for training.
-
-Hostfiles are specified with the `--hostfile` command line option. If no hostfile is
-specified, DeepSpeed searches for `/job/hostfile`. If no hostfile is specified or found,
-DeepSpeed queries the number of GPUs on the local machine to discover the number of local
-slots available.
-
-
-The following command launches a PyTorch training job across all available nodes and GPUs
-specified in `myhostfile`:
-```bash
-deepspeed \
- --deepspeed --deepspeed_config ds_config.json --hostfile=myhostfile
-```
-
-Alternatively, DeepSpeed allows you to restrict distributed training of your model to a
-subset of the available nodes and GPUs. This feature is enabled through two command line
-arguments: `--num_nodes` and `--num_gpus`. For example, distributed training can be
-restricted to use only two nodes with the following command:
-```bash
-deepspeed --num_nodes=2 \
- \
- --deepspeed --deepspeed_config ds_config.json
-```
-You can instead include or exclude specific resources using the `--include` and
-`--exclude` flags. For example, to use all available resources **except** GPU 0 on node
-*worker-2* and GPUs 0 and 1 on *worker-3*:
-```bash
-deepspeed --exclude="worker-2:0@worker-3:0,1" \
- \
- --deepspeed --deepspeed_config ds_config.json
-```
-Similarly, you can use **only** GPUs 0 and 1 on *worker-2*:
-```bash
-deepspeed --include="worker-2:0,1" \
- \
- --deepspeed --deepspeed_config ds_config.json
-```
-
-### MPI Compatibility
-As described above, DeepSpeed provides its own parallel launcher to help launch
-multi-node/multi-gpu training jobs. If you prefer to launch your training job
-using MPI (e.g., mpirun), we provide support for this. It should be noted that
-DeepSpeed will still use the torch distributed NCCL backend and *not* the MPI
-backend. To launch your training job with mpirun + DeepSpeed you simply pass us
-an additional flag `--deepspeed_mpi`. DeepSpeed will then use
-[mpi4py](https://pypi.org/project/mpi4py/) to discover the MPI environment (e.g.,
-rank, world size) and properly initialize torch distributed for training. In this
-case you will explicitly invoke `python` to launch your model script instead of using
-the `deepspeed` launcher, here is an example:
-```bash
-mpirun python \
- \
- --deepspeed_mpi --deepspeed --deepspeed_config ds_config.json
-```
-
-If you want to use this feature of DeepSpeed, please ensure that mpi4py is
-installed via `pip install mpi4py`.
-
-## Resource Configuration (single-node)
-In the case that we are only running on a single node (with one or more GPUs)
-DeepSpeed *does not* require a hostfile as described above. If a hostfile is
-not detected or passed in then DeepSpeed will query the number of GPUs on the
-local machine to discover the number of slots available. The `--include` and
-`--exclude` arguments work as normal, but the user should specify 'localhost'
-as the hostname.
-
# Further Reading
diff --git a/docs/news/index.html b/docs/news/index.html
new file mode 100644
index 000000000000..95e7974b5050
--- /dev/null
+++ b/docs/news/index.html
@@ -0,0 +1,3 @@
+---
+layout: news-home
+---