diff --git a/docs/_config.yml b/docs/_config.yml index 53fc21f3798f..1e85e57f232e 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -37,10 +37,11 @@ defaults: values: layout: single author_profile: false - read_time: true + read_time: false comments: false share: true related: false + sneak_preview: false # _tutorials - scope: path: "" diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml index e50c552240d4..1b48a31279b8 100644 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -1,17 +1,17 @@ main: - title: "Getting Started" url: /getting-started/ - - title: "Blog" - url: /blog/ + - title: "News" + url: /news/ - title: "Tutorials" url: /tutorials/ - title: "Documentation" - url: https://ghpages-test.readthedocs.io/ + url: https://deepspeed.readthedocs.io/ - title: "GitHub" url: https://github.com/microsoft/DeepSpeed lnav: - - title: "This is a floating nav bar." + - title: "DeepSpeed Documentation" - title: "Getting Started" url: /getting-started/ children: @@ -19,3 +19,5 @@ lnav: url: /getting-started/#installation - title: "Configuration" url: /getting-started/#deepspeed-configuration + - title: "DeepSpeed Features" + url: /features/ diff --git a/docs/_layouts/news-home.html b/docs/_layouts/news-home.html new file mode 100644 index 000000000000..3ac509f06ef7 --- /dev/null +++ b/docs/_layouts/news-home.html @@ -0,0 +1,27 @@ +--- +layout: archive +--- + +{{ content }} + + +{% if paginator %} + {% assign posts = paginator.posts %} +{% else %} + {% assign posts = site.posts %} +{% endif %} + + +

Features Coming Soon

+{% assign soon = posts | where: "sneak_preview", "true" %} +{% for post in soon %} + {% include archive-single.html %} +{% endfor %} + +

{{ site.data.ui-text[site.locale].recent_posts | default: "Recent Posts" }}

+{% assign news = posts | where: "sneak_preview", "false" %} +{% for post in news %} + {% include archive-single.html %} +{% endfor %} + +{% include paginator.html %} diff --git a/docs/_posts/2020-03-17-reduce-scatter.md b/docs/_posts/2020-03-17-reduce-scatter.md index b7e9b8fe176d..1753a22e3aa7 100644 --- a/docs/_posts/2020-03-17-reduce-scatter.md +++ b/docs/_posts/2020-03-17-reduce-scatter.md @@ -1,12 +1,11 @@ --- title: "ZeRO stage 1 with reduced communication" -date: 2020-03-13 +sneak_preview: true excerpt: "Partition-aware ZeRO with up to 2x reduction in communication time!" --- -# ZeRO stage 1 with reduced communication * Partition-aware approach instead of initial implementation that used a global collective (all-reduce) * Total communication volume reduction 1.5x -> 1x of data parallelism * Up to 2x reduction in communication time compared to all-reduce -# Further updates coming soon! +## Further updates coming soon! diff --git a/docs/_posts/2020-03-17-zero-stage2.md b/docs/_posts/2020-03-17-zero-stage2.md index 682dea687baf..18a1974cd956 100644 --- a/docs/_posts/2020-03-17-zero-stage2.md +++ b/docs/_posts/2020-03-17-zero-stage2.md @@ -1,12 +1,10 @@ --- title: "ZeRO stage 2" -date: 2020-03-13 +sneak_preview: true excerpt: "Reduce memory footprint to enable training 10B models without model parallelism!" --- - -# Zero Stage 2 * Reduce memory footprint of gradients * Train larger models: e.g., 10B parameters on 32GPUs without model parallelism * Train larger batch sizes -# Further updates coming soon! +## Further updates coming soon! diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md index 26875f50a726..c6be8a05638d 100644 --- a/docs/_tutorials/getting-started.md +++ b/docs/_tutorials/getting-started.md @@ -213,7 +213,7 @@ deepspeed --include="worker-2:0,1" \ \ --deepspeed --deepspeed_config ds_config.json ``` - +
### MPI Compatibility As described above, DeepSpeed provides its own parallel launcher to help launch multi-node/multi-gpu training jobs. If you prefer to launch your training job diff --git a/docs/assets/css/main.scss b/docs/assets/css/main.scss new file mode 100644 index 000000000000..c2583467e4b7 --- /dev/null +++ b/docs/assets/css/main.scss @@ -0,0 +1,48 @@ +--- +# Only the main Sass file needs front matter (the dashes are enough) +--- + +@charset "utf-8"; + +@import "minimal-mistakes/skins/{{ site.minimal_mistakes_skin | default: 'default' }}"; // skin +@import "minimal-mistakes"; // main partials + +// +// DeepSpeed customizations +// + + +.site-title { + display: -webkit-box; + display: -ms-flexbox; + display: flex; + -ms-flex-item-align: center; + align-self: center; + font-weight: bold; + font-size: $type-size-2; // DeepSpeed: increase size +} + + +.toc { + font-family: $sans-serif-narrow; + color: $gray; + background-color: $background-color; + border: 1px solid $border-color; + border-radius: $border-radius; + -webkit-box-shadow: $box-shadow; + box-shadow: $box-shadow; + position: fixed; + + .nav__title { + color: #fff; + font-size: $type-size-6; + background: $primary-color; + border-top-left-radius: $border-radius; + border-top-right-radius: $border-radius; + } + + // Scrollspy marks toc items as .active when they are in focus + .active a { + @include yiq-contrasted($active-color); + } +} diff --git a/docs/blog/index.html b/docs/blog/index.html deleted file mode 100644 index e4d427d215f4..000000000000 --- a/docs/blog/index.html +++ /dev/null @@ -1,3 +0,0 @@ ---- -layout: home ---- diff --git a/docs/features.md b/docs/features.md index e3f1d0beb4ff..f28efdae1062 100644 --- a/docs/features.md +++ b/docs/features.md @@ -1,6 +1,7 @@ --- title: "Feature Overview" layout: single +permalink: /features/ toc: true toc_label: "Contents" --- diff --git a/docs/index.md b/docs/index.md index 9412da692981..a7a7e0e428b4 100644 --- a/docs/index.md +++ b/docs/index.md 
@@ -18,6 +18,16 @@ a language model (LM) with over 17B parameters called [Turing-NLG](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft), establishing a new SOTA in the LM category. +# What's New? +{% assign news = site.posts | where: "sneak_preview", "false" %} +{% for post in news limit:5 %} + {% if post.link %} + * [{{ post.title }}]({{ post.link }}) + {% else %} + * [{{ post.title }}]({{ post.url }}) + {% endif %} +{% endfor %} + # Why DeepSpeed? Training advanced deep learning models is challenging. Beyond model design, @@ -56,8 +66,6 @@ optimizations on advanced hyperparameter tuning and optimizers. For example: | 256 V100 GPUs | NVIDIA | 3.9 | | 256 V100 GPUs | DeepSpeed | **3.7** | - - *BERT Tutorial*: Coming Soon * DeepSpeed trains GPT2 (1.5 billion parameters) 3.75x faster than state-of-art, NVIDIA @@ -157,247 +165,6 @@ overview](features) for descriptions and usage. * [Performance Analysis and Debugging](features.md#performance-analysis-and-debugging) -# Getting Started - - -## Installation - -* Please see our [Azure tutorial](docs/azure.md) to get started with DeepSpeed on Azure! -* If you're not on Azure, we recommend using our docker image via `docker pull deepspeed/deepspeed:latest` which contains a pre-installed version of DeepSpeed and all the necessary dependencies. -* If you want to install DeepSpeed manually, we provide an install script [install.sh](install.sh) to help install on a local machine or across an entire cluster. - -## Writing DeepSpeed Models -DeepSpeed model training is accomplished using the DeepSpeed engine. The engine -can wrap any arbitrary model of type `torch.nn.module` and has a minimal set of APIs -for training and checkpointing the model. Please see the tutorials for detailed -examples. 
- -To initialize the DeepSpeed engine: -```python -model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args, - model=model, - model_parameters=params) -``` - -`deepspeed.inialize` ensures that all of the necessary setup required for -distributed data parallel or mixed precision training are done -appropriately under the hood. In addition to wrapping the model, DeepSpeed can -construct and manage the training optimizer, data loader, and the learning rate -scheduler based on the parameters passed to `deepspeed.initialze` and the -DeepSpeed [configuration file](#deepspeed-configuration). - - -### Training - -Once the DeepSpeed engine has been initialized, it can be used to train the -model using three simple APIs for forward propagation (`()`), backward -propagation (`backward`), and weight updates (`step`). - -```python -for step, batch in enumerate(data_loader): - #forward() method - loss = model_engine(batch) - - #runs backpropagation - model_engine.backward(loss) - - #weight update - model_engine.step() -``` - - -Under the hood, DeepSpeed automatically performs the necessary operations -required for distributed data parallel training, in mixed precision, with a -pre-defined learning rate schedule: - -* **Gradient Averaging**: in distributed data parallel training, `backward` - ensures that gradients are averaged across data parallel processes after - training on an `train_batch_size`. - -* **Loss Scaling**: in FP16/mixed precision training, the DeepSpeed - engine automatically handles scaling the loss to avoid precision loss in the - gradients. - -* **Learning Rate Schedule**: if using DeepSpeed's learning rate - schedule, then DeepSpeed automatically handles any updates to the learning - rate when `step` is executed. 
- - - -### Model Checkpointing -Saving and loading the training state is handled via the `save_checkpoint` and -`load_checkpoint` API in DeepSpeed which takes two arguments to uniquely -identify a checkpoint: - * `ckpt_dir`: the directory where checkpoints will be saved. - * `ckpt_id`: an identifier that uniquely identifies a checkpoint in the directory. - In the following code snippet, we use the loss value as the checkpoint identifier. - -```python -#load checkpoint -_, client_sd = model_engine.load_checkpoint(args.load_dir, args.ckpt_id) -step = client_sd['step'] - -#advance data loader to ckpt step -dataloader_to_step(data_loader, step + 1) - -for step, batch in enumerate(data_loader): - - #forward() method - loss = model_engine(batch) - - #runs backpropagation - model_engine.backward(loss) - - #weight update - model_engine.step() - - #save checkpoint - if step % args.save_interval: - client_sd['step'] = step - ckpt_id = loss.item() - model_engine.save_checkpoint(args.save_dir, ckpt_id, client_sd = client_sd) -``` - -DeepSpeed can automatically save and restore the model, optimizer, and the -learning rate scheduler states while hiding away these details from the user. -However, the user may want to save other data in addition to these that are -unique to a given model training. To support these items, `save_checkpoint` -accepts a client state dictionary `client_sd` for saving. These items can be -retrieved from `load_checkpoint` as a return argument. In the example above, -the `step` value is stored as part of the `client_sd`. - - -## DeepSpeed Configuration -DeepSpeed features can be enabled, disabled, or configured using a config JSON -file that should be specified as `args.deepspeed_config`. A sample config file -is shown below. For a full set of features see [core API -doc](https://microsoft.github.io/DeepSpeed/docs/htmlfiles/api/full/index.html). 
- -```json -{ - "train_batch_size": 8, - "gradient_accumulation_steps": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": true - }, - "zero_optimization": true -} -``` - -## Multi-Node Environment Variables - -When training across multiple nodes we have found it useful to support -propagating user-defined environment variables. By default DeepSpeed will -propagate all NCCL and PYTHON related environment variables that are set. If -you would like to propagate additional variables you can specify them in a -dot-file named `.deepspeed_env` that contains a new-line separated list of -`VAR=VAL` entries. The DeepSpeed launcher will look in the local path you are -executing from and also in your home directory (`~/`). - -As a concrete example, some clusters require special NCCL variables to set -prior to training. The user can simply add these variables to a -`.deepspeed_env` file in their home directory that looks like this: -``` -NCCL_IB_DISABLE=1 -NCCL_SOCKET_IFNAME=eth0 -``` -DeepSpeed will then make sure that these environment variables are set when -launching each process on every node across their training job. - -# Launching DeepSpeed Training -DeepSpeed installs the entry point `deepspeed` to launch distributed training. -We illustrate an example usage of DeepSpeed with the following assumptions: - -1. You have already integrated DeepSpeed into your model -2. `client_entry.py` is the entry script for your model -3. `client args` is the `argparse` command line arguments -4. `ds_config.json` is the configuration file for DeepSpeed - - -## Resource Configuration (multi-node) -DeepSpeed configures multi-node compute resources with hostfiles that are compatible with -[OpenMPI](https://www.open-mpi.org/) and [Horovod](https://github.com/horovod/horovod). 
-A hostfile is a list of *hostnames* (or SSH aliases), which are machines accessible via passwordless -SSH, and *slot counts*, which specify the number of GPUs available on the system. For -example, -``` -worker-1 slots=4 -worker-2 slots=4 -``` -specifies that two machines named *worker-1* and *worker-2* each have four GPUs to use -for training. - -Hostfiles are specified with the `--hostfile` command line option. If no hostfile is -specified, DeepSpeed searches for `/job/hostfile`. If no hostfile is specified or found, -DeepSpeed queries the number of GPUs on the local machine to discover the number of local -slots available. - - -The following command launches a PyTorch training job across all available nodes and GPUs -specified in `myhostfile`: -```bash -deepspeed \ - --deepspeed --deepspeed_config ds_config.json --hostfile=myhostfile -``` - -Alternatively, DeepSpeed allows you to restrict distributed training of your model to a -subset of the available nodes and GPUs. This feature is enabled through two command line -arguments: `--num_nodes` and `--num_gpus`. For example, distributed training can be -restricted to use only two nodes with the following command: -```bash -deepspeed --num_nodes=2 \ - \ - --deepspeed --deepspeed_config ds_config.json -``` -You can instead include or exclude specific resources using the `--include` and -`--exclude` flags. For example, to use all available resources **except** GPU 0 on node -*worker-2* and GPUs 0 and 1 on *worker-3*: -```bash -deepspeed --exclude="worker-2:0@worker-3:0,1" \ - \ - --deepspeed --deepspeed_config ds_config.json -``` -Similarly, you can use **only** GPUs 0 and 1 on *worker-2*: -```bash -deepspeed --include="worker-2:0,1" \ - \ - --deepspeed --deepspeed_config ds_config.json -``` - -### MPI Compatibility -As described above, DeepSpeed provides its own parallel launcher to help launch -multi-node/multi-gpu training jobs. 
If you prefer to launch your training job -using MPI (e.g., mpirun), we provide support for this. It should be noted that -DeepSpeed will still use the torch distributed NCCL backend and *not* the MPI -backend. To launch your training job with mpirun + DeepSpeed you simply pass us -an additional flag `--deepspeed_mpi`. DeepSpeed will then use -[mpi4py](https://pypi.org/project/mpi4py/) to discover the MPI environment (e.g., -rank, world size) and properly initialize torch distributed for training. In this -case you will explicitly invoke `python` to launch your model script instead of using -the `deepspeed` launcher, here is an example: -```bash -mpirun python \ - \ - --deepspeed_mpi --deepspeed --deepspeed_config ds_config.json -``` - -If you want to use this feature of DeepSpeed, please ensure that mpi4py is -installed via `pip install mpi4py`. - -## Resource Configuration (single-node) -In the case that we are only running on a single node (with one or more GPUs) -DeepSpeed *does not* require a hostfile as described above. If a hostfile is -not detected or passed in then DeepSpeed will query the number of GPUs on the -local machine to discover the number of slots available. The `--include` and -`--exclude` arguments work as normal, but the user should specify 'localhost' -as the hostname. - # Further Reading diff --git a/docs/news/index.html b/docs/news/index.html new file mode 100644 index 000000000000..95e7974b5050 --- /dev/null +++ b/docs/news/index.html @@ -0,0 +1,3 @@ +--- +layout: news-home +---