diff --git a/DeepSpeedExamples b/DeepSpeedExamples index 20ea07a2a069..bdf8e59aede8 160000 --- a/DeepSpeedExamples +++ b/DeepSpeedExamples @@ -1 +1 @@ -Subproject commit 20ea07a2a069696abec212e25476a9bf76aced70 +Subproject commit bdf8e59aede8c8e0577e8d4d557298ca8515268f diff --git a/README.md b/README.md index 8323084ac6fe..2700b7175fe0 100755 --- a/README.md +++ b/README.md @@ -193,6 +193,8 @@ Conduct](https://opensource.microsoft.com/codeofconduct/). For more information 3. Minjia Zhang, Yuxiong He. (2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html). 4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840). 5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. (2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888). +6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857). + # Videos 1. DeepSpeed KDD 2020 Tutorial diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp new file mode 100644 index 000000000000..11927969c50f --- /dev/null +++ b/csrc/aio/common/deepspeed_aio_common.cpp @@ -0,0 +1,333 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "deepspeed_aio_common.h" + +using namespace std; +using namespace std::chrono; + +#define DEBUG_DS_AIO_PERF 0 +#define DEBUG_DS_AIO_SUBMIT_PERF 0 + +static const std::string c_library_name = "deepspeed_aio"; + +static void _report_aio_statistics(const char* tag, + const std::vector>& latencies) + __attribute__((unused)); + +static void _report_aio_statistics(const char* tag, + const std::vector>& latencies) +{ + std::vector lat_usec; + for (auto& lat : latencies) { lat_usec.push_back(lat.count() * 1e6); } + const auto min_lat = *(std::min_element(lat_usec.begin(), lat_usec.end())); + const auto max_lat = *(std::max_element(lat_usec.begin(), lat_usec.end())); + const auto avg_lat = std::accumulate(lat_usec.begin(), lat_usec.end(), 0) / lat_usec.size(); + + std::cout << c_library_name << ": latency statistics(usec) " << tag + << " min/max/avg = " << min_lat << " " << max_lat << " " << avg_lat << std::endl; +} + +static void _get_aio_latencies(std::vector>& raw_latencies, + struct deepspeed_aio_latency_t& summary_latencies) +{ + std::vector lat_usec; + for (auto& lat : raw_latencies) { lat_usec.push_back(lat.count() * 1e6); } + summary_latencies._min_usec = *(std::min_element(lat_usec.begin(), lat_usec.end())); + summary_latencies._max_usec = *(std::max_element(lat_usec.begin(), lat_usec.end())); + summary_latencies._avg_usec = + std::accumulate(lat_usec.begin(), lat_usec.end(), 0) / lat_usec.size(); +} + +static void _do_io_submit_singles(const long long int n_iocbs, + const long long int iocb_index, + std::unique_ptr& aio_ctxt, + std::vector>& submit_times) +{ + for (auto i = 0; i < n_iocbs; ++i) { + const auto st = std::chrono::high_resolution_clock::now(); + const auto submit_ret = io_submit(aio_ctxt->_io_ctxt, 1, aio_ctxt->_iocbs.data() + i); + submit_times.push_back(std::chrono::high_resolution_clock::now() - st); +#if DEBUG_DS_AIO_SUBMIT_PERF + printf("submit(usec) %f io_index=%lld buf=%p len=%lu off=%llu \n", + submit_times.back().count() * 1e6, + iocb_index, + aio_ctxt->_iocbs[i]->u.c.buf, + aio_ctxt->_iocbs[i]->u.c.nbytes, + aio_ctxt->_iocbs[i]->u.c.offset); +#endif + assert(submit_ret > 0); + } +} + +static void _do_io_submit_block(const long long int n_iocbs, + const long long int iocb_index, + std::unique_ptr& aio_ctxt, + std::vector>& submit_times) +{ + const auto st = std::chrono::high_resolution_clock::now(); + const auto submit_ret = io_submit(aio_ctxt->_io_ctxt, n_iocbs, aio_ctxt->_iocbs.data()); + submit_times.push_back(std::chrono::high_resolution_clock::now() - st); +#if DEBUG_DS_AIO_SUBMIT_PERF + printf("submit(usec) %f io_index=%lld nr=%lld buf=%p len=%lu off=%llu \n", + submit_times.back().count() * 1e6, + iocb_index, + n_iocbs, + aio_ctxt->_iocbs[0]->u.c.buf, + aio_ctxt->_iocbs[0]->u.c.nbytes, + aio_ctxt->_iocbs[0]->u.c.offset); +#endif + assert(submit_ret > 0); +} + +static int _do_io_complete(const long long int min_completes, + const long long int max_completes, + std::unique_ptr& aio_ctxt, + std::vector>& reap_times) +{ + const auto start_time = std::chrono::high_resolution_clock::now(); + const auto n_completes = io_getevents( + aio_ctxt->_io_ctxt, min_completes, max_completes, aio_ctxt->_io_events.data(), nullptr); + reap_times.push_back(std::chrono::high_resolution_clock::now() - start_time); + + assert(n_completes >= min_completes); + return 
n_completes; +} + +void do_aio_operation_sequential(const bool read_op, + std::unique_ptr& aio_ctxt, + std::unique_ptr& xfer_ctxt, + deepspeed_aio_config_t* config, + deepspeed_aio_perf_t* perf) +{ + struct io_prep_context prep_ctxt(read_op, xfer_ctxt, aio_ctxt->_block_size, &aio_ctxt->_iocbs); + + const auto num_io_blocks = static_cast( + ceil(static_cast(xfer_ctxt->_num_bytes) / aio_ctxt->_block_size)); +#if DEBUG_DS_AIO_PERF + const auto io_op_name = std::string(read_op ? "read" : "write"); + std::cout << c_library_name << ": start " << io_op_name << " " << xfer_ctxt->_num_bytes + << " bytes with " << num_io_blocks << " io blocks" << std::endl; +#endif + + std::vector> submit_times; + std::vector> reap_times; + const auto max_queue_bytes = + static_cast(aio_ctxt->_queue_depth * aio_ctxt->_block_size); + + auto start = std::chrono::high_resolution_clock::now(); + for (long long iocb_index = 0; iocb_index < num_io_blocks; + iocb_index += aio_ctxt->_queue_depth) { + const auto start_offset = iocb_index * aio_ctxt->_block_size; + const auto start_buffer = (char*)xfer_ctxt->_mem_buffer + start_offset; + const auto n_iocbs = + min(static_cast(aio_ctxt->_queue_depth), (num_io_blocks - iocb_index)); + const auto num_bytes = min(max_queue_bytes, (xfer_ctxt->_num_bytes - start_offset)); + prep_ctxt.prep_iocbs(n_iocbs, num_bytes, start_buffer, start_offset); + + if (config->_single_submit) { + _do_io_submit_singles(n_iocbs, iocb_index, aio_ctxt, submit_times); + } else { + _do_io_submit_block(n_iocbs, iocb_index, aio_ctxt, submit_times); + } + + _do_io_complete(n_iocbs, n_iocbs, aio_ctxt, reap_times); + } + const std::chrono::duration elapsed = std::chrono::high_resolution_clock::now() - start; + + if (perf) { + _get_aio_latencies(submit_times, perf->_submit); + _get_aio_latencies(reap_times, perf->_complete); + perf->_e2e_usec = elapsed.count() * 1e6; + perf->_e2e_rate_GB = (xfer_ctxt->_num_bytes / elapsed.count() / 1e9); + } + +#if DEBUG_DS_AIO_PERF + _report_aio_statistics("submit", submit_times); + _report_aio_statistics("complete", reap_times); +#endif + +#if DEBUG_DS_AIO_PERF + std::cout << c_library_name << ": runtime(usec) " << elapsed.count() * 1e6 + << " rate(GB/sec) = " << (xfer_ctxt->_num_bytes / elapsed.count() / 1e9) << std::endl; +#endif + +#if DEBUG_DS_AIO_PERF + std::cout << c_library_name << ": finish " << io_op_name << " " << xfer_ctxt->_num_bytes + << " bytes " << std::endl; +#endif +} + +void do_aio_operation_overlap(const bool read_op, + std::unique_ptr& aio_ctxt, + std::unique_ptr& xfer_ctxt, + deepspeed_aio_config_t* config, + deepspeed_aio_perf_t* perf) +{ + struct io_prep_generator io_gen(read_op, xfer_ctxt, aio_ctxt->_block_size); + +#if DEBUG_DS_AIO_PERF + const auto io_op_name = std::string(read_op ? 
"read" : "write"); + std::cout << c_library_name << ": start " << io_op_name << " " << xfer_ctxt->_num_bytes + << " bytes with " << io_gen._num_io_blocks << " io blocks" << std::endl; +#endif + + std::vector> submit_times; + std::vector> reap_times; + + auto request_iocbs = aio_ctxt->_queue_depth; + auto n_pending_iocbs = 0; + const auto min_completes = 1; + auto start = std::chrono::high_resolution_clock::now(); + while (true) { + const auto n_iocbs = io_gen.prep_iocbs(request_iocbs - n_pending_iocbs, &aio_ctxt->_iocbs); + if (n_iocbs > 0) { + if (config->_single_submit) { + _do_io_submit_singles( + n_iocbs, (io_gen._next_iocb_index - n_iocbs), aio_ctxt, submit_times); + } else { + _do_io_submit_block( + n_iocbs, (io_gen._next_iocb_index - n_iocbs), aio_ctxt, submit_times); + } + } + + n_pending_iocbs += n_iocbs; + assert(n_pending_iocbs <= aio_ctxt->_queue_depth); + + if (n_pending_iocbs == 0) { break; } + + const auto n_complete = + _do_io_complete(min_completes, n_pending_iocbs, aio_ctxt, reap_times); + n_pending_iocbs -= n_complete; + } + + const std::chrono::duration elapsed = std::chrono::high_resolution_clock::now() - start; + + if (perf) { + _get_aio_latencies(submit_times, perf->_submit); + _get_aio_latencies(reap_times, perf->_complete); + perf->_e2e_usec = elapsed.count() * 1e6; + perf->_e2e_rate_GB = (xfer_ctxt->_num_bytes / elapsed.count() / 1e9); + } + +#if DEBUG_DS_AIO_PERF + _report_aio_statistics("submit", submit_times); + _report_aio_statistics("complete", reap_times); +#endif + +#if DEBUG_DS_AIO_PERF + std::cout << c_library_name << ": runtime(usec) " << elapsed.count() * 1e6 + << " rate(GB/sec) = " << (xfer_ctxt->_num_bytes / elapsed.count() / 1e9) << std::endl; +#endif + +#if DEBUG_DS_AIO_PERF + std::cout << c_library_name << ": finish " << io_op_name << " " << xfer_ctxt->_num_bytes + << " bytes " << std::endl; +#endif +} + +void report_file_error(const char* filename, const std::string file_op, const int error_code) +{ + std::string err_msg = file_op + std::string(" failed on ") + std::string(filename) + + " error = " + std::to_string(error_code); + std::cerr << c_library_name << ": " << err_msg << std::endl; +} + +int open_file(const char* filename, const bool read_op) +{ + const int flags = read_op ? (O_RDONLY | __O_DIRECT) : (O_WRONLY | O_CREAT | __O_DIRECT); + const int mode = 0600; + const auto fd = open(filename, flags, mode); + if (fd == -1) { + const auto error_code = errno; + const auto error_msg = read_op ? 
" open for read " : " open for write "; + report_file_error(filename, error_msg, error_code); + return -1; + } + return fd; +} + +int regular_read(const char* filename, std::vector& buffer) +{ + long long int num_bytes; + const auto f_size = get_file_size(filename, num_bytes); + assert(f_size != -1); + buffer.resize(num_bytes); + const auto fd = open(filename, O_RDONLY, 0600); + assert(fd != -1); + long long int read_bytes = 0; + auto r = 0; + do { + const auto buffer_ptr = buffer.data() + read_bytes; + const auto bytes_to_read = num_bytes - read_bytes; + r = read(fd, buffer_ptr, bytes_to_read); + read_bytes += r; + } while (r > 0); + + if (read_bytes != num_bytes) { + std::cerr << "read error " + << " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes + << std::endl; + } + assert(read_bytes == num_bytes); + close(fd); + return 0; +} + +static bool _validate_buffer(const char* filename, void* aio_buffer, const long long int num_bytes) +{ + std::vector regular_buffer; + const auto reg_ret = regular_read(filename, regular_buffer); + assert(0 == reg_ret); + std::cout << "regular read of " << filename << " returned " << regular_buffer.size() << " bytes" + << std::endl; + + if (static_cast(regular_buffer.size()) != num_bytes) { return false; } + + return (0 == memcmp(aio_buffer, regular_buffer.data(), regular_buffer.size())); +} + +bool validate_aio_operation(const bool read_op, + const char* filename, + void* aio_buffer, + const long long int num_bytes) +{ + const auto msg_suffix = std::string("deepspeed_aio_") + + std::string(read_op ? "read()" : "write()") + + std::string("using read()"); + + if (false == _validate_buffer(filename, aio_buffer, num_bytes)) { + std::cout << "Fail: correctness of " << msg_suffix << std::endl; + return false; + } + + std::cout << "Pass: correctness of " << msg_suffix << std::endl; + return true; +} diff --git a/csrc/aio/common/deepspeed_aio_common.h b/csrc/aio/common/deepspeed_aio_common.h new file mode 100644 index 000000000000..1f32fc8f794f --- /dev/null +++ b/csrc/aio/common/deepspeed_aio_common.h @@ -0,0 +1,36 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include +#include +#include + +using namespace std; + +void do_aio_operation_sequential(const bool read_op, + std::unique_ptr& aio_ctxt, + std::unique_ptr& xfer_ctxt, + deepspeed_aio_config_t* config, + deepspeed_aio_perf_t* perf); + +void do_aio_operation_overlap(const bool read_op, + std::unique_ptr& aio_ctxt, + std::unique_ptr& xfer_ctxt, + deepspeed_aio_config_t* config, + deepspeed_aio_perf_t* perf); + +int open_file(const char* filename, const bool read_op); + +void report_file_error(const char* filename, const std::string file_op, const int error_code); + +int regular_read(const char* filename, std::vector& buffer); + +bool validate_aio_operation(const bool read_op, + const char* filename, + void* aio_buffer, + const long long int num_bytes); diff --git a/csrc/aio/common/deepspeed_aio_types.cpp b/csrc/aio/common/deepspeed_aio_types.cpp new file mode 100644 index 000000000000..5f717c3b5658 --- /dev/null +++ b/csrc/aio/common/deepspeed_aio_types.cpp @@ -0,0 +1,74 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#include + +#include "deepspeed_aio_utils.h" + +using namespace std; + +const int c_block_size = 128 * 1024; +const int c_io_queue_depth = 8; + +deepspeed_aio_config_t::deepspeed_aio_config_t() + : _block_size(c_block_size), + _queue_depth(c_io_queue_depth), + _single_submit(false), + _overlap_events(false), + _lock_memory(false) +{ +} + +deepspeed_aio_config_t::deepspeed_aio_config_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool lock_memory) + : _block_size(block_size), + _queue_depth(queue_depth), + _single_submit(single_submit), + _overlap_events(overlap_events), + _lock_memory(lock_memory) +{ +} + +void deepspeed_aio_latency_t::dump(const std::string tag) +{ + std::cout << tag << _min_usec << " " << _max_usec << " " << _avg_usec << " " << std::endl; +} + +void deepspeed_aio_latency_t::accumulate(const struct deepspeed_aio_latency_t& other) +{ + _min_usec += other._min_usec; + _max_usec += other._max_usec; + _avg_usec += other._avg_usec; +} + +void deepspeed_aio_latency_t::scale(const float scaler) +{ + _min_usec *= scaler; + _max_usec *= scaler; + _avg_usec *= scaler; +} + +aio_context::aio_context(const int block_size, const int queue_depth) +{ + _block_size = block_size; + _queue_depth = queue_depth; + for (auto i = 0; i < queue_depth; ++i) { + _iocbs.push_back((struct iocb*)calloc(1, sizeof(struct iocb))); + } + _io_events.resize(queue_depth); + io_queue_init(queue_depth, &_io_ctxt); +} + +aio_context::~aio_context() +{ + for (auto& iocb : _iocbs) { free(iocb); } + _io_events.resize(0); + io_queue_release(_io_ctxt); +} diff --git a/csrc/aio/common/deepspeed_aio_types.h b/csrc/aio/common/deepspeed_aio_types.h new file mode 100644 index 000000000000..5c5dcdf0b559 --- /dev/null +++ b/csrc/aio/common/deepspeed_aio_types.h @@ -0,0 +1,57 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include + +#include +#include + +using namespace std; + +struct deepspeed_aio_latency_t { + double _min_usec; + double _max_usec; + double _avg_usec; + + void dump(const std::string tag); + void accumulate(const deepspeed_aio_latency_t&); + void scale(const float value); +}; + +struct deepspeed_aio_perf_t { + deepspeed_aio_latency_t _submit; + deepspeed_aio_latency_t _complete; + double _e2e_usec; + double _e2e_rate_GB; +}; + +struct deepspeed_aio_config_t { + const int _block_size; + const int _queue_depth; + const bool _single_submit; + const bool _overlap_events; + const bool _lock_memory; + + deepspeed_aio_config_t(); + deepspeed_aio_config_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool lock_memory); +}; + +struct aio_context { + io_context_t _io_ctxt; + std::vector _io_events; + std::vector _iocbs; + int _block_size; + int _queue_depth; + + aio_context(const int block_size, const int queue_depth); + ~aio_context(); +}; diff --git a/csrc/aio/common/deepspeed_aio_utils.cpp b/csrc/aio/common/deepspeed_aio_utils.cpp new file mode 100644 index 000000000000..a3d89be5ad3e --- /dev/null +++ b/csrc/aio/common/deepspeed_aio_utils.cpp @@ -0,0 +1,123 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#include + +#include "deepspeed_aio_utils.h" + +using namespace std; + +const int c_block_size = 128 * 1024; +const int c_io_queue_depth = 8; + +io_xfer_ctxt::io_xfer_ctxt(const int fd, + const long long int file_offset, + const long long int num_bytes, + const void* buffer) + : _fd(fd), _base_offset(file_offset), _mem_buffer(buffer), _num_bytes(num_bytes) +{ +} + +io_prep_context::io_prep_context(const bool read_op, + const std::unique_ptr& xfer_ctxt, + const size_t block_size, + const std::vector* iocbs) + : _read_op(read_op), _xfer_ctxt(xfer_ctxt), _block_size(block_size), _iocbs(iocbs) +{ +} + +void io_prep_context::prep_iocbs(const int n_iocbs, + const size_t num_bytes, + const void* start_buffer, + const long long int start_offset) +{ + assert(static_cast(n_iocbs) <= _iocbs->size()); + for (auto i = 0; i < n_iocbs; ++i) { + const auto shift = i * _block_size; + const auto xfer_buffer = (char*)start_buffer + _xfer_ctxt->_base_offset + shift; + const auto xfer_offset = _xfer_ctxt->_base_offset + start_offset + shift; + auto byte_count = _block_size; + if ((shift + _block_size) > num_bytes) { byte_count = num_bytes - shift; } + + if (_read_op) { + io_prep_pread(_iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, byte_count, xfer_offset); + } else { + io_prep_pwrite(_iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, byte_count, xfer_offset); + } + } +} + +io_prep_generator::io_prep_generator(const bool read_op, + const std::unique_ptr& xfer_ctxt, + const size_t block_size) + : _read_op(read_op), + _xfer_ctxt(xfer_ctxt), + _block_size(block_size), + _remaining_bytes(xfer_ctxt->_num_bytes), + _next_iocb_index(0) +{ + _num_io_blocks = + static_cast(ceil(static_cast(xfer_ctxt->_num_bytes) / block_size)); + _remaining_io_blocks = _num_io_blocks; +} + +int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector* iocbs) +{ + if ((_remaining_bytes) == 0 || (_remaining_io_blocks == 0)) { + assert(static_cast(_remaining_bytes) == _remaining_io_blocks); + return 0; + } + + assert(static_cast(n_iocbs) <= iocbs->size()); + + auto actual_n_iocbs = min(static_cast(n_iocbs), _remaining_io_blocks); + for (auto i = 0; i < actual_n_iocbs; ++i, ++_next_iocb_index) { + const auto xfer_offset = _xfer_ctxt->_base_offset + (_next_iocb_index * _block_size); + const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + xfer_offset; + const auto num_bytes = min(static_cast(_block_size), _remaining_bytes); + + if (_read_op) { + io_prep_pread(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset); + } else { + io_prep_pwrite(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset); + } + _remaining_bytes -= num_bytes; + } + _remaining_io_blocks -= actual_n_iocbs; + + return actual_n_iocbs; +} + +int get_file_size(const char* filename, long long int& size) +{ + struct stat st; + if (stat(filename, &st) == -1) { return -1; } + size = st.st_size; + return 0; +} + +void* ds_page_aligned_alloc(const size_t size, const bool lock) +{ + void* ptr; + int retval; + + retval = posix_memalign(&ptr, (size_t)sysconf(_SC_PAGESIZE), size); + if (retval) { return nullptr; } + + if (lock == false) { return ptr; } + + auto mlock_ret = mlock(ptr, size); + if (mlock_ret != 0) { + auto mlock_error = errno; + printf("mlock failed with %d %s\n", mlock_error, strerror(mlock_error)); + + free(ptr); + return nullptr; + } + + return ptr; +} diff --git a/csrc/aio/common/deepspeed_aio_utils.h b/csrc/aio/common/deepspeed_aio_utils.h new file mode 100644 index 000000000000..f37a95c5149a --- /dev/null +++ 
b/csrc/aio/common/deepspeed_aio_utils.h @@ -0,0 +1,77 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +struct io_xfer_ctxt { + const int _fd; + const long long int _base_offset; + const void* _mem_buffer; + const long long int _num_bytes; + + io_xfer_ctxt(const int fd, + const long long int file_offset, + const long long int num_bytes, + const void* buffer); +}; + +struct io_prep_context { + const bool _read_op; + const std::unique_ptr& _xfer_ctxt; + const size_t _block_size; + const std::vector* _iocbs; + + io_prep_context(const bool read_op, + const std::unique_ptr& xfer_ctxt, + const size_t block_size, + const std::vector* iocbs); + + void prep_iocbs(const int n_iocbs, + const size_t num_bytes, + const void* start_buffer, + const long long int start_offset); +}; + +struct io_prep_generator { + const bool _read_op; + const std::unique_ptr& _xfer_ctxt; + const size_t _block_size; + + long long int _remaining_bytes; + long long int _num_io_blocks; + long long int _remaining_io_blocks; + long long int _next_iocb_index; + + io_prep_generator(const bool read_op, + const std::unique_ptr& xfer_ctxt, + const size_t block_size); + + int prep_iocbs(const int n_iocbs, std::vector* iocbs); +}; + +void* ds_page_aligned_alloc(const size_t size, const bool lock = false); + +int get_file_size(const char* filename, long long int& size); diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.cpp b/csrc/aio/py_lib/deepspeed_aio_thread.cpp new file mode 100644 index 000000000000..2c7509cb3ba0 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_aio_thread.cpp @@ -0,0 +1,84 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include "deepspeed_aio_thread.h" + +using namespace std; + +io_op_desc_t::io_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int num_bytes, + const bool validate) + : _read_op(read_op), + _buffer(buffer), + _fd(fd), + _filename(filename), + _num_bytes(num_bytes), + _validate(validate) +{ + _cpu_buffer = _buffer.is_cuda() ? 
_buffer.to(torch::kCPU).pin_memory() : _buffer; + _contiguous_buffer = _cpu_buffer.contiguous(); +} + +char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } + +void io_op_desc_t::fini() +{ + if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); } +} + +deepspeed_aio_thread_t::deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config) + : _tid(tid), + _aio_config(aio_config), + _aio_ctxt(new aio_context(aio_config._block_size, aio_config._queue_depth)), + _time_to_exit(false) +{ +} + +deepspeed_aio_thread_t::~deepspeed_aio_thread_t() {} + +void deepspeed_aio_thread_t::run() +{ + while (true) { + std::shared_ptr next_io_op = nullptr; + + { + std::unique_lock lock(_work_sync._mutex); + _work_sync._cond_var.wait(lock, + [this] { return (!_work_queue.empty() || _time_to_exit); }); + if (!_work_queue.empty()) { + next_io_op = _work_queue.front(); + _work_queue.pop(); + } + } + + if (next_io_op) { + const auto base_offset = next_io_op->_num_bytes * _tid; + + std::unique_ptr xfer_ctxt(new io_xfer_ctxt( + next_io_op->_fd, base_offset, next_io_op->_num_bytes, next_io_op->data_ptr())); + + if (_aio_config._overlap_events) { + do_aio_operation_overlap( + next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } else { + do_aio_operation_sequential( + next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } + + { + std::lock_guard lock(_complete_sync._mutex); + _complete_queue.push(next_io_op); + } + _complete_sync._cond_var.notify_one(); + } + + if (_time_to_exit) { break; } + } +} diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.h b/csrc/aio/py_lib/deepspeed_aio_thread.h new file mode 100644 index 000000000000..ee099dd2d16c --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_aio_thread.h @@ -0,0 +1,57 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include +#include +#include "deepspeed_py_aio.h" + +struct io_op_desc_t { + const bool _read_op; + torch::Tensor _buffer; + int _fd; + const std::string _filename; + const long long int _num_bytes; + torch::Tensor _cpu_buffer; + torch::Tensor _contiguous_buffer; + const bool _validate; + + io_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int num_bytes, + const bool validate); + + char* data_ptr() const; + void fini(); +}; + +struct thread_sync_t { + std::mutex _mutex; + std::condition_variable _cond_var; +}; + +struct deepspeed_aio_thread_t { + const int _tid; + deepspeed_aio_config_t& _aio_config; + + std::unique_ptr _aio_ctxt; + std::queue> _work_queue; + std::queue> _complete_queue; + + bool _time_to_exit; + + struct thread_sync_t _work_sync; + struct thread_sync_t _complete_sync; + + deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config); + + ~deepspeed_aio_thread_t(); + + void run(); +}; diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp new file mode 100644 index 000000000000..cc2895cc74b3 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp @@ -0,0 +1,121 @@ + +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "deepspeed_py_aio.h" + +using namespace std; +using namespace std::chrono; + +#define DEBUG_DS_AIO_READ 0 +#define DEBUG_DS_AIO_WRITE 0 + +static const std::string c_library_name = "deepspeed_aio"; + +int deepspeed_py_aio_write(const torch::Tensor& buffer, + const char* filename, + const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool validate) +{ + const auto start_time = std::chrono::high_resolution_clock::now(); + deepspeed_aio_config_t config(block_size, queue_depth, single_submit, overlap_events, false); + + const auto fd = open_file(filename, false); + if (fd == -1) { return -1; } + + auto write_buffer = (char*)buffer.data_ptr(); + const auto num_write_bytes = static_cast(buffer.nbytes()); + std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); + std::unique_ptr aio_ctxt(new aio_context(config._block_size, config._queue_depth)); + + if (config._overlap_events) { + do_aio_operation_overlap(false, aio_ctxt, xfer_ctxt, &config, nullptr); + } else { + do_aio_operation_sequential(false, aio_ctxt, xfer_ctxt, &config, nullptr); + } + const std::chrono::duration aio_time = + std::chrono::high_resolution_clock::now() - start_time; + + close(fd); + + if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); } + + const std::chrono::duration fn_time = + std::chrono::high_resolution_clock::now() - start_time; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; + return 0; +} + +int deepspeed_py_aio_read(torch::Tensor& buffer, + const char* filename, + const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool validate) +{ + const auto start_time = std::chrono::high_resolution_clock::now(); + long long num_file_bytes; + if (-1 == get_file_size(filename, num_file_bytes)) { + const auto error_code = errno; + report_file_error(filename, " fstat for read", error_code); + return -1; + } + + deepspeed_aio_config_t config(block_size, queue_depth, single_submit, overlap_events, false); + const auto fd = open_file(filename, true); + if (fd == -1) { return -1; } + + auto read_buffer = (char*)buffer.data_ptr(); + assert(static_cast(buffer.nbytes()) == num_file_bytes); + + std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); + std::unique_ptr aio_ctxt(new aio_context(config._block_size, config._queue_depth)); + + if (config._overlap_events) { + do_aio_operation_overlap(true, aio_ctxt, xfer_ctxt, &config, nullptr); + } else { + do_aio_operation_sequential(true, aio_ctxt, xfer_ctxt, &config, nullptr); + } + const std::chrono::duration aio_time = + std::chrono::high_resolution_clock::now() - start_time; + + close(fd); + + if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); } + + const std::chrono::duration fn_time = + std::chrono::high_resolution_clock::now() - start_time; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; + return 0; +} diff --git a/csrc/aio/py_lib/deepspeed_py_aio.h b/csrc/aio/py_lib/deepspeed_py_aio.h new file mode 100644 index 000000000000..a78d5734009d --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_aio.h @@ -0,0 
+1,27 @@ + +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include +#include + +int deepspeed_py_aio_write(const torch::Tensor& buffer, + const char* filename, + const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool validate); + +int deepspeed_py_aio_read(torch::Tensor& buffer, + const char* filename, + const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool validate); diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp new file mode 100644 index 000000000000..8d4cfd9e8636 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -0,0 +1,282 @@ + +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include "deepspeed_py_aio_handle.h" + +using namespace std; + +static void _start_aio_thread(std::shared_ptr ctxt) { ctxt->run(); } + +deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const int num_threads) + : _aio_ctxt(new aio_context(block_size, queue_depth)), + _single_submit(single_submit), + _overlap_events(overlap_events), + _num_threads(num_threads), + _aio_config(block_size, queue_depth, single_submit, overlap_events, false), + _num_pending_ops(0) +{ + for (auto i = 0; i < num_threads; ++i) { + _thread_contexts.push_back(std::make_shared(i, _aio_config)); + } + + for (auto& ctxt : _thread_contexts) { + _threads.push_back(std::thread(_start_aio_thread, ctxt)); + } +} + +deepspeed_aio_handle_t::~deepspeed_aio_handle_t() +{ + _stop_threads(); + for (auto& thr : _threads) { thr.join(); } +} + +const int deepspeed_aio_handle_t::get_block_size() const +{ + return _aio_ctxt ? _aio_ctxt->_block_size : -1; +} + +const int deepspeed_aio_handle_t::get_queue_depth() const +{ + return _aio_ctxt ? 
_aio_ctxt->_queue_depth : -1; +} + +const bool deepspeed_aio_handle_t::get_single_submit() const { return _single_submit; } + +const bool deepspeed_aio_handle_t::get_overlap_events() const { return _overlap_events; } + +const int deepspeed_aio_handle_t::get_thread_count() const { return _num_threads; } + +int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate) +{ + const auto start_time = std::chrono::high_resolution_clock::now(); + + assert(_aio_ctxt); + + long long num_file_bytes; + if (-1 == get_file_size(filename, num_file_bytes)) { + const auto error_code = errno; + report_file_error(filename, " fstat for read", error_code); + return -1; + } + assert(static_cast(buffer.nbytes()) == num_file_bytes); + + const auto fd = open_file(filename, true); + if (fd == -1) { return -1; } + + auto read_buffer = (char*)buffer.data_ptr(); + std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); + + if (_aio_config._overlap_events) { + do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } else { + do_aio_operation_sequential(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } + + close(fd); + const std::chrono::duration aio_time = + std::chrono::high_resolution_clock::now() - start_time; + + if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); } + const std::chrono::duration fn_time = + std::chrono::high_resolution_clock::now() - start_time; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; + return 0; +} + +int deepspeed_aio_handle_t::write(const torch::Tensor& buffer, + const char* filename, + const bool validate) +{ + assert(_aio_ctxt); + + const auto start_time = std::chrono::high_resolution_clock::now(); + + const auto fd = open_file(filename, false); + if (fd == -1) { return -1; } + + auto write_buffer = (char*)buffer.data_ptr(); + const auto num_write_bytes = static_cast(buffer.nbytes()); + std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); + + if (_aio_config._overlap_events) { + do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } else { + do_aio_operation_sequential(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } + const std::chrono::duration aio_time = + std::chrono::high_resolution_clock::now() - start_time; + + close(fd); + + if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); } + + const std::chrono::duration fn_time = + std::chrono::high_resolution_clock::now() - start_time; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; + return 0; +} + +void deepspeed_aio_handle_t::_schedule_aio_work(std::shared_ptr scheduled_op) +{ + for (auto& ctxt : _thread_contexts) { + { + std::lock_guard lock(ctxt->_work_sync._mutex); + ctxt->_work_queue.push(scheduled_op); + } + ctxt->_work_sync._cond_var.notify_one(); + } + _num_pending_ops++; +} + +std::shared_ptr deepspeed_aio_handle_t::_wait_for_aio_work() +{ + std::shared_ptr completed_op = nullptr; + for (auto& ctxt : _thread_contexts) { + std::unique_lock lock(ctxt->_complete_sync._mutex); + ctxt->_complete_sync._cond_var.wait(lock, + [ctxt] { return !ctxt->_complete_queue.empty(); }); + completed_op = ctxt->_complete_queue.front(); + ctxt->_complete_queue.pop(); + } + return completed_op; +} + +void deepspeed_aio_handle_t::_stop_threads() +{ + assert(0 
== _num_pending_ops); + for (auto& ctxt : _thread_contexts) { + { + std::lock_guard lock(ctxt->_work_sync._mutex); + ctxt->_time_to_exit = true; + } + ctxt->_work_sync._cond_var.notify_one(); + } +} + +int deepspeed_aio_handle_t::wait() +{ + assert(_num_pending_ops > 0); + auto num_completed_ops = 0; + + while (_num_pending_ops > 0) { + auto completed_op = _wait_for_aio_work(); + + completed_op->fini(); + + close(completed_op->_fd); + + if (completed_op->_validate) { + validate_aio_operation(completed_op->_read_op, + completed_op->_filename.c_str(), + completed_op->data_ptr(), + completed_op->_num_bytes); + } + --_num_pending_ops; + ++num_completed_ops; + } + + return num_completed_ops; +} + +bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op, + const long long int num_bytes) +{ + const auto op_string = read_op ? "Read" : "Write"; + if (num_bytes % get_thread_count()) { + std::cout << "deepseed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes + << " not divisible by thread count = " << get_thread_count() << std::endl; + return false; + } + + return true; +} + +int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async) +{ + long long num_file_bytes; + if (-1 == get_file_size(filename, num_file_bytes)) { + const auto error_code = errno; + report_file_error(filename, " fstat for read", error_code); + return -1; + } + const auto buffer_bytes = static_cast(buffer.nbytes()); + if (buffer_bytes != num_file_bytes) { + std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes + << " != " << num_file_bytes << std::endl; + } + assert(static_cast(buffer.nbytes()) == num_file_bytes); + assert((num_file_bytes % _num_threads) == 0); + + if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; } + + const auto fd = open_file(filename, true); + if (fd == -1) { return -1; } + + auto scheduled_op = std::make_shared( + true, buffer, fd, filename, (num_file_bytes / _num_threads), validate); + + _schedule_aio_work(scheduled_op); + + if (async) { return 0; } + + return wait(); +} + +int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async) +{ + const auto num_write_bytes = static_cast(buffer.nbytes()); + assert((num_write_bytes % _num_threads) == 0); + + if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; } + + const auto fd = open_file(filename, false); + if (fd == -1) { return -1; } + + auto scheduled_op = std::make_shared( + false, buffer, fd, filename, (num_write_bytes / _num_threads), validate); + + _schedule_aio_work(scheduled_op); + + if (async) { return 0; } + + return wait(); +} + +int deepspeed_aio_handle_t::sync_pread(torch::Tensor& buffer, const char* filename) +{ + return pread(buffer, filename, false, false); +} + +int deepspeed_aio_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename) +{ + return pwrite(buffer, filename, false, false); +} + +int deepspeed_aio_handle_t::async_pread(torch::Tensor& buffer, const char* filename) +{ + return pread(buffer, filename, false, true); +} + +int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename) +{ + return pwrite(buffer, filename, false, true); +} diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h new file mode 100644 index 000000000000..09358f4d927b --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h @@ -0,0 +1,68 @@ 
+/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include <condition_variable> +#include <memory> +#include "deepspeed_aio_thread.h" + +struct deepspeed_aio_handle_t { + std::unique_ptr<struct aio_context> _aio_ctxt; + const bool _single_submit; + const bool _overlap_events; + const int _num_threads; + deepspeed_aio_config_t _aio_config; + + std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts; + std::vector<std::thread> _threads; + int _num_pending_ops; + + deepspeed_aio_handle_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const int num_threads); + + ~deepspeed_aio_handle_t(); + + const int get_block_size() const; + const int get_queue_depth() const; + const bool get_single_submit() const; + const bool get_overlap_events() const; + const int get_thread_count() const; + + int read(torch::Tensor& buffer, const char* filename, const bool validate); + + int write(const torch::Tensor& buffer, const char* filename, const bool validate); + + int pread(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async); + + int pwrite(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async); + + int sync_pread(torch::Tensor& buffer, const char* filename); + + int sync_pwrite(const torch::Tensor& buffer, const char* filename); + + int async_pread(torch::Tensor& buffer, const char* filename); + + int async_pwrite(const torch::Tensor& buffer, const char* filename); + + int wait(); + + void _stop_threads(); + + void _schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op); + + std::shared_ptr<struct io_op_desc_t> _wait_for_aio_work(); + + bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes); +}; diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp new file mode 100644 index 000000000000..3cdb5ed344bf --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp @@ -0,0 +1,133 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+*/ + +#include "deepspeed_py_copy.h" +#include + +#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) + +#if defined(__AVX512__) or defined(__AVX256__) +union AVX_Data { +#if defined(__AVX512__) + __m512 data; +#else + __m256 data; +#endif +}; +#endif + +static void helper_memcpy_1(float* dest, float* src, size_t param_size) +{ + size_t rounded_size = 0; + +#if defined(__AVX512__) or defined(__AVX256__) + + rounded_size = ROUND_DOWN(param_size, SIMD_WIDTH); + + for (size_t t = 0; t < rounded_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > rounded_size) copy_size = rounded_size - t; + size_t offset = copy_size + t; +#pragma omp parallel for + for (size_t i = t; i < offset; i += SIMD_WIDTH) { + AVX_Data src_4; + src_4.data = SIMD_LOAD(src + i); + + SIMD_STORE(dest + i, src_4.data); + } + } + +#endif + + if (param_size > rounded_size) { +#pragma omp parallel for + for (size_t k = rounded_size; k < param_size; k++) { dest[k] = src[k]; } + } +} + +static void helper_memcpy_4(float* dest, float* src, size_t param_size) +{ + size_t rounded_size = 0; + +#if defined(__AVX512__) or defined(__AVX256__) + + rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 2)); + + for (size_t t = 0; t < rounded_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > rounded_size) copy_size = rounded_size - t; + size_t offset = copy_size + t; +#pragma omp parallel for + for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) { + AVX_Data src_4[4]; + src_4[0].data = SIMD_LOAD(src + i); + src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH); + src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1)); + src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3); + + SIMD_STORE(dest + i, src_4[0].data); + SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data); + SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data); + SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data); + } + } +#endif + if (param_size > rounded_size) + helper_memcpy_1((dest + rounded_size), (src + rounded_size), (param_size - rounded_size)); +} + +static void helper_mempcy_8(float* dest, float* src, size_t param_size) +{ + size_t rounded_size = 0; + +#if defined(__AVX512__) or defined(__AVX256__) + + rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 2)); + + for (size_t t = 0; t < rounded_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > rounded_size) copy_size = rounded_size - t; + size_t offset = copy_size + t; +#pragma omp parallel for + for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) { + AVX_Data src_4[8]; + src_4[0].data = SIMD_LOAD(src + i); + src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH); + src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1)); + src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3); + src_4[4].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 2)); + src_4[5].data = SIMD_LOAD(src + i + SIMD_WIDTH * 5); + src_4[6].data = SIMD_LOAD(src + i + SIMD_WIDTH * 6); + src_4[7].data = SIMD_LOAD(src + i + SIMD_WIDTH * 7); + + SIMD_STORE(dest + i, src_4[0].data); + SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data); + SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data); + SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data); + SIMD_STORE(dest + i + (SIMD_WIDTH << 2), src_4[4].data); + SIMD_STORE(dest + i + SIMD_WIDTH * 5, src_4[5].data); + SIMD_STORE(dest + i + SIMD_WIDTH * 6, src_4[6].data); + SIMD_STORE(dest + i + SIMD_WIDTH * 7, src_4[7].data); + } + } +#endif + if (param_size > rounded_size) + helper_memcpy_4((dest + rounded_size), (src + rounded_size), (param_size - rounded_size)); +} + +int 
deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src) +{ + auto dest_c = dest.contiguous(); + auto src_c = src.contiguous(); + + float* dest_ptr = (float*)dest_c.data_ptr(); + float* src_ptr = (float*)src_c.data_ptr(); + + helper_mempcy_8(dest_ptr, src_ptr, dest_c.size(0)); + + return 0; +} diff --git a/csrc/aio/py_lib/deepspeed_py_copy.h b/csrc/aio/py_lib/deepspeed_py_copy.h new file mode 100644 index 000000000000..823c298f6431 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_copy.h @@ -0,0 +1,40 @@ + +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include + +#include +#include +#include + +#define TILE (1024 * 1024 * 1024) + +#if defined(__AVX512__) +#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) +#define SIMD_LOAD(x) _mm512_loadu_ps(x) +#define SIMD_SET(x) _mm512_set1_ps(x) +#define SIMD_MUL(x, y) _mm512_mul_ps(x, y) +#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c) +#define SIMD_SQRT(x) _mm512_sqrt_ps(x) +#define SIMD_DIV(x, y) _mm512_div_ps(x, y) +#define SIMD_WIDTH 16 +#else +#if defined(__AVX256__) +#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d) +#define SIMD_LOAD(x) _mm256_loadu_ps(x) +#define SIMD_SET(x) _mm256_set1_ps(x) +#define SIMD_MUL(x, y) _mm256_mul_ps(x, y) +#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c) +#define SIMD_SQRT(x) _mm256_sqrt_ps(x) +#define SIMD_DIV(x, y) _mm256_div_ps(x, y) +#define SIMD_WIDTH 8 +#endif +#endif + +int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src); diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp new file mode 100755 index 000000000000..eee2cba0a962 --- /dev/null +++ b/csrc/aio/py_lib/py_ds_aio.cpp @@ -0,0 +1,41 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include "deepspeed_py_aio_handle.h" +#include "deepspeed_py_copy.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchornous I/O Read"); + + m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchornous I/O Write"); + + m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy"); + + py::class_(m, "aio_handle") + .def(py::init()) + + .def("get_block_size", &deepspeed_aio_handle_t::get_block_size) + .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth) + .def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit) + .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events) + .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count) + + .def("read", &deepspeed_aio_handle_t::read) + .def("write", &deepspeed_aio_handle_t::write) + + .def("pread", &deepspeed_aio_handle_t::pread) + .def("pwrite", &deepspeed_aio_handle_t::pwrite) + + .def("sync_pread", &deepspeed_aio_handle_t::sync_pread) + .def("sync_pwrite", &deepspeed_aio_handle_t::sync_pwrite) + .def("async_pread", &deepspeed_aio_handle_t::async_pread) + .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite) + + .def("wait", &deepspeed_aio_handle_t::wait); +} diff --git a/csrc/aio/py_test/ds_aio_basic.py b/csrc/aio/py_test/ds_aio_basic.py new file mode 100755 index 000000000000..305cacc54e18 --- /dev/null +++ b/csrc/aio/py_test/ds_aio_basic.py @@ -0,0 +1,144 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. 
+ +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import torch +import os +import time +from deepspeed.ops.aio import aio_read, aio_write +from multiprocessing import Pool, Barrier +from test_ds_aio_utils import report_results, task_log, task_barrier + + +def pre_basic(args, tid, read_op): + io_string = "Read" if read_op else "Write" + num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size + file = args.read_file if read_op else f'{args.write_file}.{tid}' + + task_log(tid, f'Allocate tensor of size {num_bytes} bytes') + buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory() + task_log( + tid, + f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}' + ) + + ctxt = {} + ctxt['file'] = file + ctxt['num_bytes'] = num_bytes + ctxt['buffer'] = buffer + ctxt['elapsed_sec'] = 0 + + return ctxt + + +def pre_basic_read(pool_params): + args, tid = pool_params + ctxt = pre_basic(args, tid, True) + return ctxt + + +def pre_basic_write(pool_params): + args, tid = pool_params + ctxt = pre_basic(args, tid, False) + return ctxt + + +def post_basic(pool_params): + _, _, ctxt = pool_params + ctxt["buffer"].detach() + ctxt["buffer"] = None + return ctxt + + +def main_basic_read(pool_params): + args, tid, ctxt = pool_params + start_time = time.time() + aio_read(ctxt['buffer'], + ctxt['file'], + args.block_size, + args.queue_depth, + args.single_submit, + args.overlap_events, + args.validate) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_basic_write(pool_params): + args, tid, ctxt = pool_params + start_time = time.time() + aio_write(ctxt['buffer'], + ctxt['file'], + args.block_size, + args.queue_depth, + args.single_submit, + args.overlap_events, + args.validate) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def get_schedule(args, read_op): + schedule = {} + if read_op: + schedule['pre'] = pre_basic_read + schedule['post'] = post_basic + schedule['main'] = main_basic_read + else: + schedule['pre'] = pre_basic_write + schedule['post'] = post_basic + schedule['main'] = main_basic_write + + return schedule + + +def _aio_handle_tasklet(pool_params): + args, tid, read_op = pool_params + + # Create schedule + schedule = get_schedule(args, read_op) + task_log(tid, f'schedule = {schedule}') + task_barrier(aio_barrier, args.threads) + + # Run pre task + task_log(tid, f'running pre-task') + ctxt = schedule["pre"]((args, tid)) + task_barrier(aio_barrier, args.threads) + + # Run main tasks in a loop + ctxt["main_task_sec"] = 0 + for i in range(args.loops): + task_log(tid, f'running main task {i}') + start_time = time.time() + ctxt = schedule["main"]((args, tid, ctxt)) + task_barrier(aio_barrier, args.threads) + stop_time = time.time() + ctxt["main_task_sec"] += stop_time - start_time + + # Run post task + task_log(tid, f'running post-task') + ctxt = schedule["post"]((args, tid, ctxt)) + task_barrier(aio_barrier, args.threads) + + return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops + + +def _init_takslet(b): + global aio_barrier + aio_barrier = b + + +def aio_basic_multiprocessing(args, read_op): + b = Barrier(args.threads) + pool_params = [(args, p, read_op) for p in range(args.threads)] + with Pool(processes=args.threads, initializer=_init_takslet, initargs=(b, )) as p: + pool_results = p.map(_aio_handle_tasklet, pool_params) + + report_results(args, read_op, pool_results) 
diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py new file mode 100755 index 000000000000..fe1459c89013 --- /dev/null +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -0,0 +1,176 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import torch +import os +import time +from deepspeed.ops.aio import aio_handle +from multiprocessing import Pool, Barrier +from test_ds_aio_utils import report_results, task_log, task_barrier + + +def pre_handle(args, tid, read_op): + io_string = "Read" if read_op else "Write" + num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size + file = args.read_file if read_op else f'{args.write_file}.{tid}' + + task_log(tid, f'Allocate tensor of size {num_bytes} bytes') + if args.gpu: + buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda') + else: + buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory() + task_log( + tid, + f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}' + ) + + io_parallel = args.io_parallel if args.io_parallel else 1 + handle = aio_handle(args.block_size, + args.queue_depth, + args.single_submit, + args.overlap_events, + io_parallel) + task_log(tid, f'created deepspeed aio handle') + + ctxt = {} + ctxt['file'] = file + ctxt['num_bytes'] = num_bytes + ctxt['handle'] = handle + ctxt['buffer'] = buffer + ctxt['elapsed_sec'] = 0 + + return ctxt + + +def pre_handle_read(pool_params): + args, tid = pool_params + ctxt = pre_handle(args, tid, True) + return ctxt + + +def pre_handle_write(pool_params): + args, tid = pool_params + ctxt = pre_handle(args, tid, False) + return ctxt + + +def post_handle(pool_params): + _, _, ctxt = pool_params + ctxt["buffer"].detach() + ctxt["buffer"] = None + return ctxt + + +def main_parallel_read(pool_params): + args, tid, ctxt = pool_params + handle = ctxt['handle'] + + start_time = time.time() + ret = handle.pread(ctxt['buffer'], ctxt['file'], args.validate, True) + assert ret != -1 + handle.wait() + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_parallel_write(pool_params): + args, tid, ctxt = pool_params + handle = ctxt['handle'] + start_time = time.time() + ret = handle.pwrite(ctxt['buffer'], ctxt['file'], args.validate, True) + assert ret != -1 + handle.wait() + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_handle_read(pool_parms): + args, tid, ctxt = pool_parms + handle = ctxt['handle'] + + start_time = time.time() + ret = handle.read(ctxt['buffer'], ctxt['file'], args.validate) + assert ret != -1 + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_handle_write(pool_parms): + args, tid, ctxt = pool_parms + handle = ctxt['handle'] + start_time = time.time() + ret = handle.write(ctxt['buffer'], ctxt['file'], args.validate) + assert ret != -1 + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def get_schedule(args, read_op): + schedule = {} + if read_op: + schedule['pre'] = pre_handle_read + schedule['post'] = post_handle + schedule['main'] = main_parallel_read if args.io_parallel else main_handle_read + else: + schedule['pre'] = pre_handle_write + schedule['post'] = post_handle + schedule['main'] = main_parallel_write if args.io_parallel else main_handle_write + + return schedule + + 
+def _aio_handle_tasklet(pool_params): + args, tid, read_op = pool_params + + # Create schedule + schedule = get_schedule(args, read_op) + task_log(tid, f'schedule = {schedule}') + task_barrier(aio_barrier, args.threads) + + # Run pre task + task_log(tid, f'running pre-task') + ctxt = schedule["pre"]((args, tid)) + task_barrier(aio_barrier, args.threads) + + # Run main tasks in a loop + ctxt["main_task_sec"] = 0 + for i in range(args.loops): + task_log(tid, f'running main task {i}') + start_time = time.time() + ctxt = schedule["main"]((args, tid, ctxt)) + task_barrier(aio_barrier, args.threads) + stop_time = time.time() + ctxt["main_task_sec"] += stop_time - start_time + + # Run post task + task_log(tid, f'running post-task') + ctxt = schedule["post"]((args, tid, ctxt)) + task_barrier(aio_barrier, args.threads) + + return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops + + +def _init_takslet(b): + global aio_barrier + aio_barrier = b + + +def aio_handle_multiprocessing(args, read_op): + b = Barrier(args.threads) + pool_params = [(args, p, read_op) for p in range(args.threads)] + with Pool(processes=args.threads, initializer=_init_takslet, initargs=(b, )) as p: + pool_results = p.map(_aio_handle_tasklet, pool_params) + + report_results(args, read_op, pool_results) diff --git a/csrc/aio/py_test/parse_aio_stats.py b/csrc/aio/py_test/parse_aio_stats.py new file mode 100755 index 000000000000..82adf85ea8a2 --- /dev/null +++ b/csrc/aio/py_test/parse_aio_stats.py @@ -0,0 +1,169 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import os +import argparse +import re + +RAW_RATE = 'raw_rate' +E2E_RATE = 'e2e_rate' +SUBMIT_LATENCY = 'submit_latency' +COMPLETE_LATENCY = 'complete_latency' +READ_SPEED = 'read_speed' +WRITE_SPEED = 'write_speed' + +TASK_READ_SPEED = 'task_read_speed' + +PERF_METRICS = [ + RAW_RATE, + E2E_RATE, + SUBMIT_LATENCY, + COMPLETE_LATENCY, + READ_SPEED, + WRITE_SPEED +] +METRIC_SEARCH = { + RAW_RATE: 'ds_raw_time', + E2E_RATE: 'ds_time', + SUBMIT_LATENCY: 'aggr: submit', + COMPLETE_LATENCY: 'aggr: complete', + READ_SPEED: 'E2E Read Speed', + WRITE_SPEED: 'E2E Write Speed' +} + +NUM_BYTES = (400 * 1024 * 1024) +NUM_GIGA_BYTES = (1024 * 1024 * 1024) + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--logdir', + type=str, + required=True, + help='Folder of statistics logs') + + parser.add_argument( + '--metric', + type=str, + required=True, + help= + 'Performance metric to report: [raw_rate|e2e_rate|submit_latency|complete_latency]' + ) + + args = parser.parse_args() + print(f'args = {args}') + + return args + + +def extract_value(key, file): + INVALID_PREFIXES = ["ds"] + for p in INVALID_PREFIXES: + if key.startswith(p): + return key + try: + if key[0] in ['t', 'd', 'p']: + return int(key[1:]) + if key.startswith("bs"): + if key.endswith('K'): + v = key[2:].split('K') + return int(v[0]) * 1024 + elif key.endswith('M'): + v = key[2:].split('M') + return int(v[0]) * 1024 * 1024 + else: + return int(key[2:]) + except: + print(f"{file}: extract_value fails on {key}") + return None + + return key + + +def get_file_key(file): + f, _ = os.path.splitext(os.path.basename(file)) + fields = f.split('_') + values = [extract_value(k, file) for k in fields] + return tuple(values) + + +def get_thread_count(file): + f, _ = os.path.splitext(file) + fields = f.split('_') + for key in fields: + if key[0] == 't': 
+            return int(key[1:])
+    return 1
+
+
+def get_metric(file, metric):
+    thread_count = get_thread_count(file)
+    num_giga_bytes = NUM_BYTES / NUM_GIGA_BYTES
+    with open(file) as f:
+        for line in f.readlines():
+            if line.startswith(METRIC_SEARCH[metric]):
+                if metric == RAW_RATE:
+                    fields = line.split()
+                    raw_time_sec = float(fields[2]) / 1e06
+                    raw_rate = (thread_count * num_giga_bytes * 1.0) / raw_time_sec
+                    return raw_rate
+                elif metric in [READ_SPEED, WRITE_SPEED]:
+                    fields = line.split()
+                    return float(fields[-2])
+                else:
+                    fields = line.split('=')
+                    return float(fields[-1])
+
+    return None
+
+
+def validate_args(args):
+    if not args.metric in PERF_METRICS:
+        print(f'{args.metric} is not a valid performance metric')
+        return False
+
+    if not os.path.isdir(args.logdir):
+        print(f'{args.logdir} folder does not exist')
+        return False
+
+    return True
+
+
+def get_results(log_files, metric):
+    results = {}
+    for f in log_files:
+        file_key = get_file_key(f)
+        value = get_metric(f, metric)
+        results[file_key] = value
+
+    return results
+
+
+def main():
+    print("Parsing aio statistics")
+    args = parse_arguments()
+
+    if not validate_args(args):
+        quit()
+
+    log_files = [
+        f for f in os.listdir(args.logdir)
+        if os.path.isfile(os.path.join(args.logdir,
+                                       f))
+    ]
+
+    log_files_path = [os.path.join(args.logdir, f) for f in log_files]
+    results = get_results(log_files_path, args.metric)
+    result_keys = list(results.keys())
+    sorted_keys = sorted(result_keys)
+    for k in sorted_keys:
+        print(f'{k} = {results[k]}')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh
new file mode 100755
index 000000000000..8ef29d973d58
--- /dev/null
+++ b/csrc/aio/py_test/run_read_sweep.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+if [[ $# -ne 2 ]]; then
+    echo "Usage: $0 <input file> <log dir>"
+    exit 1
+fi
+
+INPUT_FILE=$1
+if [[ ! -f ${INPUT_FILE} ]]; then
+    echo "Input file not found: ${INPUT_FILE}"
+    exit 1
+fi
+
+LOG_DIR=$2
+RUN_SCRIPT=./test_ds_aio.py
+READ_OPT="--read_file ${INPUT_FILE}"
+
+if [[ -d ${LOG_DIR} ]]; then
+    rm -f ${LOG_DIR}/*
+else
+    mkdir -p ${LOG_DIR}
+fi
+
+DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
+SYNC="sync"
+
+for sub in single block; do
+    if [[ $sub == "single" ]]; then
+        sub_opt="--single_submit"
+    else
+        sub_opt=""
+    fi
+    for ov in overlap sequential; do
+        if [[ $ov == "overlap" ]]; then
+            ov_opt="--overlap_events"
+        else
+            ov_opt=""
+        fi
+        for t in 1 2 4 8; do
+            for p in 1; do
+                for d in 1 2 4 8 16 32; do
+                    for bs in 128K 256K 512K 1M; do
+                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}"
+                        OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}"
+                        LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
+                        cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
+                        echo ${DISABLE_CACHE}
+                        echo ${cmd}
+                        echo ${SYNC}
+
+                        eval ${DISABLE_CACHE}
+                        eval ${cmd}
+                        eval ${SYNC}
+                        sleep 2
+                    done
+                done
+            done
+        done
+    done
+done
diff --git a/csrc/aio/py_test/run_write_sweep.sh b/csrc/aio/py_test/run_write_sweep.sh
new file mode 100755
index 000000000000..0360934b469e
--- /dev/null
+++ b/csrc/aio/py_test/run_write_sweep.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+function prep_folder()
+{
+    folder=$1
+    if [[ -d ${folder} ]]; then
+        rm -f ${folder}/*
+    else
+        mkdir -p ${folder}
+    fi
+}
+
+if [[ $# -ne 3 ]]; then
+    echo "Usage: $0 <write size in MB> <write dir> <log dir>"
+    exit 1
+fi
+
+SIZE="$1M"
+WRITE_DIR=$2
+LOG_DIR=$3
+
+OUTPUT_FILE=${WRITE_DIR}/ds_aio_write_${SIZE}B.pt
+WRITE_OPT="--write_file ${OUTPUT_FILE} --write_size ${SIZE}"
+
+
+prep_folder ${WRITE_DIR}
+prep_folder ${LOG_DIR}
+
+RUN_SCRIPT=./test_ds_aio.py
+
+for sub in single block; do
+    if [[ $sub == "single" ]]; then
+        sub_opt="--single_submit"
+    else
+        sub_opt=""
+    fi
+    for ov in overlap sequential; do
+        if [[ $ov == "overlap" ]]; then
+            ov_opt="--overlap_events"
+        else
+            ov_opt=""
+        fi
+        for t in 1 2 4 8; do
+            for p in 1; do
+                for d in 1 2 4 8 16 32; do
+                    for bs in 128K 256K 512K 1M; do
+                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}"
+                        OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}"
+                        LOG="${LOG_DIR}/write_${SIZE}B_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
+                        cmd="python ${RUN_SCRIPT} ${WRITE_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
+                        echo ${cmd}
+                        eval ${cmd}
+                        sleep 2
+                    done
+                done
+            done
+        done
+    done
+done
diff --git a/csrc/aio/py_test/test_ds_aio.py b/csrc/aio/py_test/test_ds_aio.py
new file mode 100755
index 000000000000..607a59f58e08
--- /dev/null
+++ b/csrc/aio/py_test/test_ds_aio.py
@@ -0,0 +1,120 @@
+"""
+Copyright 2020 The Microsoft DeepSpeed Team
+Licensed under the MIT license.
+
+Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
+""" + +import os +import torch +import argparse +import time +import sys +from multiprocessing import Pool +import multiprocessing as mp +from deepspeed.ops.aio import aio_read, aio_write, aio_handle +from ds_aio_basic import aio_basic_multiprocessing +from ds_aio_handle import aio_handle_multiprocessing + +GB_DIVISOR = 1024**3 + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--read_file', type=str, default=None, help='Read file.') + + parser.add_argument('--write_file', type=str, default=None, help='Write file.') + + parser.add_argument('--write_size', + type=str, + default=None, + help='Number of bytes to write.') + + parser.add_argument('--block_size', type=str, default='1M', help='I/O block size.') + + parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.') + + parser.add_argument('--threads', + type=int, + default=1, + help='Thread parallelism count.') + + parser.add_argument( + '--single_submit', + action='store_true', + help= + 'Submit I/O requests in singles (default is submit queue_depth amount at once.).' + ) + + parser.add_argument('--overlap_events', + action='store_true', + help='Overlap I/O submission and completion requests.') + + parser.add_argument('--validate', + action='store_true', + help='Perform validation in library.') + + parser.add_argument('--handle', action='store_true', help='Use AIO handle.') + + parser.add_argument('--loops', + type=int, + default=1, + help='Count of operation repetitions') + + parser.add_argument('--io_parallel', + type=int, + default=None, + help='Per iop parallelism') + + parser.add_argument('--gpu', action='store_true', help='Use GPU memory') + + args = parser.parse_args() + print(f'args = {args}') + return args + + +def refine_integer_value(value): + unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3} + + if value[-1] in list(unit_dict.keys()): + int_value = int(value[:-1]) * unit_dict[value[-1]] + return int_value + return int(value) + + +def refine_args(args): + if args.write_size and type(args.write_size) == str: + args.write_size = refine_integer_value(args.write_size) + + if args.block_size and type(args.block_size) == str: + args.block_size = refine_integer_value(args.block_size) + + +def validate_args(args): + if args.read_file and not os.path.isfile(args.read_file): + print(f'args validation error: {args.read_file} not found') + return False + + return True + + +def main(): + print(f'Testing deepspeed_aio python frontend') + + args = parse_arguments() + refine_args(args) + if not validate_args(args): + quit() + + mp.set_start_method('spawn') + multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing + if args.read_file: + multiprocess_function(args, True) + + if args.write_file: + multiprocess_function(args, False) + + +if __name__ == "__main__": + main() diff --git a/csrc/aio/py_test/test_ds_aio_utils.py b/csrc/aio/py_test/test_ds_aio_utils.py new file mode 100755 index 000000000000..689a0a503247 --- /dev/null +++ b/csrc/aio/py_test/test_ds_aio_utils.py @@ -0,0 +1,42 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
+""" + +import os + +GIGABYTE = 1024**3 +LOG_TIDS = [0] + + +def task_log(tid, msg): + if tid in LOG_TIDS: + print(f'tid {tid}: {msg}') + + +def task_barrier(barrier, num_parties): + assert barrier.parties == num_parties + barrier.wait() + assert barrier.broken == False + + +def report_results(args, read_op, pool_results): + #print(f'pool_results = {pool_results}') + io_string = 'Read' if read_op else 'Write' + if None in pool_results: + print(f'Failure in one of {args.threads} {io_string} processes') + return + + total_bytes = sum([num_bytes for _, _, num_bytes in pool_results]) + + task_latency_sec = max([sec for _, sec, _ in pool_results]) + task_speed_GB = total_bytes / task_latency_sec / GIGABYTE + print(f'Task {io_string} Latency = {task_latency_sec} sec') + print(f'Task {io_string} Speed = {task_speed_GB} GB/sec') + + e2e_latency_sec = max([sec for sec, _, _ in pool_results]) + e2e_speed_GB = total_bytes / e2e_latency_sec / GIGABYTE + print(f'E2E {io_string} Latency = {e2e_latency_sec} sec') + print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec') diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h index 5fae35261f55..ed33b04b3e10 100755 --- a/csrc/includes/cpu_adam.h +++ b/csrc/includes/cpu_adam.h @@ -20,7 +20,7 @@ } \ } -#define TILE (1024 * 1024 * 1024) +#define TILE (128 * 1024 * 1024) #if defined(__AVX512__) #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index a4a49dca6bf3..8b7c52e68d7e 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -304,7 +304,7 @@ def main(args=None): # encode world info as base64 to make it easier to pass via command line world_info_base64 = encode_world_info(active_resources) - multi_node_exec = len(active_resources) > 1 + multi_node_exec = True # len(active_resources) > 1 if not multi_node_exec: deepspeed_launch = [ diff --git a/deepspeed/ops/aio/__init__.py b/deepspeed/ops/aio/__init__.py new file mode 100755 index 000000000000..50e6c9a3c988 --- /dev/null +++ b/deepspeed/ops/aio/__init__.py @@ -0,0 +1,6 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. 
+''' + +from ..op_builder import AsyncIOBuilder diff --git a/deepspeed/profiling/config.py b/deepspeed/profiling/config.py index 807802670654..b230eaef859b 100644 --- a/deepspeed/profiling/config.py +++ b/deepspeed/profiling/config.py @@ -9,9 +9,6 @@ class DeepSpeedFlopsProfilerConfig(DeepSpeedConfigObject): def __init__(self, param_dict): - """ - docstring - """ super(DeepSpeedFlopsProfilerConfig, self).__init__() self.enabled = None @@ -27,9 +24,6 @@ def __init__(self, param_dict): self._initialize(flops_profiler_dict) def _initialize(self, flops_profiler_dict): - """ - docstring - """ self.enabled = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_ENABLED, FLOPS_PROFILER_ENABLED_DEFAULT) diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py index 8a9785a9aedb..faa60f20efa3 100644 --- a/deepspeed/runtime/activation_checkpointing/checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -24,7 +24,7 @@ from deepspeed.runtime.config import DeepSpeedConfig from deepspeed.utils import logger -from deepspeed.runtime.utils import move_to_device +from deepspeed.runtime.utils import move_to_device, see_memory_usage from deepspeed.utils.timer import SynchronizedWallClockTimer as Timers # DeepSpeed Checkpointing Enabled or Disabled @@ -55,34 +55,6 @@ SYNCHRONIZE = False PROFILE_TIME = False - -def see_memory_usage(message, force=False): - # return - if not force: - return - # dist.barrier() - if dist.get_rank() == 0: - logger.info(message) - logger.info( - "Memory Allocated %s GigaBytes", - torch.cuda.memory_allocated() / (1024 * 1024 * 1024), - ) - logger.info( - "Max Memory Allocated %s GigaBytes", - torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024), - ) - logger.info( - "Cache Allocated %s GigaBytes", - torch.cuda.memory_cached() / (1024 * 1024 * 1024), - ) - logger.info( - "Max cache Allocated %s GigaBytes", - torch.cuda.max_memory_cached() / (1024 * 1024 * 1024), - ) - logger.info("") - #input("Press Any Key To Continue ..") - - # Default name for the model parallel rng tracker. 
_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' transport_stream = None @@ -685,8 +657,7 @@ def backward(ctx, *grads): torch.autograd.backward(output_tensors, grad_tensors) - see_memory_usage("After backward checkpointing code before backward", - force=False) + see_memory_usage("After backward checkpointing code after backward", force=False) if PROFILE_TIME: timers('backward').stop() diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index b9ba6cf37cba..9e33876994f9 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -24,6 +24,8 @@ from ..profiling.config import DeepSpeedFlopsProfilerConfig +from .swap_tensor.aio_config import get_aio_config + TENSOR_CORE_ALIGN_SIZE = 8 ADAM_OPTIMIZER = 'adam' @@ -656,6 +658,8 @@ def _initialize_params(self, param_dict): self.checkpoint_tag_validation_enabled = validation_mode != ValidationMode.IGNORE self.checkpoint_tag_validation_fail = validation_mode == ValidationMode.FAIL + self.aio_config = get_aio_config(param_dict) + def _batch_assertion(self): train_batch = self.train_batch_size diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py index 12711d56f7f6..7a6f994e806a 100755 --- a/deepspeed/runtime/config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -6,7 +6,7 @@ Collection of DeepSpeed configuration utilities """ import json -from collections import Counter, Mapping, Sequence +import collections # adapted from https://stackoverflow.com/a/50701137/9201239 @@ -31,13 +31,13 @@ def iterencode(self, o, _one_shot=False, level=0): return f"{o:e}" else: return f"{o}" - elif isinstance(o, Mapping): + elif isinstance(o, collections.Mapping): x = [ f'\n{prefix}"{k}": {self.iterencode(v, level=level)}' for k, v in o.items() ] return "{" + ', '.join(x) + f"\n{prefix_close}" + "}" - elif isinstance(o, Sequence) and not isinstance(o, str): + elif isinstance(o, collections.Sequence) and not isinstance(o, str): return f"[{ f', '.join(map(self.iterencode, o)) }]" return "\n, ".join(super().iterencode(o, _one_shot)) @@ -70,7 +70,7 @@ def dict_raise_error_on_duplicate_keys(ordered_pairs): """Reject duplicate keys.""" d = dict((k, v) for k, v in ordered_pairs) if len(d) != len(ordered_pairs): - counter = Counter([pair[0] for pair in ordered_pairs]) + counter = collections.Counter([pair[0] for pair in ordered_pairs]) keys = [key for key, value in counter.items() if value > 1] raise ValueError("Duplicate keys in DeepSpeed config: {}".format(keys)) return d diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 5081adb0b021..f71a7324585a 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -340,14 +340,14 @@ def zero_reduce_scatter(self): def zero_overlap_comm(self): return self._config.zero_config.overlap_comm - def zero_cpu_offload(self): - return self._config.zero_config.cpu_offload + def zero_offload_optimizer(self): + return self._config.zero_config.offload_optimizer - def zero_cpu_offload_params(self): - return self._config.zero_config.cpu_offload_params + def zero_offload_param(self): + return self._config.zero_config.offload_param - def zero_cpu_offload_use_pin_memory(self): - return self._config.zero_config.cpu_offload_use_pin_memory + def zero_cpu_offload(self): + return self._config.zero_config.offload_optimizer is not None def zero_sub_group_size(self): return self._config.zero_config.sub_group_size @@ -436,6 +436,12 @@ def initial_dynamic_scale(self): def dynamic_loss_scale_args(self): return 
self._config.dynamic_loss_scale_args + def swap_tensor_config(self): + return self._config.swap_tensor_config + + def aio_config(self): + return self._config.aio_config + def _configure_lr_scheduler(self, client_lr_scheduler): # First check for scheduler in json configuration lr_scheduler = self._scheduler_from_config(self.optimizer) @@ -594,8 +600,9 @@ def _configure_optimizer(self, client_optimizer, model_parameters): client_optimizer.param_groups[:] = [ pg for pg in client_optimizer.param_groups if len(pg["params"]) != 0 ] - logger.info( - "Removing param_group that has no 'params'in the client Optimizer") + if self.global_rank == 0: + logger.info( + "Removing param_group that has no 'params' in the client Optimizer") basic_optimizer = client_optimizer if self.global_rank == 0: @@ -793,14 +800,14 @@ def _configure_zero_optimizer(self, optimizer): dp_process_group=self.data_parallel_group, reduce_scatter=self.zero_reduce_scatter(), overlap_comm=self.zero_overlap_comm(), - cpu_offload_optimizer_state=self.zero_cpu_offload(), - cpu_offload_params=self.zero_cpu_offload_params(), - cpu_offload_use_pin_memory=self.zero_cpu_offload_use_pin_memory(), + offload_optimizer_config=self.zero_offload_optimizer(), + offload_param_config=self.zero_offload_param(), sub_group_size=self.zero_sub_group_size(), mpu=self.mpu, postscale_gradients=self.postscale_gradients(), gradient_predivide_factor=self.gradient_predivide_factor(), - gradient_accumulation_steps=self.gradient_accumulation_steps()) + gradient_accumulation_steps=self.gradient_accumulation_steps(), + aio_config=self.aio_config()) else: raise NotImplementedError("ZeRO stage {} not implemented".format(zero_stage)) @@ -905,6 +912,13 @@ def forward(self, *inputs, **kwargs): if self.module.training and self.progressive_layer_drop: kwargs.update(self.progressive_layer_drop.get_state()) + if self.zero_optimization_partition_weights(): + # Enable automated discovery of external parameters by indicating that + # we are in a forward pass. + for module in self.module.modules(): + module._parameters._in_forward = True + pass + if self.wall_clock_breakdown(): self.timers('forward_microstep').start() self.timers('forward').start() @@ -913,11 +927,15 @@ def forward(self, *inputs, **kwargs): self.tput_timer.start() loss = self.module(*inputs, **kwargs) - # Reset the ZeRO-3 state if we are only doing forward-passes (ie evaluation). if self.zero_optimization_partition_weights(): + # Reset the ZeRO-3 state if we are only doing forward-passes (ie evaluation). 
if not torch._C.is_grad_enabled(): self.optimizer.param_coordinator.reset_step() + # Disable automated discovery of external parameters + for module in self.module.modules(): + module._parameters._in_forward = False + if self.wall_clock_breakdown(): self.timers('forward').stop() self.timers('forward_microstep').stop() @@ -1085,8 +1103,9 @@ def _take_model_step(self, lr_kwargs): else: if self.lr_scheduler is not None: self.lr_scheduler.step(**(lr_kwargs or {})) - if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: - self._report_progress(self.global_steps + 1) + + if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: + self._report_progress(self.global_steps + 1) self.global_steps += 1 self.global_samples += self.train_batch_size() diff --git a/deepspeed/runtime/swap_tensor/__init__.py b/deepspeed/runtime/swap_tensor/__init__.py new file mode 100644 index 000000000000..8c18558e9423 --- /dev/null +++ b/deepspeed/runtime/swap_tensor/__init__.py @@ -0,0 +1,4 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. +''' diff --git a/deepspeed/runtime/swap_tensor/aio_config.py b/deepspeed/runtime/swap_tensor/aio_config.py new file mode 100644 index 000000000000..6a7014c1863c --- /dev/null +++ b/deepspeed/runtime/swap_tensor/aio_config.py @@ -0,0 +1,44 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. +''' + +from deepspeed.runtime.config_utils import get_scalar_param +from deepspeed.runtime.swap_tensor.constants import * + +AIO_DEFAULT_DICT = { + AIO_BLOCK_SIZE: AIO_BLOCK_SIZE_DEFAULT, + AIO_QUEUE_DEPTH: AIO_QUEUE_DEPTH_DEFAULT, + AIO_THREAD_COUNT: AIO_THREAD_COUNT_DEFAULT, + AIO_SINGLE_SUBMIT: AIO_SINGLE_SUBMIT_DEFAULT, + AIO_OVERLAP_EVENTS: AIO_OVERLAP_EVENTS_DEFAULT +} + + +def get_aio_config(param_dict): + if AIO in param_dict.keys() and param_dict[AIO] is not None: + aio_dict = param_dict[AIO] + return { + AIO_BLOCK_SIZE: + get_scalar_param(aio_dict, + AIO_BLOCK_SIZE, + AIO_BLOCK_SIZE_DEFAULT), + AIO_QUEUE_DEPTH: + get_scalar_param(aio_dict, + AIO_QUEUE_DEPTH, + AIO_QUEUE_DEPTH_DEFAULT), + AIO_THREAD_COUNT: + get_scalar_param(aio_dict, + AIO_THREAD_COUNT, + AIO_THREAD_COUNT_DEFAULT), + AIO_SINGLE_SUBMIT: + get_scalar_param(aio_dict, + AIO_SINGLE_SUBMIT, + AIO_SINGLE_SUBMIT_DEFAULT), + AIO_OVERLAP_EVENTS: + get_scalar_param(aio_dict, + AIO_OVERLAP_EVENTS, + AIO_OVERLAP_EVENTS_DEFAULT) + } + + return AIO_DEFAULT_DICT diff --git a/deepspeed/runtime/swap_tensor/async_swapper.py b/deepspeed/runtime/swap_tensor/async_swapper.py new file mode 100644 index 000000000000..e6e19a4c67ef --- /dev/null +++ b/deepspeed/runtime/swap_tensor/async_swapper.py @@ -0,0 +1,173 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. + +Functionality of swapping tensors to/from (NVMe) storage devices. 
+""" +import torch + +from deepspeed.utils.logging import logger +from deepspeed.runtime.swap_tensor.utils import swap_out_tensors, SwapBuffer + +INVALID_BUFFER_INDEX = -1 +ASYNC_SWAPPER_WAIT_TIMER = 'async_swap_gradient_wait' + + +class AsyncTensorSwapper(object): + def __init__(self, aio_handle, numel_alignment, timers): + self.free_buffer_index = [] + self.swapping_buffer_index = [] + self.ready_buffer_index = [] + self.current_buffer_index = INVALID_BUFFER_INDEX + self.all_buffers = [] + self.aio_handle = aio_handle + self.numel_alignment = numel_alignment + self.max_numel = 0 + self.num_pending_swaps = 0 + self.timers = timers + self.timer_names = set() + self.num_elements_swapped = 0 + self.dtype = None + + def has_buffers(self): + return len(self.all_buffers) > 0 + + def add_buffers(self, buffer_list): + assert len(self.all_buffers) == 0 + assert all([buffer.is_pinned() for buffer in buffer_list]) + dtype = buffer_list[0].dtype + assert all([buffer.dtype == dtype for buffer in buffer_list]) + + self.dtype = dtype + self.all_buffers = [SwapBuffer(buffer) for buffer in buffer_list] + self.free_buffer_index += [i for i in range(len(self.all_buffers))] + self.max_numel = max([buffer.numel() for buffer in buffer_list]) + self.timer_names = set() + + def get_timer_names(self): + return list(self.timer_names) + + def release_buffers(self): + self._report_statistics('Swapped out[Before flush]') + self._flush_buffers_until_complete() + self._report_statistics('Swapped out[After flush]') + + pinned_buffers = [buf.buffer for buf in self.all_buffers] + self.all_buffers = [] + self.free_buffer_index = [] + self.current_buffer_index = INVALID_BUFFER_INDEX + self.num_elements_swapped = 0 + self.dtype = None + + return pinned_buffers + + def swap_out_tensors(self, tensor_list, path_list): + for tensor, swap_path in zip(tensor_list, path_list): + self._swap_out_tensor(tensor, swap_path) + + def _report_statistics(self, message): + if torch.distributed.get_rank() == 0: + element_size = torch.tensor([], dtype=self.dtype).element_size() + swapped_GB = (self.num_elements_swapped * element_size) / (1024**3) + logger.info( + f'{message} num_elems = {self.num_elements_swapped}, {swapped_GB:5.2f} GB' + ) + + def _swap_out_tensor(self, tensor, swap_path): + assert len(self.all_buffers) > 0 + + aligned_numel = self._io_aligned_numel(tensor.numel()) + assert aligned_numel <= self.max_numel + + self._make_swap_space(aligned_numel) + assert self.current_buffer_index != INVALID_BUFFER_INDEX + + swap_buffer = self._get_current_buffer() + swap_buffer.insert_tensor(tensor, swap_path, aligned_numel) + + def _make_swap_space(self, numel): + if self.current_buffer_index == INVALID_BUFFER_INDEX: + self._allocate_buffer() + return + + if not self._get_current_buffer().has_space(numel): + if len(self.free_buffer_index) > 0: + self._flush_ready_buffers() + else: + self._flush_buffers_until_complete() + self._allocate_buffer() + + def _io_aligned_numel(self, numel): + remainder = numel % self.numel_alignment + return numel if remainder == 0 else (numel + self.numel_alignment - remainder) + + def _allocate_buffer(self): + assert self.current_buffer_index == INVALID_BUFFER_INDEX + assert len(self.all_buffers) > 0 + assert len(self.free_buffer_index) > 0 + self.current_buffer_index = self.free_buffer_index[-1] + self.free_buffer_index = self.free_buffer_index[:-1] + + def _flush_ready_buffers(self): + if self.current_buffer_index != INVALID_BUFFER_INDEX: + self.ready_buffer_index.append(self.current_buffer_index) + 
self.current_buffer_index = INVALID_BUFFER_INDEX + + self._swap_out_ready_buffers() + + def _flush_buffers_until_complete(self): + self._flush_ready_buffers() + assert len(self.ready_buffer_index) == 0 + + self._wait_for_swap_complete() + assert len(self.swapping_buffer_index) == 0 + assert len(self.free_buffer_index) == len(self.all_buffers) + + def _swap_out_ready_buffers(self): + for buffer_index in self.ready_buffer_index: + buffer = self._get_buffer(buffer_index) + swap_tensors = buffer.get_swap_tensors() + swap_paths = buffer.get_swap_paths() + self.num_pending_swaps += len(swap_tensors) + swap_out_tensors(self.aio_handle, swap_tensors, swap_paths) + + self.swapping_buffer_index += self.ready_buffer_index + self.ready_buffer_index = [] + + def _wait_for_swap_complete(self): + assert len(self.swapping_buffer_index) > 0 + + self._start_timer(ASYNC_SWAPPER_WAIT_TIMER) + assert self.aio_handle.wait() == self.num_pending_swaps + self._stop_timer(ASYNC_SWAPPER_WAIT_TIMER) + self.timer_names.add(ASYNC_SWAPPER_WAIT_TIMER) + + self.num_pending_swaps = 0 + + for buffer_index in self.swapping_buffer_index: + buffer = self._get_buffer(buffer_index) + self.num_elements_swapped += buffer.get_num_elem() + buffer.reset() + + self.free_buffer_index += self.swapping_buffer_index + assert len(self.free_buffer_index) <= len(self.all_buffers) + self.swapping_buffer_index = [] + + def _get_buffer(self, index): + assert index != INVALID_BUFFER_INDEX + return self.all_buffers[index] + + def _get_current_buffer(self): + return self._get_buffer(self.current_buffer_index) + + def _start_timer(self, name): + if self.timers: + self.timers(name).start() + + def _stop_timer(self, name): + if self.timers: + self.timers(name).stop() + + def _log_timers(self, name_list, force=False): + if self.timers and force: + self.timers.log(name_list) diff --git a/deepspeed/runtime/swap_tensor/constants.py b/deepspeed/runtime/swap_tensor/constants.py new file mode 100644 index 000000000000..752ec8dcaacf --- /dev/null +++ b/deepspeed/runtime/swap_tensor/constants.py @@ -0,0 +1,27 @@ +""" +"Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. +""" +######################################### +# AIO +######################################### +AIO_FORMAT = ''' +"aio": { + "block_size": 1048576, + "queue_depth": 8, + "thread_count": 1, + "single_submit": false, + "overlap_events": true +} +''' +AIO = "aio" +AIO_BLOCK_SIZE = "block_size" +AIO_BLOCK_SIZE_DEFAULT = 1048576 +AIO_QUEUE_DEPTH = "queue_depth" +AIO_QUEUE_DEPTH_DEFAULT = 8 +AIO_THREAD_COUNT = "thread_count" +AIO_THREAD_COUNT_DEFAULT = 1 +AIO_SINGLE_SUBMIT = "single_submit" +AIO_SINGLE_SUBMIT_DEFAULT = False +AIO_OVERLAP_EVENTS = "overlap_events" +AIO_OVERLAP_EVENTS_DEFAULT = True diff --git a/deepspeed/runtime/swap_tensor/optimizer_utils.py b/deepspeed/runtime/swap_tensor/optimizer_utils.py new file mode 100644 index 000000000000..5785ff7cc36d --- /dev/null +++ b/deepspeed/runtime/swap_tensor/optimizer_utils.py @@ -0,0 +1,526 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. + +Functionality of swapping tensors to/from (NVMe) storage devices. 
+""" + +import os +import torch + +from deepspeed.utils.logging import logger +from deepspeed.runtime.zero.offload_constants import * +from deepspeed.runtime.swap_tensor.constants import * +from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, \ + MIN_AIO_BYTES, AIO_ALIGNED_BYTES, get_sized_buffers, get_sized_buffer +from deepspeed.runtime.swap_tensor.utils import SwapBufferManager, SwapBufferPool + + +class FlattenedTensorSwapInfo(object): + def __init__(self, path, length, offset): + self.path = path + self.offset = offset + self.length = length + + +class OptimizerStateSwapInfo(object): + def __init__(self, parameter, numel, base_folder): + self.tensors = [] + self.param_id = id(parameter) + self.swap_folder = base_folder + self.swap_paths = [] + self.swapped_gradients = {} + self.unswapped_gradients = {} + self.tensor_numel = numel + self.tensor_dtype = parameter.dtype + self.tensor_device = parameter.device + self.has_state_tensors = False + self._add_tensors([parameter]) + + def numel(self): + return self.tensor_numel + + def has_gradients(self): + return self.swapped_gradients or self.unswapped_gradients + + def _add_tensors(self, tensor_list): + for t in tensor_list: + self.tensors.append(t) + self.swap_paths.append(os.path.join(self.swap_folder, f'{id(t)}.tensor.swp')) + + def add_state_tensors(self, tensor_list): + self.has_state_tensors = True + self._add_tensors(tensor_list) + + def device(self): + return self.tensor_device + + def dtype(self): + return self.tensor_dtype + + def release_memory(self): + for tensor in self.tensors: + tensor.data = torch.Tensor() + + def get_or_create_gradient_paths(self, offsets, lengths): + gradient_paths = [] + for offset, length in zip(offsets, lengths): + if not offset in self.swapped_gradients.keys(): + path = os.path.join( + self.swap_folder, + f'{self.param_id}_gradient_{offset}_{length}.tensor.swp') + self.swapped_gradients[offset] = FlattenedTensorSwapInfo( + path, + length, + offset) + + gradient_paths.append(self.swapped_gradients[offset].path) + + return gradient_paths + + def set_swap_buffers(self, buffers): + compute_lengths = [self.numel()] * len(self.tensors) + compute_buffers = get_sized_buffers(buffers, compute_lengths) + for t, buffer in zip(self.tensors, compute_buffers): + t.data = buffer.data + + def get_swap_gradient_buffers(self, swap_buffer): + assert self.numel() <= swap_buffer.numel() + return [ + swap_buffer.narrow(0, + grad.offset, + grad.length) for grad in self.swapped_gradients.values() + ] + + def get_swap_gradient_paths(self): + return [grad.path for grad in self.swapped_gradients.values()] + + def get_unpinned_state_tensors(self): + return [t for t in self.tensors if not t.is_pinned()] + + def read_unswapped_gradients(self, dest_buffer): + num_elem_count = 0 + for offset, grad_partition in self.unswapped_gradients.items(): + dst_tensor = dest_buffer.narrow(0, offset, grad_partition.numel()) + dst_tensor.data.copy_(grad_partition.data) + num_elem_count += grad_partition.numel() + + return num_elem_count + + def release_unswapped_gradients(self): + self.unswapped_gradients = {} + + +SWAPPER_DEBUG_MODE = False +SWAP_OUT_GRADIENT_TIMER = 'swap_out_gradient' + + +class OptimizerSwapper(object): + def __init__(self, + swap_config, + aio_config, + base_folder, + optimizer, + largest_numel, + device, + dtype, + timers): + self.swap_config = swap_config + self.aio_config = aio_config + + # NVMe swap management + self.swap_params_info = {} + self.swap_element_size = torch.tensor([], 
dtype=dtype).element_size() + self.swap_folder = os.path.join(base_folder, + 'optimizer', + f'rank{torch.distributed.get_rank()}') + os.makedirs(self.swap_folder, exist_ok=True) + + self.optimizer = optimizer + + # Swap buffer management + self.largest_numel = largest_numel + self.dtype = dtype + self.swap_buffer_manager = SwapBufferManager( + num_elems=largest_numel, + count=swap_config[OFFLOAD_OPTIMIZER_BUFFER_COUNT], + dtype=dtype) + + # Read/Write alignment for each thread during Intra-request parallelism + self.min_aio_bytes = max(MIN_AIO_BYTES, aio_config[AIO_BLOCK_SIZE]) + self.aligned_bytes = AIO_ALIGNED_BYTES * aio_config[AIO_THREAD_COUNT] + self.numel_alignment = self.aligned_bytes // self.swap_element_size + + # Timers + self.timers = timers + self.timer_names = set() + + # Print exclusion list + self.print_exclude_list = [ + 'optimizer', + 'swap_buffer_manager', + 'swap_params_info', + 'timers', + 'timer_names', + ] + + def swappable_tensor(self, param=None, numel=None): + assert param is not None or numel is not None, "Either param or numel must be provided" + if param is not None: + return self.min_aio_bytes <= (param.numel() * self.swap_element_size) + return self.min_aio_bytes <= (numel * self.swap_element_size) + + def init_timers(self): + self.timer_names = set() + + def log_timers(self): + if self.timer_names: + self._log_timers(list(self.timer_names), force=True) + + def pre_backward(self): + self.init_timers() + + def post_backward(self): + pass + + def _flush_gradient_swapper(self, gradient_swapper): + if gradient_swapper.has_buffers(): + self._start_timer(SWAP_OUT_GRADIENT_TIMER) + pinned_buffers = gradient_swapper.release_buffers() + self.swap_buffer_manager.free(pinned_buffers) + self._stop_timer(SWAP_OUT_GRADIENT_TIMER) + self.timer_names.add(SWAP_OUT_GRADIENT_TIMER) + self.timer_names.update(gradient_swapper.get_timer_names()) + + def _swap_out_gradients(self, + parameter, + gradient_offsets, + gradient_tensors, + gradient_swapper): + if not id(parameter) in self.swap_params_info.keys(): + return + + swap_info = self.swap_params_info[id(parameter)] + + swappable_tensors = [] + swappable_offsets = [] + swappable_lengths = [] + + aligned_gradients, aligned_offsets = self._adjust_for_misaligned_lengths( + tensors=gradient_tensors, + offsets=gradient_offsets + ) + + self._start_timer(SWAP_OUT_GRADIENT_TIMER) + for tensor, offset in zip(aligned_gradients, aligned_offsets): + if not self.swappable_tensor(param=tensor): + swap_info.unswapped_gradients[offset] = tensor + continue + + swappable_tensors.append(tensor) + swappable_offsets.append(offset) + swappable_lengths.append(tensor.numel()) + + if len(swappable_tensors) > 0: + if not gradient_swapper.has_buffers(): + pinned_buffers = self.swap_buffer_manager.allocate_all( + num_elems=self.largest_numel, + dtype=self.dtype) + + gradient_swapper.add_buffers(pinned_buffers) + + swappable_paths = swap_info.get_or_create_gradient_paths( + swappable_offsets, + swappable_lengths) + + gradient_swapper.swap_out_tensors(tensor_list=swappable_tensors, + path_list=swappable_paths) + + self._stop_timer(SWAP_OUT_GRADIENT_TIMER) + self.timer_names.add(SWAP_OUT_GRADIENT_TIMER) + + def _initialize_from_swapped_fp16_params(self, + aio_handle, + fp16_partitions_info, + fp16_num_elems, + fp16_pinned_buffers, + fp32_parameters): + assert len(fp32_parameters) == len(fp16_partitions_info) + assert len(fp32_parameters) == len(fp16_num_elems) + assert all([buffer.is_pinned() for buffer in fp16_pinned_buffers]) + + fp32_swap_paths = 
self._get_swap_paths(parameters=fp32_parameters, + num_elems=fp16_num_elems) + + fp32_pinned_buffers = self.swap_buffer_manager.allocate_all( + num_elems=self.largest_numel, + dtype=self.dtype) + + fp16_buffer_numel = [buf.numel() for buf in fp16_pinned_buffers] + assert all([numel >= self.largest_numel for numel in fp16_buffer_numel]), \ + f"numel of fp16 buffers {fp16_buffer_numel} is too small for initializing fp32 params {self.largest_numel}" + + fp32_swap_buffers = SwapBufferPool(fp32_pinned_buffers) + fp16_swap_buffers = SwapBufferPool(fp16_pinned_buffers) + + curr_index = 0 + while curr_index < len(fp32_parameters): + fp16_pinned_tensors = self._swap_in_fp16_params( + aio_handle=aio_handle, + fp16_num_elems=fp16_num_elems[curr_index:], + fp16_partitions_info=fp16_partitions_info[curr_index:], + fp16_swap_buffers=fp16_swap_buffers) + + if torch.distributed.get_rank() == 0 and SWAPPER_DEBUG_MODE: + for i, tensor in enumerate(fp16_pinned_tensors): + true_index = curr_index + i + logger.info( + f'swap_in_fp16_param: fp32_id = {id(fp32_parameters[true_index])} index = {true_index} orig_num_elem = {fp16_num_elems[true_index]}, swap_num_elem = {fp16_pinned_tensors[i].numel()}' + ) + + swap_out_count = self._swap_out_fp16_params( + aio_handle=aio_handle, + fp32_swap_paths=fp32_swap_paths[curr_index:], + fp32_swap_buffers=fp32_swap_buffers, + fp16_pinned_tensors=fp16_pinned_tensors) + assert swap_out_count == len(fp16_pinned_tensors), \ + f"{swap_out_count} does not match {len(fp16_pinned_tensors)}" + + fp16_swap_buffers.reset() + fp32_swap_buffers.reset() + curr_index += swap_out_count + + self.swap_buffer_manager.free(fp32_pinned_buffers) + + def _swap_in_fp16_params(self, + aio_handle, + fp16_num_elems, + fp16_partitions_info, + fp16_swap_buffers): + assert len(fp16_num_elems) > 0 + + swapped_fp16_tensors = [] + swap_tensors = [] + swap_paths = [] + unswapped_srcs = [] + unswapped_dsts = [] + + for i, numel in enumerate(fp16_num_elems): + pinned_tensor, _ = fp16_swap_buffers.allocate_tensor(numel, None, numel) + if pinned_tensor is None: + break + + swapped_fp16_tensors.append(pinned_tensor) + offset = 0 + for tensor, partition_numel, partition_path in fp16_partitions_info[i]: + dst_tensor = pinned_tensor.narrow(0, offset, partition_numel) + if partition_path is None: + unswapped_srcs.append(tensor) + unswapped_dsts.append(dst_tensor) + else: + swap_paths.append(partition_path) + swap_tensors.append(dst_tensor) + offset += partition_numel + + assert len(swapped_fp16_tensors) + len(unswapped_srcs) > 0 + ret = swap_in_tensors(aio_handle, swap_tensors, swap_paths) + for src, dst in zip(unswapped_srcs, unswapped_dsts): + dst.data.copy_(src.data) + + assert len(swap_tensors) == aio_handle.wait() + + return swapped_fp16_tensors + + def _swap_out_fp16_params(self, + aio_handle, + fp32_swap_paths, + fp32_swap_buffers, + fp16_pinned_tensors): + + assert len(fp16_pinned_tensors) <= len(fp32_swap_paths) + swap_out_count = 0 + for i, fp16_tensor in enumerate(fp16_pinned_tensors): + if not fp32_swap_buffers.has_space(fp16_tensor.numel()): + fp32_swap_buffers.swap_out(aio_handle) + fp32_swap_buffers.reset() + + pinned_tensor, _ = fp32_swap_buffers.insert_tensor( + fp16_tensor, + fp32_swap_paths[i], + self._io_aligned_numel(fp16_tensor.numel()) + ) + assert pinned_tensor is not None + swap_out_count += 1 + + if len(fp32_swap_buffers.get_swap_tensors()) > 0: + fp32_swap_buffers.swap_out(aio_handle) + + return swap_out_count + + def _initialize_parameters(self, parameters, src_tensors, aio_handle): + 
assert len(parameters) == len(src_tensors) + + swap_paths = self._get_swap_paths(parameters=parameters, + num_elems=[src.numel() for src in src_tensors]) + + SWAP_INIT_TIMER = "swap_init_write" + self._start_timer(SWAP_INIT_TIMER) + + pinned_buffers = self.swap_buffer_manager.allocate_all( + num_elems=self.largest_numel, + dtype=self.dtype) + assert pinned_buffers is not None + + self._swap_out_unpinned_tensors(aio_handle=aio_handle, + unpinned_tensors=src_tensors, + dest_paths=swap_paths, + pinned_buffers=pinned_buffers) + + if torch.distributed.get_rank() == 0 and SWAPPER_DEBUG_MODE: + for i, tensor in enumerate(src_tensors): + logger.info( + f'copy_in_fp16_param: fp32_id = {id(parameters[i])} index = {i}, swap_num_elem = {src_tensors[i].numel()}' + ) + + self.swap_buffer_manager.free(pinned_buffers) + + self._stop_timer(SWAP_INIT_TIMER) + self._log_timers([SWAP_INIT_TIMER]) + + def _get_swap_paths(self, parameters, num_elems): + swap_info_list = [ + self._create_param_swap_info(parameter=p, + numel=numel) \ + for p, numel in zip(parameters, num_elems) + ] + assert len(swap_info_list) == len(num_elems) + + swap_paths = [info.swap_paths[0] for info in swap_info_list] + return swap_paths + + def _swap_out_unpinned_tensors(self, + aio_handle, + unpinned_tensors, + dest_paths, + pinned_buffers): + + swap_buffer_count = len(pinned_buffers) + unpinned_tensor_count = len(unpinned_tensors) + + for i in range(0, unpinned_tensor_count, swap_buffer_count): + swap_tensor_count = min((unpinned_tensor_count - i), swap_buffer_count) + + src_tensors = unpinned_tensors[i:(i + swap_tensor_count)] + compute_lengths = [t.numel() for t in src_tensors] + compute_buffers = get_sized_buffers(pinned_buffers, compute_lengths) + + for dst, src in zip(compute_buffers, src_tensors): + dst.data.copy_(src.data) + + swap_lengths = [self._io_aligned_numel(t.numel()) for t in src_tensors] + swap_buffers = get_sized_buffers(pinned_buffers, swap_lengths) + + swap_paths = dest_paths[i:(i + swap_tensor_count)] + swap_out_tensors(aio_handle, swap_buffers, swap_paths) + + assert aio_handle.wait() == swap_tensor_count + + def _adjust_for_misaligned_lengths(self, tensors, offsets): + new_tensors = [] + new_offsets = [] + + for orig_tensor, orig_offset in zip(tensors, offsets): + if not self.swappable_tensor(param=orig_tensor): + new_tensors.append(orig_tensor) + new_offsets.append(orig_offset) + continue + + remainder = orig_tensor.numel() % self.numel_alignment + if remainder == 0: + new_tensors.append(orig_tensor) + new_offsets.append(orig_offset) + continue + + # Split into two by making remainder a tensor + aligned_length = (orig_tensor.numel() // + self.numel_alignment) * self.numel_alignment + new_tensors.append(orig_tensor.narrow(0, 0, aligned_length)) + new_offsets.append(orig_offset) + + # remainder tensor + new_tensors.append(orig_tensor.narrow(0, aligned_length, remainder)) + new_offsets.append(orig_offset + aligned_length) + + return new_tensors, new_offsets + + def _retrieve_unswapped_grad_partitions(self, swap_info, dest_buffer): + UNSWAPPED_READ_GRADIENTS = 'unswapped_read_gradients' + self._start_timer(UNSWAPPED_READ_GRADIENTS) + tensor_count = len(swap_info.unswapped_gradients) + num_elem_count = swap_info.read_unswapped_gradients(dest_buffer) + self._stop_timer(UNSWAPPED_READ_GRADIENTS) + self._log_timers([UNSWAPPED_READ_GRADIENTS]) + + # It shoud be safe to discard unswapped gradient partitions + swap_info.release_unswapped_gradients() + + if SWAPPER_DEBUG_MODE: + logger.info( + 
+                f'optimizer_retrieve_unswapped_gradients: param={swap_info.param_id} tensor_count={tensor_count} elem_count={num_elem_count}'
+            )
+
+    def _get_state_tensors(self, parameter):
+        if not parameter in self.optimizer.state:
+            return []
+
+        tensor_list = []
+        for value in self.optimizer.state[parameter].values():
+            if torch.is_tensor(value):
+                tensor_list.append(value)
+
+        return tensor_list
+
+    def _update_param_state_info(self, swap_info, parameter):
+        if not swap_info.has_state_tensors:
+            state_tensors = self._get_state_tensors(parameter)
+            if state_tensors:
+                swap_info.add_state_tensors(state_tensors)
+
+    def _create_param_swap_info(self, parameter, numel):
+        param_id = id(parameter)
+        assert not param_id in self.swap_params_info
+
+        self.swap_params_info[param_id] = OptimizerStateSwapInfo(
+            parameter=parameter,
+            numel=numel,
+            base_folder=self.swap_folder)
+        swap_info = self.swap_params_info[param_id]
+
+        self._update_param_state_info(swap_info, parameter)
+
+        return swap_info
+
+    def _get_param_swap_info(self, parameter):
+        param_id = id(parameter)
+        swap_info = self.swap_params_info.get(param_id, None)
+
+        if swap_info is not None:
+            self._update_param_state_info(swap_info, parameter)
+
+        return swap_info
+
+    def _start_timer(self, name):
+        if self.timers:
+            self.timers(name).start()
+
+    def _stop_timer(self, name):
+        if self.timers:
+            self.timers(name).stop()
+
+    def _log_timers(self, name_list, force=False):
+        if self.timers and (SWAPPER_DEBUG_MODE or force):
+            self.timers.log(name_list)
+
+    def _io_aligned_numel(self, numel):
+        remainder = numel % self.numel_alignment
+        return numel if remainder == 0 else (numel + self.numel_alignment - remainder)
diff --git a/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py
new file mode 100644
index 000000000000..fb321f3b3937
--- /dev/null
+++ b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py
@@ -0,0 +1,260 @@
+"""
+Copyright 2020 The Microsoft DeepSpeed Team
+Licensed under the MIT license.
+
+Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
+""" + +import os +import torch + +from deepspeed.utils.logging import logger +from deepspeed.ops.aio import AsyncIOBuilder + +from deepspeed.runtime.swap_tensor.constants import * +from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, print_object, \ + MIN_AIO_BYTES, AIO_ALIGNED_BYTES, get_sized_buffers, get_sized_buffer +from deepspeed.runtime.swap_tensor.async_swapper import AsyncTensorSwapper +from deepspeed.runtime.swap_tensor.optimizer_utils import OptimizerSwapper + +DEBUG_MODE = False + +SWAP_IN_PARAM_TIMER = 'swap_in_param' +SWAP_OUT_PARAM_TIMER = 'swap_out_param' +SWAP_IN_GRADIENT_TIMER = 'swap_in_gradient' + + +class PartitionedOptimizerSwapper(OptimizerSwapper): + def __init__(self, + swap_config, + aio_config, + base_folder, + optimizer, + largest_numel, + device, + dtype, + timers): + super(PartitionedOptimizerSwapper, + self).__init__(swap_config, + aio_config, + base_folder, + optimizer, + largest_numel, + device, + dtype, + timers) + + aio_op = AsyncIOBuilder().load() + self.aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], + aio_config[AIO_QUEUE_DEPTH], + aio_config[AIO_SINGLE_SUBMIT], + aio_config[AIO_OVERLAP_EVENTS], + aio_config[AIO_THREAD_COUNT]) + + # Overlap swapping out + self.gradient_swapper = AsyncTensorSwapper(aio_handle=self.aio_handle, + numel_alignment=self.numel_alignment, + timers=self.timers) + + self.print_exclude_list += [ + 'aio_handle', + 'gradient_swapper', + 'print_exclude_list' + ] + + if torch.distributed.get_rank() == 0: + print_object(obj=self, + name='PartitionedOptimizerSwapper', + exclude_list=self.print_exclude_list) + + def initialize_parameters(self, parameters, src_tensors): + self._initialize_parameters(parameters=parameters, + src_tensors=src_tensors, + aio_handle=self.aio_handle) + + def initialize_from_swapped_fp16_params(self, + fp16_partitions_info, + fp16_num_elems, + fp16_pinned_buffers, + fp32_parameters): + self._initialize_from_swapped_fp16_params( + aio_handle=self.aio_handle, + fp16_partitions_info=fp16_partitions_info, + fp16_num_elems=fp16_num_elems, + fp16_pinned_buffers=fp16_pinned_buffers, + fp32_parameters=fp32_parameters) + + def flush_gradients(self): + self._flush_gradient_swapper(self.gradient_swapper) + + def swap_in_optimizer_state(self, parameter, async_parameter=None): + swap_info = self._get_param_swap_info(parameter) + if swap_info is None: + return + + self._flush_gradient_swapper(self.gradient_swapper) + + required_buffer_count = len( + swap_info.tensors) + (1 if swap_info.has_gradients() else 0) + aligned_numel = self._io_aligned_numel(swap_info.numel()) + pinned_buffers = self.swap_buffer_manager.allocate(num_elems=aligned_numel, + count=required_buffer_count, + dtype=parameter.dtype) + assert pinned_buffers is not None + self.allocated_swap_buffers = pinned_buffers.copy() + + self._start_timer(SWAP_IN_PARAM_TIMER) + self._swap_in_parameter(aio_handle=self.aio_handle, + parameter=parameter, + dest_buffers=pinned_buffers[:required_buffer_count]) + self._stop_timer(SWAP_IN_PARAM_TIMER) + self.timer_names.add(SWAP_IN_PARAM_TIMER) + + self._start_timer(SWAP_IN_GRADIENT_TIMER) + self._swap_in_gradients(aio_handle=self.aio_handle, + parameter=parameter, + dest_buffer=pinned_buffers[-1]) + self._stop_timer(SWAP_IN_GRADIENT_TIMER) + self.timer_names.add(SWAP_IN_GRADIENT_TIMER) + + def swap_out_optimizer_state(self, parameter, async_swap=False): + swap_info = self._get_param_swap_info(parameter=parameter) + + if swap_info is None: + return + + 
self._start_timer(SWAP_OUT_PARAM_TIMER) + pinned_tensors, pinned_paths, unpinned_tensors, unpinned_paths = self._seperate_pinned_tensors(swap_info) + swap_bytes = sum([ + self._io_aligned_numel(t.numel()) * t.element_size() + for t in swap_info.tensors + ]) + + WRITE_TIMER = 'swap_submit_write' + self._start_timer(WRITE_TIMER) + + swap_out_tensors(self.aio_handle, pinned_tensors, pinned_paths) + assert self.aio_handle.wait() == len(pinned_tensors) + for t in pinned_tensors: + t.data = torch.Tensor() + + if len(unpinned_tensors) > 0: + pinned_buffers = self.swap_buffer_manager.allocate_all( + num_elems=self.largest_numel, + dtype=self.dtype) + self._swap_out_unpinned_tensors(aio_handle=self.aio_handle, + unpinned_tensors=unpinned_tensors, + dest_paths=unpinned_paths, + pinned_buffers=pinned_buffers) + self.allocated_swap_buffers += pinned_buffers + + for t in unpinned_tensors: + t.data = torch.Tensor() + self._stop_timer(WRITE_TIMER) + + self.swap_buffer_manager.free(self.allocated_swap_buffers) + self.allocated_swap_buffers = [] + + self._stop_timer(SWAP_OUT_PARAM_TIMER) + self.timer_names.add(SWAP_OUT_PARAM_TIMER) + + self._log_timers([WRITE_TIMER]) + + if DEBUG_MODE and torch.distributed.get_rank() == 0: + logger.info(f'optimizer_param_swap_out: {(swap_bytes/(1024**3)):5.2f} GB') + + def swap_out_gradients(self, parameter, gradient_offsets, gradient_tensors): + self._swap_out_gradients(parameter=parameter, + gradient_offsets=gradient_offsets, + gradient_tensors=gradient_tensors, + gradient_swapper=self.gradient_swapper) + + def _swap_in_parameter(self, aio_handle, parameter, dest_buffers): + swap_info = self._get_param_swap_info(parameter) + if swap_info is None: + return + + assert len(swap_info.tensors) <= len(dest_buffers) + + swap_lengths = [self._io_aligned_numel(swap_info.numel())] * len( + swap_info.tensors) + swap_buffers = get_sized_buffers(dest_buffers, swap_lengths) + + READ_TIMER = 'swap_submit_read_param' + WAIT_TIMER = 'swap_wait_read_param' + + self._start_timer(READ_TIMER) + swap_in_tensors(aio_handle, swap_buffers, swap_info.swap_paths) + self._stop_timer(READ_TIMER) + + swap_bytes = sum( + [buffer.numel() * buffer.element_size() for buffer in swap_buffers]) + + self._start_timer(WAIT_TIMER) + aio_handle.wait() + self._stop_timer(WAIT_TIMER) + + compute_lengths = [swap_info.numel()] * len(swap_info.tensors) + compute_buffers = get_sized_buffers(dest_buffers, compute_lengths) + for t, buffer in zip(swap_info.tensors, compute_buffers): + t.data = buffer.data + + self._log_timers([READ_TIMER, WAIT_TIMER]) + if DEBUG_MODE and torch.distributed.get_rank() == 0: + logger.info(f'optimizer_param_swap_in: {(swap_bytes/(1024**3)):5.2f} GB') + + def _seperate_pinned_tensors(self, swap_info): + pinned_tensors = [] + pinned_paths = [] + + unpinned_tensors = [] + unpinned_paths = [] + + for tensor, path in zip(swap_info.tensors, swap_info.swap_paths): + if tensor.is_pinned(): + pinned_tensors.append(tensor) + pinned_paths.append(path) + else: + unpinned_tensors.append(tensor) + unpinned_paths.append(path) + + return pinned_tensors, pinned_paths, unpinned_tensors, unpinned_paths + + def _swap_in_pinned_gradients(self, aio_handle, parameter, gradient_tensor): + swap_info = self.swap_params_info[id(parameter)] + param_gradients = swap_info.swapped_gradients.values() + swap_buffers = [ + gradient_tensor.narrow(0, + grad.offset, + grad.length) for grad in param_gradients + ] + swap_paths = [grad.path for grad in param_gradients] + SWAP_READ_GRADIENTS = 'swap_submit_read_gradient' + 
SWAP_WAIT_GRADIENTS = 'swap_submit_wait_gradient' + + self._start_timer(SWAP_READ_GRADIENTS) + swap_in_tensors(aio_handle, swap_buffers, swap_paths) + self._stop_timer(SWAP_READ_GRADIENTS) + + self._start_timer(SWAP_WAIT_GRADIENTS) + assert len(swap_buffers) == aio_handle.wait() + self._stop_timer(SWAP_WAIT_GRADIENTS) + + self._log_timers([SWAP_READ_GRADIENTS, SWAP_WAIT_GRADIENTS]) + + def _swap_in_gradients(self, aio_handle, parameter, dest_buffer): + swap_info = self.swap_params_info.get(id(parameter), None) + if not (swap_info and swap_info.has_gradients()): + return + + assert dest_buffer.is_pinned() + assert parameter.numel() <= dest_buffer.numel() + + parameter.grad = dest_buffer.narrow(0, 0, parameter.numel()) + + if swap_info.swapped_gradients: + self._swap_in_pinned_gradients(aio_handle, parameter, parameter.grad) + + if swap_info.unswapped_gradients: + self._retrieve_unswapped_grad_partitions(swap_info=swap_info, + dest_buffer=parameter.grad) diff --git a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py new file mode 100644 index 000000000000..0f8a690b7a91 --- /dev/null +++ b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py @@ -0,0 +1,308 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. + +Functionality of swapping tensors to/from (NVMe) storage devices. +""" + +import os +from enum import Enum +import torch +import torch.distributed as dist + +from deepspeed.utils.logging import logger +from deepspeed.ops.aio import AsyncIOBuilder +from .constants import * +from .utils import swap_in_tensors, swap_out_tensors, MIN_AIO_BYTES, print_object +from ..zero.offload_constants import * + + +def print_rank_0(message, debug=False, force=False): + if torch.distributed.get_rank() == 0 and (debug or force): + print(message) + + +class PartitionedParamStatus(Enum): + # Partitioned parameters are present and ready for use + AVAILABLE = 1 + + # partitioned params are in some non-memory device + NOT_AVAILABLE = 2 + + # partitioned params are being read from some non-memory device. 
+ INFLIGHT = 3 + + +class AsyncPartitionedParameterSwapper(object): + def __init__(self, ds_config): + + aio_op = AsyncIOBuilder().load(verbose=False) + self.aio_handle = aio_op.aio_handle + + #set swap buffers, create aio handles + self._configure_aio(ds_config) + + #mapping from param id to path + self.id_to_path = {} + + #mapping from pram_id to buffer id + self.param_id_to_buffer_id = {} + + #number of elements in the param + self.param_id_to_numel = {} + + self.pending_writes = 0 + self.pending_reads = 0 + + #keep track of async swap in params and buffers + self.inflight_params = [] + self.inflight_swap_in_buffers = [] + self.inflight_numel = 0 + + #keep track of available params + self.available_params = set() + self.available_numel = 0 + + self.invalid_buffer = torch.tensor(1).half() + + if dist.get_rank() == 0: + exclude_list = ['aio_read_handle', 'aio_write_handle', 'buffers'] + print_object(obj=self, + name='AsyncPartitionedParameterSwapper', + exclude_list=exclude_list) + + def available_swap_in_buffers(self): + return len(self.available_buffer_ids) + + def _configure_aio(self, ds_config): + self.swap_config = ds_config.zero_config.offload_param + self.swap_folder = os.path.join(self.swap_config[OFFLOAD_PARAM_NVME_PATH], + 'zero_stage_3', + 'fp16params', + f'rank{dist.get_rank()}') + os.makedirs(self.swap_folder, exist_ok=True) + + self.elements_per_buffer = self.swap_config[OFFLOAD_PARAM_BUFFER_SIZE] + self.param_buffer_count = self.swap_config[OFFLOAD_PARAM_BUFFER_COUNT] + + self.available_buffer_ids = [i for i in range(self.param_buffer_count)] + self.reserved_buffer_ids = [] + + self.buffers = torch.empty(int(self.elements_per_buffer * + self.param_buffer_count), + dtype=torch.half, + pin_memory=True, + requires_grad=False) + + self.aio_config = ds_config.aio_config + + self.aio_read_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], + self.aio_config[AIO_QUEUE_DEPTH], + self.aio_config[AIO_SINGLE_SUBMIT], + self.aio_config[AIO_OVERLAP_EVENTS], + self.aio_config[AIO_THREAD_COUNT]) + + self.aio_write_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], + self.aio_config[AIO_QUEUE_DEPTH], + self.aio_config[AIO_SINGLE_SUBMIT], + self.aio_config[AIO_OVERLAP_EVENTS], + self.aio_config[AIO_THREAD_COUNT]) + + self.min_aio_bytes = max(MIN_AIO_BYTES, self.aio_config[AIO_BLOCK_SIZE]) + + self.swap_element_size = torch.tensor([], dtype=torch.half).element_size() + self.swap_out_params = [] + + #Check if partiitoned param or numel in a tensor is swappable or not + def swappable_tensor(self, param=None, numel=None): + if param is not None: + assert numel is None, "Both parma and numel cannot be provided" + numel = param.ds_tensor.ds_numel + if numel is not None: + return self.min_aio_bytes <= numel * self.swap_element_size + assert False, "Either param or numel must be provided" + + def get_path(self, param, must_exist=False): + paths, _ = self._get_paths([param], must_exist=must_exist) + return paths[0] + + def _get_paths(self, params, must_exist=False): + paths = [] + tensors = [] + for param in params: + param_id = param.ds_id + + if param_id in self.id_to_path.keys(): + param_path = self.id_to_path[param_id] + else: + assert not must_exist, f"Path for param id {param_id} does not exist" + param_path = os.path.join(self.swap_folder, + f'{param_id}_param.tensor.swp') + + self.id_to_path[param_id] = param_path + paths.append(param_path) + tensors.append(param.ds_tensor) + return paths, tensors + + def _track_numel(self, params): + for param in params: + assert 
param.ds_tensor is not None, "Partitioned tensor is None" + self.param_id_to_numel[param.ds_id] = param.ds_tensor.ds_numel + + def _allocate_and_return_buffers_for_swap_in(self, params): + buffers = [] + for param in params: + param_id = param.ds_id + assert param_id in self.param_id_to_numel.keys(), f" Number of elements in param {param_id} is unknown" + assert param_id not in self.param_id_to_buffer_id.keys(), f"param {param_id} already assigned swap buffer id {self.param_id_to_buffer_id[param_id]}" + + buffer_id = self.available_buffer_ids.pop() + print_rank_0( + f"param {param.ds_id} is assigned swap in buffer id {buffer_id} ") + self.param_id_to_buffer_id[param_id] = buffer_id + buffer = self.buffers.narrow(0, + int(buffer_id * self.elements_per_buffer), + self.param_id_to_numel[param_id]) + buffers.append(buffer) + + return buffers + + #waits for inflight nvme write to complete + def synchronize_writes(self): + if self.pending_writes == 0: + return + assert self.pending_writes == self.aio_write_handle.wait() + self.pending_writes = 0 + self.remove_partition_and_release_buffers(self.swap_out_params) + self.swap_out_params = [] + + #waits for inflight nvme reads to complete + def synchronize_reads(self): + if self.pending_reads == 0: + return + + assert self.pending_reads == self.aio_read_handle.wait() + + self.pending_reads = 0 + + for param, swap_in_buffer in zip(self.inflight_params, self.inflight_swap_in_buffers): + param.ds_tensor.data = swap_in_buffer.data + param.ds_tensor.status = PartitionedParamStatus.AVAILABLE + + self.available_params.update([param.ds_id for param in self.inflight_params]) + self.available_numel += self.inflight_numel + + self.inflight_params = [] + self.inflight_swap_in_buffers = [] + self.inflight_numel = 0 + + #Removes the memory assignment and releases the buffers + #Should only be executed after swapping out the tensors + def remove_partition_and_release_buffers(self, params): + for param in params: + param_id = param.ds_id + + if param_id in self.param_id_to_buffer_id.keys(): + + buffer_id = self.param_id_to_buffer_id[param_id] + + assert buffer_id is not None, "Missing buffer id for releasing" + + self.available_buffer_ids.append(buffer_id) + del self.param_id_to_buffer_id[param_id] + print_rank_0(f"param {param.ds_id} releases buffer id {buffer_id} ") + + if param_id in self.available_params: + self.available_params.remove(param_id) + self.available_numel -= self.param_id_to_numel[param_id] + + param.ds_tensor.data = self.invalid_buffer.data + param.ds_tensor.status = PartitionedParamStatus.NOT_AVAILABLE + + #writes from in memory to nvme. Does not release the buffers + def _swap_out(self, params, async_op=True): + + swap_out_paths, swap_out_params = self._get_paths(params) + + self._track_numel(params) + + swap_out_tensors(self.aio_write_handle, swap_out_params, swap_out_paths) + + self.pending_writes += len(swap_out_params) + self.swap_out_params += params + + if not async_op: + self.synchronize_writes() + + #blocking swap out followed by releasing the memory buffers + def swap_out_and_release(self, params, async_op=False, force_buffer_release=False): + if async_op: + assert force_buffer_release, "Should not release preallocated buffers without completing the swap out. 
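All swap-in reads land in windows of one flat, pre-allocated pinned buffer: a parameter claims a free buffer id, and its destination is a `narrow` of the big buffer at `buffer_id * elements_per_buffer`, returned to the free list once the partition is back on NVMe. A minimal, self-contained sketch of that bookkeeping (no pinning and no actual I/O; the class and names are illustrative):

```python
import torch

class BufferSlots:
    """Simplified sketch of the swapper's buffer-id bookkeeping."""
    def __init__(self, elements_per_buffer=8, buffer_count=4):
        self.elements_per_buffer = elements_per_buffer
        # One flat backing tensor; the real swapper allocates it in half precision
        # with pin_memory=True so AIO reads can land in it directly.
        self.buffers = torch.empty(elements_per_buffer * buffer_count, dtype=torch.half)
        self.available_buffer_ids = list(range(buffer_count))
        self.param_id_to_buffer_id = {}

    def get_view(self, param_id, numel):
        # Claim a free slot and return a view of exactly `numel` elements inside it.
        assert numel <= self.elements_per_buffer, "partition larger than a swap buffer"
        buffer_id = self.available_buffer_ids.pop()
        self.param_id_to_buffer_id[param_id] = buffer_id
        return self.buffers.narrow(0, buffer_id * self.elements_per_buffer, numel)

    def release(self, param_id):
        # Return the slot to the free list once the partition lives on NVMe again.
        self.available_buffer_ids.append(self.param_id_to_buffer_id.pop(param_id))

slots = BufferSlots()
dest = slots.get_view(param_id=0, numel=6)     # window an async read would fill
dest.copy_(torch.arange(6, dtype=torch.half))
slots.release(param_id=0)                      # analogous to remove_partition_and_release_buffers
```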
Set force_buffer_release to True to do it anyways" + self._swap_out(params, async_op=async_op) + + #assigns an in memory buffer and swaps in from nvme + def swap_in(self, params, async_op=True, swap_in_buffers=None): + + assert all([param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE for param in params]), "Some params are already available or in flight" + swap_in_paths, _ = self._get_paths(params) + + if swap_in_buffers is None: + if len(self.available_buffer_ids) < len(swap_in_paths): + print_rank_0( + f'Not enough swap in buffers {len(self.available_buffer_ids)} for params {len(swap_in_paths)}', + force=True) + print_rank_0( + f'Num inflight: params {len(self.inflight_params)}, buffers {len(self.inflight_swap_in_buffers)}, numel = {self.inflight_numel}', + force=True) + print_rank_0( + f'Num available: param {len(self.available_params)}, numel = {self.available_numel}', + force=True) + + assert len(swap_in_paths) <= len(self.available_buffer_ids), f"Not enough buffers {len(self.available_buffer_ids)} for swapping {len(swap_in_paths)}" + swap_in_buffers = self._allocate_and_return_buffers_for_swap_in(params) + + swap_in_tensors(self.aio_read_handle, swap_in_buffers, swap_in_paths) + + self.inflight_params.extend(params) + self.inflight_swap_in_buffers.extend(swap_in_buffers) + self.inflight_numel += sum([t.numel() for t in swap_in_buffers]) + + for param in params: + param.ds_tensor.status = PartitionedParamStatus.INFLIGHT + + self.pending_reads += len(params) + + if not async_op: + self.synchronize_reads() + + #assign a buffer to a param and return the buffer + def get_buffer(self, param, numel): + assert numel < self.elements_per_buffer, f"More elements {numel} than buffer size {self.elements_per_buffer}" + param_id = param.ds_id + self.param_id_to_numel[param_id] = numel + buffer_id = self.available_buffer_ids.pop() + self.param_id_to_buffer_id[param_id] = buffer_id + + buffer = self.buffers.narrow(0, + int(buffer_id * self.elements_per_buffer), + self.param_id_to_numel[param_id]) + print_rank_0(f"param {param.ds_id} is assigned swap in buffer id {buffer_id}") + return buffer + + def reserve_available_buffers(self): + buffers = [] + for id in self.available_buffer_ids: + buffers.append( + self.buffers.narrow(0, + int(id * self.elements_per_buffer), + int(self.elements_per_buffer))) + self.reserved_buffer_ids.append(id) + + self.available_buffer_ids = [] + return buffers + + def release_reserved_buffers(self): + for id in self.reserved_buffer_ids: + self.available_buffer_ids.append(id) + + self.reserved_buffer_ids = [] diff --git a/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py new file mode 100644 index 000000000000..7d0116faab5b --- /dev/null +++ b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py @@ -0,0 +1,284 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. + +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
+""" + +import os +import torch + +from deepspeed.utils.logging import logger +from deepspeed.ops.aio import AsyncIOBuilder + +from deepspeed.runtime.zero.offload_constants import * +from deepspeed.runtime.swap_tensor.constants import * +from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, print_object, \ + MIN_AIO_BYTES, AIO_ALIGNED_BYTES +from deepspeed.runtime.swap_tensor.async_swapper import AsyncTensorSwapper +from deepspeed.runtime.swap_tensor.optimizer_utils import SwapBufferManager, get_sized_buffer +from deepspeed.runtime.swap_tensor.optimizer_utils import OptimizerSwapper + + +class OptimizerSwapOp(object): + def __init__(self, + aio_handle, + read_op, + param_info, + allocated_buffers, + state_buffers, + num_ops): + self.aio_handle = aio_handle + self.read_op = read_op + self.param_info = param_info + self.allocated_buffers = allocated_buffers + self.state_buffers = state_buffers + self.wait_required = True + self.num_ops = num_ops + + def is_parameter(self, parameter): + return id(parameter) == self.param_info.param_id + + def wait(self): + assert self.wait_required + assert self.aio_handle.wait() == self.num_ops + self.wait_required = False + + +SYNC_SWAP_IN = 'sync_swap_in' +ASYNC_SWAP_IN = 'async_swap_in' +SYNC_SWAP_OUT = 'sync_swap_out' +ASYNC_SWAP_OUT = 'async_swap_out' + +SWAP_IN_STATE_TIMER = 'swap_in_state' +SWAP_OUT_STATE_TIMER = 'swap_out_state' +SWAP_OUT_GRADIENT_TIMER = 'swap_out_gradient' +ASYNC_SWAP_IN_STATE_TIMER = "async_swap_in_state" +ASYNC_SWAP_OUT_STATE_TIMER = 'async_swap_out_state' + + +class PipelinedOptimizerSwapper(OptimizerSwapper): + def __init__(self, + swap_config, + aio_config, + base_folder, + optimizer, + largest_numel, + device, + dtype, + timers): + super(PipelinedOptimizerSwapper, + self).__init__(swap_config, + aio_config, + base_folder, + optimizer, + largest_numel, + device, + dtype, + timers) + + aio_op = AsyncIOBuilder().load() + self.write_aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], + aio_config[AIO_QUEUE_DEPTH], + aio_config[AIO_SINGLE_SUBMIT], + aio_config[AIO_OVERLAP_EVENTS], + aio_config[AIO_THREAD_COUNT]) + + self.read_aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], + aio_config[AIO_QUEUE_DEPTH], + aio_config[AIO_SINGLE_SUBMIT], + aio_config[AIO_OVERLAP_EVENTS], + aio_config[AIO_THREAD_COUNT]) + + # Overlap gradient swap out + self.gradient_swapper = AsyncTensorSwapper(aio_handle=self.write_aio_handle, + numel_alignment=self.numel_alignment, + timers=self.timers) + + self.async_swap_in = swap_config[OFFLOAD_OPTIMIZER_PIPELINE_READ] + self.async_swap_out = swap_config[OFFLOAD_OPTIMIZER_PIPELINE_WRITE] + + self.swap_ops = { + SYNC_SWAP_IN: None, + ASYNC_SWAP_IN: None, + SYNC_SWAP_OUT: None, + ASYNC_SWAP_OUT: None + } + + self.print_exclude_list += [ + 'gradient_swapper', + 'read_aio_handle', + 'write_aio_handle', + 'swap_ops', + 'print_exclude_list' + ] + + if torch.distributed.get_rank() == 0: + print_object(obj=self, + name='PipelinedOptimizerSwapper', + exclude_list=self.print_exclude_list) + + def initialize_parameters(self, parameters, src_tensors): + self._initialize_parameters(parameters=parameters, + src_tensors=src_tensors, + aio_handle=self.write_aio_handle) + + def initialize_from_swapped_fp16_params(self, + fp16_partitions_info, + fp16_num_elems, + fp16_pinned_buffers, + fp32_parameters): + self._initialize_from_swapped_fp16_params( + aio_handle=self.write_aio_handle, + fp16_partitions_info=fp16_partitions_info, + fp16_num_elems=fp16_num_elems, + 
fp16_pinned_buffers=fp16_pinned_buffers, + fp32_parameters=fp32_parameters) + + def flush_gradients(self): + self._flush_gradient_swapper(self.gradient_swapper) + + def swap_in_optimizer_state(self, parameter, async_parameter): + assert parameter is not None + assert self.swap_ops[SYNC_SWAP_IN] is None + + self._flush_gradient_swapper(self.gradient_swapper) + + self._start_timer(SWAP_IN_STATE_TIMER) + + if self.swap_ops[ASYNC_SWAP_IN]: + assert self.swap_ops[ASYNC_SWAP_IN].is_parameter(parameter) + self.swap_ops[SYNC_SWAP_IN] = self.swap_ops[ASYNC_SWAP_IN] + self.swap_ops[ASYNC_SWAP_IN] = None + else: + self.swap_ops[SYNC_SWAP_IN] = self._swap_in_optimizer_state( + aio_handle=self.read_aio_handle, + parameter=parameter) + + if self.swap_ops[SYNC_SWAP_IN]: + self.swap_ops[SYNC_SWAP_IN].wait() + + if self.async_swap_in and async_parameter is not None: + assert self.swap_ops[ASYNC_SWAP_IN] is None + self.swap_ops[ASYNC_SWAP_IN] = self._swap_in_optimizer_state( + aio_handle=self.read_aio_handle, + parameter=async_parameter) + + self._stop_timer(SWAP_IN_STATE_TIMER) + self.timer_names.add(SWAP_IN_STATE_TIMER) + + def swap_out_optimizer_state(self, parameter, async_swap): + self._start_timer(SWAP_OUT_STATE_TIMER) + + if self.swap_ops[ASYNC_SWAP_OUT]: + self._start_timer(ASYNC_SWAP_OUT_STATE_TIMER) + self._complete_swap_out(ASYNC_SWAP_OUT) + self._stop_timer(ASYNC_SWAP_OUT_STATE_TIMER) + self.timer_names.add(ASYNC_SWAP_OUT_STATE_TIMER) + + assert self.swap_ops[SYNC_SWAP_IN] is not None + assert not self.swap_ops[SYNC_SWAP_IN].wait_required + swap_op = self._swap_out_optimizer_state(aio_handle=self.write_aio_handle, + parameter=parameter, + swap_in_op=self.swap_ops[SYNC_SWAP_IN]) + self.swap_ops[SYNC_SWAP_IN] = None + + if self.async_swap_out and async_swap: + self.swap_ops[ASYNC_SWAP_OUT] = swap_op + else: + self.swap_ops[SYNC_SWAP_OUT] = swap_op + self._complete_swap_out(SYNC_SWAP_OUT) + + self._stop_timer(SWAP_OUT_STATE_TIMER) + self.timer_names.add(SWAP_OUT_STATE_TIMER) + + def swap_out_gradients(self, parameter, gradient_offsets, gradient_tensors): + self._swap_out_gradients(parameter=parameter, + gradient_offsets=gradient_offsets, + gradient_tensors=gradient_tensors, + gradient_swapper=self.gradient_swapper) + + def _complete_swap_out(self, swap_out_type): + self.swap_ops[swap_out_type].wait() + self.swap_buffer_manager.free(self.swap_ops[swap_out_type].allocated_buffers) + self.swap_ops[swap_out_type] = None + + def _swap_out_optimizer_state(self, aio_handle, parameter, swap_in_op): + assert swap_in_op.is_parameter(parameter) + + allocated_buffers = swap_in_op.allocated_buffers.copy() + swap_buffers = swap_in_op.state_buffers.copy() + + param_info = swap_in_op.param_info + self._update_param_state_info(param_info, parameter) + unpinned_tensors = param_info.get_unpinned_state_tensors() + + if len(unpinned_tensors) > 0: + new_alloc_buffers = self.swap_buffer_manager.allocate( + num_elems=self._io_aligned_numel(param_info.numel()), + count=len(unpinned_tensors), + dtype=param_info.dtype()) + assert new_alloc_buffers is not None + + allocated_buffers += new_alloc_buffers + swap_buffers += new_alloc_buffers + + for pinned_dst, unpinned_src in zip(new_alloc_buffers, unpinned_tensors): + dst = get_sized_buffer(pinned_dst, unpinned_src.numel()) + dst.data.copy_(unpinned_src.data) + + swap_paths = param_info.swap_paths.copy() + assert len(swap_paths) == len(swap_buffers) + + swap_out_tensors(aio_handle, swap_buffers, swap_paths) + + swap_out_op = OptimizerSwapOp(aio_handle=aio_handle, + 
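The pipelining reduces to four op slots: a read issued ahead of time for the next parameter sits in the ASYNC_SWAP_IN slot and is promoted to SYNC_SWAP_IN on the following step, so its `wait()` overlaps with the previous step's work. A stripped-down sketch of that promotion, where `FakeOp` is only a stand-in for `OptimizerSwapOp` (the real class also tracks buffers and op counts):

```python
SYNC_SWAP_IN, ASYNC_SWAP_IN = 'sync_swap_in', 'async_swap_in'

class FakeOp:
    """Stand-in for OptimizerSwapOp: remembers which parameter it belongs to."""
    def __init__(self, param_id):
        self.param_id = param_id
    def is_parameter(self, param_id):
        return param_id == self.param_id
    def wait(self):
        print(f'waiting on reads for param {self.param_id}')

def swap_in(slots, param_id, next_param_id=None):
    if slots[ASYNC_SWAP_IN] is not None:
        # The read for this parameter was already issued last step; just wait on it.
        assert slots[ASYNC_SWAP_IN].is_parameter(param_id)
        slots[SYNC_SWAP_IN], slots[ASYNC_SWAP_IN] = slots[ASYNC_SWAP_IN], None
    else:
        slots[SYNC_SWAP_IN] = FakeOp(param_id)        # cold start: issue and wait now
    slots[SYNC_SWAP_IN].wait()
    if next_param_id is not None:
        slots[ASYNC_SWAP_IN] = FakeOp(next_param_id)  # prefetch the next parameter

slots = {SYNC_SWAP_IN: None, ASYNC_SWAP_IN: None}
swap_in(slots, param_id=0, next_param_id=1)   # nothing prefetched yet: wait on param 0 directly
slots[SYNC_SWAP_IN] = None                    # swap-out consumes the sync slot
swap_in(slots, param_id=1, next_param_id=2)   # param 1 was prefetched, its wait overlaps prior work
```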
param_info=param_info, + read_op=False, + allocated_buffers=allocated_buffers, + state_buffers=swap_buffers, + num_ops=len(swap_buffers)) + + return swap_out_op + + def _swap_in_optimizer_state(self, aio_handle, parameter): + param_info = self._get_param_swap_info(parameter) + if param_info is None: + return None + + required_buffer_count = len( + param_info.tensors) + (1 if param_info.has_gradients() else 0) + aligned_numel = self._io_aligned_numel(param_info.numel()) + allocated_buffers = self.swap_buffer_manager.allocate( + num_elems=aligned_numel, + count=required_buffer_count, + dtype=parameter.dtype) + assert allocated_buffers is not None, \ + f"PipelinedOptimizerSwapper ran out of swap buffers, try increasing {OFFLOAD_OPTIMIZER_BUFFER_COUNT}" + + state_buffers = allocated_buffers[:len(param_info.tensors)] + param_info.set_swap_buffers(state_buffers) + + swap_buffers = state_buffers.copy() + swap_paths = param_info.swap_paths.copy() + + if param_info.has_gradients(): + parameter.grad = allocated_buffers[-1].narrow(0, 0, param_info.numel()) + if param_info.swapped_gradients: + swap_buffers += param_info.get_swap_gradient_buffers(parameter.grad) + swap_paths += param_info.get_swap_gradient_paths() + + swap_in_tensors(aio_handle, swap_buffers, swap_paths) + + if param_info.unswapped_gradients: + self._retrieve_unswapped_grad_partitions(swap_info=param_info, + dest_buffer=parameter.grad) + + swap_in_op = OptimizerSwapOp(aio_handle=aio_handle, + param_info=param_info, + read_op=True, + allocated_buffers=allocated_buffers, + state_buffers=state_buffers, + num_ops=len(swap_buffers)) + + return swap_in_op diff --git a/deepspeed/runtime/swap_tensor/utils.py b/deepspeed/runtime/swap_tensor/utils.py new file mode 100644 index 000000000000..a88746fdc739 --- /dev/null +++ b/deepspeed/runtime/swap_tensor/utils.py @@ -0,0 +1,241 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality of swapping tensors to/from (NVMe) storage devices. +""" + +import os +import torch +from deepspeed.utils.logging import logger + +from deepspeed.runtime.swap_tensor.constants import AIO_BLOCK_SIZE, AIO_QUEUE_DEPTH, \ + AIO_THREAD_COUNT, AIO_SINGLE_SUBMIT, AIO_OVERLAP_EVENTS + +MIN_AIO_BYTES = 1024**2 +AIO_ALIGNED_BYTES = 1024 + + +def swap_in_tensors(swap_handle, tensor_buffers, swap_paths): + for buffer, path in zip(tensor_buffers, swap_paths): + assert (swap_handle.async_pread(buffer, path) == 0) + + +def swap_out_tensors(swap_handle, tensor_buffers, swap_paths): + for buffer, path in zip(tensor_buffers, swap_paths): + assert (swap_handle.async_pwrite(buffer, path) == 0) + + +def print_object(obj, name, exclude_list=[]): + logger.info('{}:'.format(name)) + for arg in sorted(vars(obj)): + if not arg in exclude_list: + dots = '.' 
* (29 - len(arg)) + logger.info(' {} {} {}'.format(arg, dots, getattr(obj, arg))) + + +class SwapBuffer(object): + def __init__(self, buffer): + self.buffer = buffer + self.reset() + + def reset(self): + self.offset = 0 + self.swap_tensors = {} + self.compute_tensors = {} + self.swap_paths = {} + self.num_elem = 0 + + def insert_tensor(self, tensor, swap_path, aligned_numel): + swap_tensor, compute_tensor = self.allocate_tensor(swap_path, tensor.numel(), aligned_numel) + compute_tensor.data.copy_(tensor.data) + return swap_tensor, compute_tensor + + def allocate_tensor(self, swap_path, numel, aligned_numel): + assert self.has_space(aligned_numel) + assert not self.offset in self.swap_tensors + + allocate_offset = self.offset + swap_tensor = self.buffer.narrow(0, allocate_offset, aligned_numel) + dest_tensor = swap_tensor.narrow(0, 0, numel) + + self.swap_tensors[allocate_offset] = swap_tensor + self.compute_tensors[allocate_offset] = dest_tensor + self.swap_paths[allocate_offset] = swap_path + self.offset += aligned_numel + self.num_elem += numel + + return self.swap_tensors[allocate_offset], self.compute_tensors[allocate_offset] + + def has_space(self, numel): + return (self.offset + numel) <= self.buffer.numel() + + def get_swap_tensors(self): + return [tensor for tensor in self.swap_tensors.values()] + + def get_swap_paths(self): + return [path for path in self.swap_paths.values()] + + def get_compute_tensors(self): + return [tensor for tensor in self.compute_tensors.values()] + + def get_num_elem(self): + return self.num_elem + + def get_swap_tensor(self, offset): + return self.swap_tensors.get(offset, None) + + def get_compute_tensor(self, offset): + return self.compute_tensors.get(offset, None) + + def get_swap_path(self, offset): + return self.swap_paths(offset, None) + + +class SwapBufferPool(object): + def __init__(self, buffers): + assert all([buf.is_pinned() for buf in buffers]) + self.buffers = [SwapBuffer(buf) for buf in buffers] + self.current_index = 0 + + def reset(self): + self.current_index = 0 + for buffer in self.buffers: + buffer.reset() + + def allocate_tensor(self, numel, swap_path, aligned_numel): + if self.has_space(aligned_numel): + swap_tensor, compute_tensor = self._get_current_buffer().allocate_tensor(swap_path, numel, aligned_numel) + return swap_tensor, compute_tensor + + return None, None + + def insert_tensor(self, tensor, swap_path, aligned_numel): + if self.has_space(aligned_numel): + swap_tensor, compute_tensor = self._get_current_buffer().insert_tensor(tensor, swap_path, aligned_numel) + return swap_tensor, compute_tensor + + return None, None + + def get_swap_tensors(self): + swap_tensors = [] + for buffer in self._get_used_buffers(): + swap_tensors += buffer.get_swap_tensors() + + return swap_tensors + + def get_swap_paths(self): + swap_paths = [] + for buffer in self._get_used_buffers(): + swap_paths += buffer.get_swap_paths() + + return swap_paths + + def get_compute_tensors(self): + compute_tensors = [] + for buffer in self._get_used_buffers(): + compute_tensors += buffer.get_compute_tensors() + + return compute_tensors + + def has_space(self, numel): + if self._get_current_buffer().has_space(numel): + return True + + if self.current_index == len(self.buffers) - 1: + return False + + self.current_index += 1 + return self._get_current_buffer().has_space(numel) + + def swap_out(self, aio_handle, async_op=False): + swap_tensors = self.get_swap_tensors() + swap_paths = self.get_swap_paths() + assert all([p is not None for p in swap_paths]) + + 
swap_out_tensors(aio_handle, swap_tensors, swap_paths) + + if not async_op: + assert len(swap_tensors) == aio_handle.wait() + + def swap_in(self, aio_handle, async_op=False): + swap_tensors = self.get_swap_tensors() + swap_paths = self.get_swap_paths() + assert all([p is not None for p in swap_paths]) + + swap_in_tensors(aio_handle, swap_tensors, swap_paths) + + if not async_op: + assert len(swap_tensors) == aio_handle.wait() + + def _get_current_buffer(self): + return self.buffers[self.current_index] + + def _get_used_buffers(self): + return self.buffers[:self.current_index + 1] + + +class SwapBufferManager(object): + def __init__(self, num_elems, count, dtype): + self.num_elems = num_elems + self.count = count + self.dtype = dtype + self.all_buffers = [ + torch.zeros(num_elems, + device='cpu', + dtype=dtype).pin_memory() for _ in range(count) + ] + self.free_buffer_index = [i for i in range(count)] + self.used_buffer_index = {} + self.gigabytes = (self.all_buffers[0].element_size() * num_elems * count) / (1024 + **3) + + if torch.distributed.get_rank() == 0: + exclude_list = ['all_buffers'] + print_object(obj=self, name='SwapBufferManager', exclude_list=exclude_list) + + def allocate(self, num_elems, count, dtype): + assert dtype == self.dtype + assert num_elems <= self.num_elems + if count > len(self.free_buffer_index): + return None + + used_indices = self.free_buffer_index[-count:] + self.free_buffer_index = self.free_buffer_index[:-count] + + buffers = [] + for i in used_indices: + tmp_buffer = self.all_buffers[i].narrow(0, 0, num_elems) + buffers.append(tmp_buffer) + self.used_buffer_index[id(tmp_buffer)] = i + return buffers + + def allocate_all(self, num_elems, dtype): + return self.allocate(num_elems=num_elems, + count=len(self.free_buffer_index), + dtype=dtype) + + def free(self, buffers): + buffer_ids = [] + for buf in buffers: + buffer_ids.append(id(buf)) + + assert all([b_id in self.used_buffer_index for b_id in buffer_ids]) + + for b_id in buffer_ids: + self.free_buffer_index.append(self.used_buffer_index[b_id]) + del (self.used_buffer_index[b_id]) + + +def get_sized_buffer(buffer, num_elems): + assert num_elems <= buffer.numel(), \ + f'num_elems {num_elems}> buffer {buffer.numel()}' + return buffer.narrow(0, 0, num_elems) if num_elems < buffer.numel() else buffer + + +def get_sized_buffers(buffer_list, num_elems_list): + swap_buffers = [ + get_sized_buffer(buffer, num_elems) \ + for buffer, num_elems in zip(buffer_list, num_elems_list) + ] + return swap_buffers diff --git a/deepspeed/runtime/zero/__init__.py b/deepspeed/runtime/zero/__init__.py index d521573e1a77..bd881c8ec69f 100644 --- a/deepspeed/runtime/zero/__init__.py +++ b/deepspeed/runtime/zero/__init__.py @@ -1,5 +1,13 @@ +""" +"Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. 
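`SwapBufferManager` hands out fixed-size pinned buffers by index: `allocate` pops indices off a free list and returns `narrow` views sized to the request, and `free` pushes the indices back. A simplified, self-contained sketch of that index bookkeeping, without the pinning and rank-0 logging the real class adds:

```python
import torch

class MiniBufferManager:
    """Simplified sketch of SwapBufferManager's allocate/free bookkeeping."""
    def __init__(self, num_elems, count, dtype=torch.float32):
        self.num_elems = num_elems
        self.all_buffers = [torch.zeros(num_elems, dtype=dtype) for _ in range(count)]
        self.free_buffer_index = list(range(count))
        self.used_buffer_index = {}   # id(view) -> backing buffer index

    def allocate(self, num_elems, count):
        assert num_elems <= self.num_elems
        if count > len(self.free_buffer_index):
            return None               # caller must free buffers (or ask for fewer) and retry
        indices = self.free_buffer_index[-count:]
        self.free_buffer_index = self.free_buffer_index[:-count]
        views = [self.all_buffers[i].narrow(0, 0, num_elems) for i in indices]
        for view, i in zip(views, indices):
            self.used_buffer_index[id(view)] = i
        return views

    def free(self, views):
        for view in views:
            self.free_buffer_index.append(self.used_buffer_index.pop(id(view)))

mgr = MiniBufferManager(num_elems=1024, count=4)
bufs = mgr.allocate(num_elems=512, count=2)            # two 512-element views over whole buffers
mgr.free(bufs)
assert mgr.allocate(num_elems=512, count=5) is None    # more buffers than the pool holds
```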
+""" + from .partition_parameters import ZeroParamType from .partition_parameters import ZeroParamStatus from .partition_parameters import Init from .partition_parameters import GatheredParameters from .partition_parameters import register_external_parameter + +from .tiling import TiledLinear +from .tiling import TiledLinearReturnBias diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index c179d01f2988..6bae0f1fbcd0 100755 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -5,7 +5,10 @@ from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject from deepspeed.utils import logger -from deepspeed.runtime.zero.constants import * +from .constants import * +from .offload_constants import * +from .offload_config import get_offload_param_config, get_default_offload_param_config, \ + get_offload_optimizer_config, get_default_offload_optimizer_config class DeepSpeedZeroConfig(DeepSpeedConfigObject): @@ -24,9 +27,8 @@ def __init__(self, param_dict): self.elastic_checkpoint = None #Offload Specific Parameters - self.cpu_offload = None - self.cpu_offload_params = None - self.cpu_offload_use_pin_memory = None + self.offload_param = None + self.offload_optimizer = None self.sub_group_size = None #Stage3 Specific Parameters @@ -60,7 +62,24 @@ def read_zero_config_deprecated(self, param_dict): .format(ZERO_FORMAT)) return zero_config_dict + def _sanity_check(self, zero_config_dict): + deprecated_dict = { + ZERO_OPTIMIZATION_CPU_OFFLOAD: + ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER, + ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS: + ZERO_OPTIMIZATION_OFFLOAD_PARAM, + ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY: + f'{ZERO_OPTIMIZATION_OFFLOAD_PARAM} or {ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER}' + } + + for old_key, new_key in deprecated_dict.items(): + if old_key in zero_config_dict: + logger.warning( + f'DeepSpeedConfig: {old_key} is deprecated. 
Please use {new_key}.') + def _initialize(self, zero_config_dict): + self._sanity_check(zero_config_dict) + self.stage = get_scalar_param(zero_config_dict, ZERO_OPTIMIZATION_STAGE, ZERO_OPTIMIZATION_STAGE_DEFAULT) @@ -103,24 +122,30 @@ def _initialize(self, zero_config_dict): ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS, ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT) - self.cpu_offload = get_scalar_param(zero_config_dict, - ZERO_OPTIMIZATION_CPU_OFFLOAD, - ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT) - self.elastic_checkpoint = get_scalar_param( zero_config_dict, ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT, ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT_DEFAULT) - self.cpu_offload_params = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS, - ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS_DEFAULT) - - self.cpu_offload_use_pin_memory = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY, - ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY_DEFAULT) + if ZERO_OPTIMIZATION_CPU_OFFLOAD in zero_config_dict: + cpu_offload_optimizer = get_scalar_param( + zero_config_dict, + ZERO_OPTIMIZATION_CPU_OFFLOAD, + ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT) + if cpu_offload_optimizer: + self.offload_optimizer = get_default_offload_optimizer_config() + else: + self.offload_optimizer = get_offload_optimizer_config(zero_config_dict) + + if ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS in zero_config_dict: + cpu_offload_params = get_scalar_param( + zero_config_dict, + ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS, + ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS_DEFAULT) + if cpu_offload_params: + self.offload_param = get_default_offload_param_config() + else: + self.offload_param = get_offload_param_config(zero_config_dict) self.sub_group_size = get_scalar_param(zero_config_dict, ZERO_OPTIMIZATION_SUB_GROUP_SIZE, diff --git a/deepspeed/runtime/zero/constants.py b/deepspeed/runtime/zero/constants.py index e5812980a337..40b450649850 100755 --- a/deepspeed/runtime/zero/constants.py +++ b/deepspeed/runtime/zero/constants.py @@ -3,6 +3,8 @@ Licensed under the MIT license. 
""" +from .offload_constants import * + ######################################### # ZeRO optimization ######################################### @@ -22,10 +24,12 @@ "overlap_comm": [true|false], "reduce_bucket_size": 500000000, "load_from_fp32_weights": [true|false], - "cpu_offload": [true|false], - "cpu_offload_params" : [true|false], - "cpu_offload_use_pin_memory": [true|false], - "sub_group_size" : 1000000000000 + "cpu_offload": [true|false] (deprecated), + "cpu_offload_params" : [true|false] (deprecated), + "cpu_offload_use_pin_memory": [true|false] (deprecated), + "sub_group_size" : 1000000000000, + "offload_param": {...}, + "offload_optimizer": {...} } } ''' @@ -67,18 +71,24 @@ ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS = 'load_from_fp32_weights' ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT = True -ZERO_OPTIMIZATION_CPU_OFFLOAD = 'cpu_offload' -ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT = False - ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT = 'elastic_checkpoint' ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT_DEFAULT = True +ZERO_OPTIMIZATION_CPU_OFFLOAD = 'cpu_offload' +ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT = False + ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS = 'cpu_offload_params' ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS_DEFAULT = False ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY = 'cpu_offload_use_pin_memory' ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY_DEFAULT = False +ZERO_OPTIMIZATION_OFFLOAD_PARAM = OFFLOAD_PARAM +ZERO_OPTIMIZATION_OFFLOAD_PARAM_DEFAULT = None + +ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER = OFFLOAD_OPTIMIZER +ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER_DEFAULT = None + ZERO_OPTIMIZATION_SUB_GROUP_SIZE = 'sub_group_size' ZERO_OPTIMIZATION_SUB_GROUP_SIZE_DEFAULT = 1000000000000 @@ -118,16 +128,12 @@ ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT, ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS: ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT, - ZERO_OPTIMIZATION_CPU_OFFLOAD: - ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT, ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT: ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT_DEFAULT, - ZERO_OPTIMIZATION_CPU_OFFLOAD: - ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT, - ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS: - ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS_DEFAULT, - ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY: - ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY, + ZERO_OPTIMIZATION_OFFLOAD_PARAM: + ZERO_OPTIMIZATION_OFFLOAD_PARAM_DEFAULT, + ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER: + ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER_DEFAULT, ZERO_OPTIMIZATION_SUB_GROUP_SIZE: ZERO_OPTIMIZATION_SUB_GROUP_SIZE_DEFAULT, ZERO_OPTIMIZATION_MAX_LIVE_PARAMETERS: diff --git a/deepspeed/runtime/zero/linear.py b/deepspeed/runtime/zero/linear.py index 23f97d5a542a..fb65673bd9b4 100644 --- a/deepspeed/runtime/zero/linear.py +++ b/deepspeed/runtime/zero/linear.py @@ -21,6 +21,11 @@ tensor_map = {} +def print_rank_0(message, debug=False, force=False): + if torch.distributed.get_rank() == 0 and (debug or force): + print(message) + + class LinearFunctionForZeroStage3(torch.autograd.Function): # Note that both forward and backward are @staticmethods @@ -46,6 +51,7 @@ def forward(ctx, input, weight, bias=None): if bias is not None: output += bias ret = output + return ret # This function has only a single output, so it gets only one gradient diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py new file mode 100644 index 000000000000..eaf3f13e2819 --- /dev/null +++ b/deepspeed/runtime/zero/offload_config.py @@ -0,0 +1,63 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. 
+''' + +from deepspeed.runtime.config_utils import get_scalar_param +from .offload_constants import * + +OFFLOAD_PARAM_KEY_DEFAULT_DICT = { + OFFLOAD_PARAM_DEVICE: OFFLOAD_PARAM_DEVICE_DEFAULT, + OFFLOAD_PARAM_NVME_PATH: OFFLOAD_PARAM_NVME_PATH_DEFAULT, + OFFLOAD_PARAM_BUFFER_COUNT: OFFLOAD_PARAM_BUFFER_COUNT_DEFAULT, + OFFLOAD_PARAM_BUFFER_SIZE: OFFLOAD_PARAM_BUFFER_SIZE_DEFAULT, + OFFLOAD_PARAM_MAX_IN_CPU: OFFLOAD_PARAM_MAX_IN_CPU_DEFAULT, + OFFLOAD_PARAM_PIN_MEMORY: OFFLOAD_PARAM_PIN_MEMORY_DEFAULT +} + +OFFLOAD_OPTIMIZER_KEY_DEFAULT_DICT = { + OFFLOAD_OPTIMIZER_DEVICE: OFFLOAD_OPTIMIZER_DEVICE_DEFAULT, + OFFLOAD_OPTIMIZER_NVME_PATH: OFFLOAD_OPTIMIZER_NVME_PATH_DEFAULT, + OFFLOAD_OPTIMIZER_BUFFER_COUNT: OFFLOAD_OPTIMIZER_BUFFER_COUNT_DEFAULT, + OFFLOAD_OPTIMIZER_PIN_MEMORY: OFFLOAD_OPTIMIZER_PIN_MEMORY_DEFAULT, + OFFLOAD_OPTIMIZER_PIPELINE_READ: OFFLOAD_OPTIMIZER_PIPELINE_READ_DEFAULT, + OFFLOAD_OPTIMIZER_PIPELINE_WRITE: OFFLOAD_OPTIMIZER_PIPELINE_WRITE_DEFAULT, + OFFLOAD_OPTIMIZER_FAST_INIT: OFFLOAD_OPTIMIZER_FAST_INIT_DEFAULT +} + + +def _get_offload_config(param_dict, key_default_dict): + offload_config = {} + for key, default_value in key_default_dict.items(): + offload_config[key] = get_scalar_param(param_dict, key, default_value) + + return offload_config + + +def get_offload_param_config(param_dict): + if OFFLOAD_PARAM in param_dict and param_dict[OFFLOAD_PARAM] is not None: + return _get_offload_config(param_dict=param_dict[OFFLOAD_PARAM], + key_default_dict=OFFLOAD_PARAM_KEY_DEFAULT_DICT) + + return None + + +def get_default_offload_param_config(): + return OFFLOAD_PARAM_KEY_DEFAULT_DICT + + +def get_offload_optimizer_config(param_dict): + if OFFLOAD_OPTIMIZER in param_dict and param_dict[OFFLOAD_OPTIMIZER] is not None: + offload_config = _get_offload_config( + param_dict=param_dict[OFFLOAD_OPTIMIZER], + key_default_dict=OFFLOAD_OPTIMIZER_KEY_DEFAULT_DICT) + offload_config[OFFLOAD_OPTIMIZER_PIPELINE] = offload_config[ + OFFLOAD_OPTIMIZER_PIPELINE_READ] or offload_config[ + OFFLOAD_OPTIMIZER_PIPELINE_WRITE] + return offload_config + + return None + + +def get_default_offload_optimizer_config(): + return OFFLOAD_OPTIMIZER_KEY_DEFAULT_DICT diff --git a/deepspeed/runtime/zero/offload_constants.py b/deepspeed/runtime/zero/offload_constants.py new file mode 100644 index 000000000000..3ba71df4a807 --- /dev/null +++ b/deepspeed/runtime/zero/offload_constants.py @@ -0,0 +1,67 @@ +""" +"Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. 
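Each offload section is resolved by overlaying the user's keys on the defaults above, and the optimizer section additionally derives a `pipeline` flag from the two directional flags. A self-contained sketch of that resolution; `resolve_offload_optimizer` is an illustrative stand-in for `get_offload_optimizer_config` using the same key names and defaults:

```python
OFFLOAD_OPTIMIZER_DEFAULTS = {
    'device': 'cpu',
    'nvme_path': None,
    'buffer_count': 4,
    'pin_memory': False,
    'pipeline_read': False,
    'pipeline_write': False,
    'fast_init': False,
}

def resolve_offload_optimizer(user_section):
    # Keys the user omits fall back to the defaults above.
    config = {key: user_section.get(key, default)
              for key, default in OFFLOAD_OPTIMIZER_DEFAULTS.items()}
    # Either pipelined direction selects the PipelinedOptimizerSwapper later on.
    config['pipeline'] = config['pipeline_read'] or config['pipeline_write']
    return config

resolved = resolve_offload_optimizer({'device': 'nvme',
                                      'nvme_path': '/local_nvme',
                                      'pipeline_read': True})
print(resolved['pipeline'], resolved['buffer_count'])   # True 4
```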
+""" +######################################### +# TENSOR OFFLOADING +######################################### +OFFLOAD_CPU_DEVICE = "cpu" +OFFLOAD_NVME_DEVICE = "nvme" + +######################################### +# PARAM TENSOR OFFLOADING +######################################### +OFFLOAD_PARAM_FORMAT = ''' +"offload_param": { + "device": [cpu|nvme], + "nvme_path": "/local_nvme", + "buffer_count": 5, + "buffer_size": 1e8, + "max_in_cpu": 1e9, + "pin_memory": [true|false] +} +''' +OFFLOAD_PARAM = "offload_param" +OFFLOAD_PARAM_DEVICE = "device" +OFFLOAD_PARAM_DEVICE_DEFAULT = OFFLOAD_CPU_DEVICE +OFFLOAD_PARAM_NVME_PATH = "nvme_path" +OFFLOAD_PARAM_NVME_PATH_DEFAULT = None +OFFLOAD_PARAM_BUFFER_COUNT = "buffer_count" +OFFLOAD_PARAM_BUFFER_COUNT_DEFAULT = 5 +OFFLOAD_PARAM_BUFFER_SIZE = "buffer_size" +OFFLOAD_PARAM_BUFFER_SIZE_DEFAULT = 1e8 +OFFLOAD_PARAM_MAX_IN_CPU = "max_in_cpu" +OFFLOAD_PARAM_MAX_IN_CPU_DEFAULT = 1e9 +OFFLOAD_PARAM_PIN_MEMORY = "pin_memory" +OFFLOAD_PARAM_PIN_MEMORY_DEFAULT = False + +######################################### +# OPTIMIZER TENSOR OFFLOADING +######################################### +OFFLOAD_OPTIMIZER_FORMAT = ''' +"offload_optimizer": { + "device": [cpu|nvme], + "nvme_path": "/local_nvme", + "buffer_count": 4, + "pin_memory": [true|false], + "pipeline_read": false, + "pipeline_write": false, + "fast_init": false +} +''' +OFFLOAD_OPTIMIZER = "offload_optimizer" +OFFLOAD_OPTIMIZER_DEVICE = "device" +OFFLOAD_OPTIMIZER_DEVICE_DEFAULT = OFFLOAD_CPU_DEVICE +OFFLOAD_OPTIMIZER_NVME_PATH = "nvme_path" +OFFLOAD_OPTIMIZER_NVME_PATH_DEFAULT = None +OFFLOAD_OPTIMIZER_BUFFER_COUNT = "buffer_count" +OFFLOAD_OPTIMIZER_BUFFER_COUNT_DEFAULT = 4 +OFFLOAD_OPTIMIZER_PIN_MEMORY = "pin_memory" +OFFLOAD_OPTIMIZER_PIN_MEMORY_DEFAULT = False +OFFLOAD_OPTIMIZER_PIPELINE_READ = "pipeline_read" +OFFLOAD_OPTIMIZER_PIPELINE_READ_DEFAULT = False +OFFLOAD_OPTIMIZER_PIPELINE_WRITE = "pipeline_write" +OFFLOAD_OPTIMIZER_PIPELINE_WRITE_DEFAULT = False +OFFLOAD_OPTIMIZER_PIPELINE = "pipeline" +OFFLOAD_OPTIMIZER_FAST_INIT = "fast_init" +OFFLOAD_OPTIMIZER_FAST_INIT_DEFAULT = False diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 42cdcd645f4d..c8bde6390b3c 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -1,3 +1,8 @@ +""" +"Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. 
+""" + import os import time import types @@ -8,10 +13,15 @@ import torch from torch.distributed.distributed_c10d import _get_global_rank -from deepspeed.runtime.zero.linear import LinearModuleForZeroStage3, LinearFunctionForZeroStage3 -from deepspeed.runtime.utils import see_memory_usage +from .linear import LinearModuleForZeroStage3, LinearFunctionForZeroStage3 +from .offload_constants import * + +from ..utils import see_memory_usage from deepspeed.utils import log_dist, init_distributed +from ..swap_tensor.partitioned_param_swapper import AsyncPartitionedParameterSwapper, PartitionedParamStatus +from ..config import DeepSpeedConfig + param_count = 0 @@ -21,6 +31,8 @@ def print_rank_0(message, debug=False, force=False): def is_zero_param(parameter): + if not torch.is_tensor(parameter): + return False return hasattr(parameter, 'ds_id') @@ -29,8 +41,6 @@ def _init_external_params(module): module._external_params = {} def external_parameters(self): - if not hasattr(self, '_external_params'): - self._external_params = {} return self._external_params.items() def all_parameters(self): @@ -94,6 +104,28 @@ def forward(self, input): module._external_params[key] = parameter +def unregister_external_parameter(module, parameter): + """Reverses the effects of :meth:`register_external_parameter`. + + Args: + module (``torch.nn.Module``): The module to affect. + parameter (``torch.nn.Parameter``): The parameter to unregister. + + Raises: + RuntimeError: If ``parameter`` is not of type ``torch.nn.Parameter``. + RuntimeError: If ``parameter`` is not a registered external parameter of ``module``. + """ + if not isinstance(parameter, torch.nn.Parameter): + raise RuntimeError('Parameter is not a torch.nn.Parameter') + + if not hasattr(module, + '_external_params') or id(parameter) not in module._external_params: + raise RuntimeError('Parameter is not a registered external parameter of module.') + + key = id(parameter) + del module._external_params[key] + + class ZeroParamType(Enum): # same as regular pytorch parameters @@ -216,6 +248,8 @@ def _disable_class(cls): #un doing it here will undo it during training #if self.mem_efficient_linear: # torch.nn.functional.linear = self.linear_bk + # if self.mem_efficient_linear: + # torch.nn.functional.linear = self.linear_bk # Now that we cleaned up the metaclass injection, raise the exception. if exc_type is not None: @@ -236,6 +270,7 @@ def __init__(self, mem_efficient_linear=True, remote_device=None, pin_memory=False, + deepspeed_config=None, enabled=True): """A context to enable massive model construction for training with ZeRO-3. Models are automatically partitioned (or, sharded) across the @@ -249,14 +284,15 @@ def __init__(self, mem_efficient_linear (bool, optional): Replace torch.nn.functional.linear with an implementation that allows DeepSpeed to partition parameters. Defaults to ``True``. - remote_device (string, optional): The device to store model - weights. Passing ``"cpu"`` will create the model in CPU - memory. The model may still be moved to GPU if - ``cpu_offload_param`` is ``False`` in the config provided to - :meth:`deepspeed.initialize`. Defaults to the local GPU. + remote_device (string, optional): The initial device to store model + weights e.g., ``cpu``, ``nvme``. Passing ``"cpu"`` will create the model in CPU + memory. The model may still be moved to GPU based on the + offload settings for training. Defaults to the local GPU. pin_memory (bool, optional): Potentially increase performance by using pinned memory for model weights. 
``remote_device`` must be ``"cpu"``. Defaults to ``False``. + deepspeed_config (``json file``, optional): If provided, provides configuration + for swapping fp16 params to NVMe. enabled (bool, optional): If ``False``, this context has no effect. Defaults to ``True``. @@ -264,15 +300,15 @@ def __init__(self, are too large to allocate in their entirety in CPU memory. It has the following effects: - #. allocates tensors to either GPU or CPU memory + #. allocates tensors to either GPU or CPU memory or NVMe #. converts floating point tensors to half precision #. immediately partitions tensors among the group of data-parallel devices #. (*optional*) replaces ``torch.nn.functional.linear`` with a more memory-efficient implementation These modifications allow for models that exceed the size of local CPU/GPU - memory, but fit within the total system memory (*i.e.*, aggregate CPU - or GPU memory) across all nodes. Consider initializing a model with one + memory/NVMe, but fit within the total NVMe capacity (*i.e.*, aggregate CPU + or GPU memory or NVMe) across all nodes. Consider initializing a model with one trillion parameters, whose weights occupy two terabytes (TB) in half precision. The initial CPU allocation in full precision requires 4TB of memory *per process*, and so a system with 8 GPUs per node would need 32TB of @@ -302,7 +338,6 @@ def get_model(): .. note:: Only applicable to training with ZeRO-3. - Examples -------- @@ -347,10 +382,20 @@ def get_model(): #It is the device where parameters are fully instantiated using allgather self.local_device = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) + self._validate_remote_device(remote_device, deepspeed_config) + #Remote device is the device where parameter partiitons are stored - #It can be same as local_device or it could be CPU. + #It can be same as local_device or it could be CPU or NVMe. self.remote_device = self.local_device if remote_device is None else remote_device - self.pin_memory = pin_memory if (self.remote_device == 'cpu') else False + self.pin_memory = pin_memory if ( + self.remote_device == OFFLOAD_CPU_DEVICE) else False + + # Enable fp16 param swapping to NVMe + if self.remote_device == OFFLOAD_NVME_DEVICE: + _ds_config = DeepSpeedConfig(deepspeed_config) + self.param_swapper = AsyncPartitionedParameterSwapper(_ds_config) + else: + self.param_swapper = None # If we are provided an already-allocated module to prepare. if module is not None: @@ -361,6 +406,23 @@ def get_model(): self._convert_to_deepspeed_param(param) param.partition() + def _validate_remote_device(self, remote_device, ds_config): + if ds_config is not None: + _ds_config = DeepSpeedConfig(ds_config) + if remote_device in [None, OFFLOAD_CPU_DEVICE]: + if _ds_config.zero_config.offload_param is not None: + offload_param_device = _ds_config.zero_config.offload_param[ + OFFLOAD_PARAM_DEVICE] + assert offload_param_device != OFFLOAD_NVME_DEVICE, \ + f"{OFFLOAD_PARAM_DEVICE} in DeepSpeed Config cannot be {offload_param_device} if remote device is {remote_device}." + + if remote_device == OFFLOAD_NVME_DEVICE: + assert _ds_config.zero_config.offload_param is not None, \ + f'{OFFLOAD_PARAM} must be defined in DeepSpeed Config if remote device is {OFFLOAD_NVME_DEVICE}.' 
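With those checks satisfied, building a model whose fp16 partitions go straight to NVMe only requires pointing `Init` at a DeepSpeed config whose `offload_param` section selects the nvme device. A usage sketch, intended to be run under the DeepSpeed launcher; the model and `ds_config.json` are placeholders:

```python
import torch
import deepspeed

# ds_config.json must define zero_optimization.offload_param with "device": "nvme"
# and a valid "nvme_path", as enforced by _validate_remote_device.
with deepspeed.zero.Init(remote_device='nvme', deepspeed_config='ds_config.json'):
    model = torch.nn.Sequential(
        torch.nn.Linear(8192, 8192),
        torch.nn.ReLU(),
        torch.nn.Linear(8192, 8192),
    )
# Each parameter is partitioned as soon as it is constructed; partitions large enough
# for AIO are written to NVMe and their swap buffers released immediately.
```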
+ + assert _ds_config.zero_config.offload_param[OFFLOAD_PARAM_NVME_PATH] is not None, \ + f'{OFFLOAD_PARAM_NVME_PATH} in DeepSpeed Config cannot be None if remote device is {OFFLOAD_NVME_DEVICE}' + def _post_init_method(self, module): #see_memory_usage(f"Before converting parmas in {module.__class__.__name__}", force=False) print_rank_0(f'Converting Params in {module.__class__.__name__}', force=False) @@ -408,6 +470,10 @@ def _convert_to_deepspeed_param(self, param): # The group that the parameter is scattered across. param.ds_process_group = self.ds_process_group + # This is set to the Async Param swapper if remote device is nvme + # else this is set to None + param.nvme_swapper = self.param_swapper + # DeepSped Param ID param.ds_id = Init.param_id Init.param_id += 1 @@ -458,6 +524,9 @@ def aligned_size(): def padding_size(): return self._padding_size(param) + def partitioned_size(): + return self._partitioned_size(param) + # Collectives for gathering and partitioning parameters param.all_gather = all_gather param.partition = partition @@ -469,6 +538,7 @@ def padding_size(): # Partitioning size utilities param.aligned_size = aligned_size param.padding_size = padding_size + param.partitioned_size = partitioned_size def _aligned_size(self, param): return param.ds_numel + self._padding_size(param) @@ -477,7 +547,29 @@ def _padding_size(self, param): remainder = param.ds_numel % self.world_size return (self.world_size - remainder) if remainder else 0 + def _partitioned_size(self, param): + return param.ds_tensor.ds_numel + + def _ensure_availability_of_partitioned_params(self, params): + swap_in_list = [] + swap_in_flight = [] + for param in params: + if param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE: + assert param.ds_tensor.final_location == OFFLOAD_NVME_DEVICE and param.ds_status == ZeroParamStatus.NOT_AVAILABLE + swap_in_list.append(param) + if param.ds_tensor.status == PartitionedParamStatus.INFLIGHT: + assert param.ds_tensor.final_location == OFFLOAD_NVME_DEVICE and param.ds_status == ZeroParamStatus.NOT_AVAILABLE + swap_in_flight.append(param) + if len(swap_in_list) > 0: + swap_in_list[0].nvme_swapper.swap_in(swap_in_list, async_op=False) + elif len(swap_in_flight) > 0: + swap_in_flight[0].nvme_swapper.synchronize_reads() + def _all_gather(self, param_list, async_op=False, hierarchy=None): + + #fetches from nvme if the partition is not available and in nvme + self._ensure_availability_of_partitioned_params(param_list) + handles = [] all_gather_list = [] for param in param_list: @@ -511,8 +603,9 @@ def _partition(self, param_list, force=False, has_been_updated=False): #print_rank_0(f"After Partitioning Param {param.ds_id}") # self._param_status(param) - def _partition_param(self, param, has_been_updated=False): + def _partition_param(self, param, buffer=None, has_been_updated=False): assert param.ds_status is not ZeroParamStatus.INFLIGHT, f" {param} Cannot parititon a param in flight" + global reuse_buffers #print_rank_0(f"Param id {param.ds_id} status is {param.ds_status}") if param.ds_status is ZeroParamStatus.AVAILABLE: @@ -534,22 +627,54 @@ def _partition_param(self, param, has_been_updated=False): #param.data = param.ds_tensor.data + see_memory_usage( + f'Before partitioning param {param.ds_id} {param.shape}', + force=False) #param.data does not store anything meaningful in partitioned state param.data = torch.ones(1).half().to(param.device) + see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', + force=False) + + if 
param.ds_tensor.final_location == OFFLOAD_NVME_DEVICE: + print_rank_0( + f"Param {param.ds_id} partition released since it exists in nvme", + force=False) + param.nvme_swapper.remove_partition_and_release_buffers([param]) + return tensor_size = self._aligned_size(param) partition_size = tensor_size // self.world_size if param.ds_tensor is None: - partitioned_tensor = torch.zeros(partition_size, - dtype=param.dtype, - device=self.remote_device) - partitioned_tensor.requires_grad = False - if self.pin_memory: - partitioned_tensor = partitioned_tensor.pin_memory() + final_location = None + if self.remote_device == OFFLOAD_NVME_DEVICE and self.param_swapper.swappable_tensor( + numel=partition_size): + final_location = OFFLOAD_NVME_DEVICE + buffer = self.param_swapper.get_buffer(param, partition_size) + partitioned_tensor = torch.zeros(1, + dtype=param.dtype, + device=buffer.device) + partitioned_tensor.data = buffer.data + print_rank_0( + f"ID {param.ds_id} Initializing partition for the first time for nvme offload." + ) + else: + partitioned_tensor = torch.zeros( + partition_size, + dtype=param.dtype, + device=OFFLOAD_CPU_DEVICE + if self.remote_device == OFFLOAD_NVME_DEVICE else + self.remote_device) + if self.pin_memory: + partitioned_tensor = partitioned_tensor.pin_memory() + + partitioned_tensor.requires_grad = False param.ds_tensor = partitioned_tensor + param.ds_tensor.ds_numel = partition_size + param.ds_tensor.status = PartitionedParamStatus.AVAILABLE + param.ds_tensor.final_location = final_location start = partition_size * self.rank end = start + partition_size @@ -584,7 +709,20 @@ def _partition_param(self, param, has_been_updated=False): #param.data = param.ds_tensor.data #param.data does not store anything meaningful in partitioned state + + see_memory_usage(f'Before partitioning param {param.ds_id} {param.shape}', + force=False) param.data = torch.ones(1).half().to(param.device) + see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', + force=False) + + if param.ds_tensor.final_location == OFFLOAD_NVME_DEVICE: + self.param_swapper.swap_out_and_release([param]) + print_rank_0( + f"ID {param.ds_id} Offloaded to nvme offload and buffers released.") + see_memory_usage( + f"ID {param.ds_id} Offloaded to nvme offload and buffers released.", + force=False) print_rank_0( f"ID {param.ds_id} partitioned type {param.dtype} dev {param.device} shape {param.shape}" @@ -602,7 +740,7 @@ def _param_status(self, param): def _allgather_param(self, param, async_op=False, hierarchy=0): - partition_size = param.ds_tensor.numel() + partition_size = param.ds_tensor.ds_numel tensor_size = partition_size * self.world_size aligned_param_size = self._aligned_size(param) @@ -611,9 +749,16 @@ def _allgather_param(self, param, async_op=False, hierarchy=0): print_rank_0( f"{'--'* hierarchy}---- Before allocating Allgather param with id {param.ds_id} and status {param.ds_status} Partition Size {partition_size} and data shape {param.ds_shape}" ) + + see_memory_usage( + f'Before allocate allgather param {param.ds_id} {param.ds_status} {aligned_param_size} {partition_size} {param.ds_shape}', + force=False) flat_tensor = torch.zeros(aligned_param_size, dtype=param.dtype, device=param.device).view(-1) + see_memory_usage( + f'After allocate allgather param {param.ds_id} {param.ds_status} {aligned_param_size} {partition_size} {param.ds_shape}', + force=False) torch.cuda.synchronize() @@ -646,7 +791,7 @@ def _allgather_params(self, param_list, hierarchy=0): if len(param_list) == 0: return - 
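For reference, the partition size everything above operates on comes from padding the flattened parameter to a multiple of the world size and splitting it evenly across ranks; a quick worked example mirroring `_aligned_size`/`_padding_size`:

```python
def partition_numel(ds_numel, world_size):
    # Pad so every rank gets an equal, contiguous slice of the flattened parameter.
    remainder = ds_numel % world_size
    padding = (world_size - remainder) if remainder else 0
    aligned = ds_numel + padding
    return aligned // world_size

print(partition_numel(ds_numel=10_000_000, world_size=16))  # 625000, no padding needed
print(partition_numel(ds_numel=10_000_001, world_size=16))  # 625001, padded by 15 elements
```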
partition_size = sum([param.ds_tensor.numel() for param in param_list]) + partition_size = sum([param.ds_tensor.ds_numel for param in param_list]) tensor_size = partition_size * self.world_size flat_tensor = torch.empty(tensor_size, @@ -662,7 +807,7 @@ def _allgather_params(self, param_list, hierarchy=0): if i == self.rank: offset = 0 for param in param_list: - param_numel = param.ds_tensor.numel() + param_numel = param.ds_tensor.ds_numel partitions[i].narrow(0, offset, @@ -677,9 +822,7 @@ def _allgather_params(self, param_list, hierarchy=0): param_offset = 0 for param in param_list: - - param_partition_size = param.ds_tensor.numel() - + param_partition_size = param.ds_tensor.ds_numel param_size = param.ds_numel replicated_tensor = torch.empty(param.ds_shape, dtype=param.dtype, @@ -700,7 +843,7 @@ def _allgather_params(self, param_list, hierarchy=0): param_start, numel_to_copy).copy_(part_to_copy) #param_offset += param.data.numel() - param_offset += param.ds_tensor.numel() + param_offset += param.ds_tensor.ds_numel param.data = replicated_tensor.data @@ -724,7 +867,7 @@ def _reduce_scatter_gradients(self, param_list): # some ranks may have partitions that are padded to go beyond the grad size. # For these ranks the output of reduce scatter is a separate buffer and needs # to be copied in - partition_size = param.ds_tensor.numel() + partition_size = param.ds_tensor.ds_numel start = self.rank * partition_size end = start + partition_size #print_rank_0("REduce scatter was executed for praam {param.ds_id}") @@ -739,7 +882,7 @@ def _reduce_scatter_gradients(self, param_list): def _reduce_scatter_gradient(self, param): - partition_size = param.ds_tensor.numel() + partition_size = param.ds_tensor.ds_numel #output = torch.empty(partition_size, dtype=param.dtype, device=param.device) total_size = partition_size * self.world_size @@ -791,10 +934,10 @@ def _partition_gradient(self, param, partition_buffer=None, accumulate=False): # param.grad=None # param.grad.test() print_rank_0( - f"Partitioning param {id(param)} gradient of size {param.grad.numel()} type {param.grad.dtype} part_size {param.ds_tensor.numel()}" + f"Partitioning param {param.ds_id} gradient of size {param.grad.numel()} type {param.grad.dtype} part_size {param.ds_tensor.ds_numel}" ) see_memory_usage("Before partitioning gradients", force=False) - partition_size = param.ds_tensor.numel() + partition_size = param.ds_tensor.ds_numel if partition_buffer is None: assert not accumulate, "No buffer to accumulate to" @@ -884,6 +1027,10 @@ def __init__(self, params, modifier_rank=None, fwd_module=None, enabled=True): if torch.distributed.get_rank() == 0: linear.weight.zero_() + with deepspeed.zero.GatheredParameters(linear.weight, + modifier_rank=0): + if torch.distributed.get_rank() == 0: + linear.weight.zero_() #. Collect a partitioned weight to pass to another module during training. The parameter will be registered as an external parameter diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index c7eb4b5cfc7b..f8b526952de8 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -1,10 +1,12 @@ -from deepspeed.utils.logging import logger -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +""" +"Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. 
+""" +import sys import os - +from collections import defaultdict, OrderedDict +import itertools import torch from torch.distributed.distributed_c10d import _get_global_rank import torch.distributed as dist @@ -12,18 +14,25 @@ from torch._six import inf from torch.autograd import Variable +from deepspeed.utils.logging import logger from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler from deepspeed.runtime.utils import see_memory_usage, is_model_parallel_parameter -from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus, ZeroParamType, _init_external_params, Init, is_zero_param +from deepspeed.runtime.zero.partition_parameters import * +from deepspeed.runtime.zero.partition_parameters import _init_external_params from deepspeed.runtime.zero.constants import ZERO_OPTIMIZATION_WEIGHTS from deepspeed.ops.adam import DeepSpeedCPUAdam from deepspeed.ops.op_builder import UtilsBuilder +from deepspeed.runtime.zero.offload_constants import * +from deepspeed.runtime.swap_tensor.partitioned_param_swapper import PartitionedParamStatus +from deepspeed.runtime.swap_tensor.partitioned_optimizer_swapper import PartitionedOptimizerSwapper +from deepspeed.runtime.swap_tensor.pipelined_optimizer_swapper import PipelinedOptimizerSwapper -import itertools # Toggle this to true to enable correctness test # with gradient partitioning and without pg_correctness_test = False +FWD_MODULE_STACK = list() + def print_rank_0(message, debug=False, force=False): if torch.distributed.get_rank() == 0 and (debug or force): @@ -108,6 +117,47 @@ def _apply_forward_and_backward_to_tensors_only(module, return outputs +class ZeROOrderedDict(OrderedDict): + def __init__(self, parent_module, *args, **kwargs): + """A replacement for ``collections.OrderedDict`` to detect external ZeRO params. 
+ + Args: + parent_module (``collections.OrderedDict``): the collection to replace + """ + + super().__init__(*args, **kwargs) + self._parent_module = parent_module + self._in_forward = False + + def __getitem__(self, key): + param = super().__getitem__(key) + + # Params can be registered as None (e.g., bias) + if param is None: + return param + + if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: + if self._parent_module._parameters._in_forward: + print_rank_0(f'Registering external parameter from getter {key}', + force=False) + register_external_parameter(FWD_MODULE_STACK[-1], param) + param.all_gather() + + return param + + +def _inject_parameters(module, cls): + for module in module.modules(): + if cls == ZeROOrderedDict: + new_param = cls(parent_module=module) + else: + new_param = cls() + + for key, param in module._parameters.items(): + new_param[key] = param + module._parameters = new_param + + # TODO Needs to be implemented class PrefetchCoordinator(object): def __init__(self): @@ -217,7 +267,6 @@ def get_reuse_distance_in_numel(self, sub_module, sub_module_step_id=None): start_step, end_step, trace) - break self.reuse_numel_for_step_id[sub_module_step_id] = reuse_distance_in_numel @@ -271,9 +320,26 @@ def finish_tracing(self, print_trace=False): if print_trace: self.prefetch_coordinator.print_trace() + #swap in parameter partitions from nvme for those parameters that will be used + # after the ones that are already being prefetched into full parameters + def _prefetch_nvme_param_partitions(self, sub_module, params_in_flight): + numel_in_flight = sum([param.ds_tensor.ds_numel for param in params_in_flight]) + upcoming_param_list = self.prefetch_coordinator.get_params_to_prefetch( + sub_module, + numel=2 * numel_in_flight) + swap_in_params = [] + for param in upcoming_param_list: + if len(swap_in_params) >= param.nvme_swapper.available_swap_in_buffers(): + break + if param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE: + swap_in_params.append(param) + + if len(swap_in_params) > 0: + swap_in_params[0].nvme_swapper.swap_in(swap_in_params, async_op=True) + # Pre fetches the parameters for sub_modules that comes after # the current sub_module. 
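The NVMe prefetch heuristic above is deliberately simple: look ahead in the execution trace by roughly twice the number of elements already being gathered, and issue an async swap-in for any of those partitions still resident on NVMe, capped by the free swap-in buffers. A toy rendering of that selection; the two-times window and the status check come from the code above, while the parameter list and sizes are made up:

```python
NOT_AVAILABLE, INFLIGHT, AVAILABLE = 'not_available', 'inflight', 'available'

def select_nvme_prefetch(upcoming, numel_in_flight, free_buffers):
    """upcoming: list of (param_id, numel, status) in execution order."""
    window = 2 * numel_in_flight          # look ahead twice the in-flight volume
    selected, seen = [], 0
    for param_id, numel, status in upcoming:
        if seen >= window or len(selected) >= free_buffers:
            break
        seen += numel
        if status == NOT_AVAILABLE:       # partition still sitting on NVMe
            selected.append(param_id)
    return selected

upcoming = [(7, 1_000_000, AVAILABLE),
            (8, 2_000_000, NOT_AVAILABLE),
            (9, 4_000_000, NOT_AVAILABLE),
            (10, 4_000_000, NOT_AVAILABLE)]
print(select_nvme_prefetch(upcoming, numel_in_flight=3_000_000, free_buffers=2))
# -> [8, 9]: inside the 6M-element window and within the two free swap-in buffers
```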
This call is asynchronous - def prefetch_next_sub_modules(self, sub_module, numel=5000000): + def prefetch_next_sub_modules(self, sub_module, numel=5000000, nvme=False): params_to_prefetch = [] if not self.prefetch_coordinator.trace_completed: @@ -292,6 +358,9 @@ def prefetch_next_sub_modules(self, sub_module, numel=5000000): # keeping track of number of elements consumed by available parmaeters self._increment_available_parameter_numel(param.ds_numel) + if nvme: + self._prefetch_nvme_param_partitions(sub_module, params_to_prefetch) + self._print_prefetch_elements_info(sub_module, params_to_prefetch) print_rank_0( f"{'--' * self.hierarchy}--PreFetching parameters {[param.ds_id for param in params_to_prefetch]} and available {self.total_available_parameter_numel}, max limit {self.max_available_parameters_in_numel}", @@ -371,7 +440,9 @@ def fetch_sub_module(self, sub_module): for _, param in sub_module.named_parameters(recurse=False): param.ds_status = ZeroParamStatus.AVAILABLE - #print(f"Param id {param.ds_id}, Shape {param.shape}, device {param.device} ") + print_rank_0( + f"Param id {param.ds_id}, Shape {param.shape}, device {param.device} norm {param.norm()}", + force=False) #print_rank_0(f"After fetching (id, shape, device): {[(param.ds_id, param.shape, param.device) for param in sub_module.named_parameters(recurse=False)]}") def release_sub_module(self, sub_module): @@ -383,6 +454,7 @@ def release_sub_module(self, sub_module): param for _, param in sub_module.named_parameters(recurse=False) ] + if hasattr(sub_module, 'ds_external_parameters'): #print_rank_0(f"Releasing external parameters {sub_module.ds_external_parameters()}") params_to_release += [ @@ -396,25 +468,25 @@ def release_sub_module(self, sub_module): if not param.ds_active_sub_modules and not self._keep_for_later( sub_module) and not param.ds_persist: print_rank_0( - f"{'--' * self.hierarchy}--Releasing parameters {param.ds_id} with numel {param.numel()} active sub modules {param.ds_active_sub_modules} and keep for later {self._keep_for_later(sub_module)}" - ) + f"{'--' * self.hierarchy}--Releasing parameters {param.ds_id} with numel {param.numel()} active sub modules {param.ds_active_sub_modules} and keep for later {self._keep_for_later(sub_module)}", + force=False) # Keeping track of number of elements that are consumed by available parameters self._decrement_available_parameter_numel(param.ds_numel) see_memory_usage( - f"Before releasing param {param.ds_id} with numel{param.numel()}", + f"Before releasing param {param.ds_id} with numel {param.numel()}", force=False) param.partition(hierarchy=self.hierarchy) see_memory_usage( - f"After releasing param {param.ds_id} has numel{param.numel()} ", + f"After releasing param {param.ds_id} has numel {param.numel()} ", force=False) param.ds_status = ZeroParamStatus.NOT_AVAILABLE else: print_rank_0( - f"{'--' * self.hierarchy}--Did not release parameters {param.ds_id} with numel {param.numel()} with active sub modules {param.ds_active_sub_modules}, keep for later {self._keep_for_later(sub_module)} and persistence {param.ds_persist}" - ) + f"{'--' * self.hierarchy}--Did not release parameters {param.ds_id} with numel {param.numel()} with active sub modules {param.ds_active_sub_modules}, keep for later {self._keep_for_later(sub_module)} and persistence {param.ds_persist}", + force=False) def release_and_reset_parameter(self, param): param.ds_active_sub_modules = 0 @@ -428,6 +500,8 @@ def release_and_reset_parameter(self, param): def _keep_for_later(self, sub_module): if not 
self.prefetch_coordinator.trace_completed: return False + if self.max_reuse_distance_in_numel == 0: + return False reuse_distance_in_numel = self.prefetch_coordinator.get_reuse_distance_in_numel( sub_module) #print_rank_0(f"Reuse distance and numel for sub_module id {sub_module.id} is {reuse_distance_in_numel}") @@ -532,9 +606,8 @@ def __init__(self, dp_process_group=None, reduce_scatter=True, overlap_comm=False, - cpu_offload_optimizer_state=False, - cpu_offload_params=False, - cpu_offload_use_pin_memory=False, + offload_optimizer_config=None, + offload_param_config=None, sub_group_size=1000000000000, mpu=None, clip_grad=0.0, @@ -542,7 +615,8 @@ def __init__(self, postscale_gradients=True, gradient_predivide_factor=1.0, gradient_accumulation_steps=1, - elastic_checkpoint=False): + elastic_checkpoint=False, + aio_config=None): see_memory_usage("Stage 3 initialize beginning", force=True) @@ -580,21 +654,51 @@ def __init__(self, self.elastic_checkpoint = elastic_checkpoint self.overlap_comm = overlap_comm + # Replace ._parameters with a new class to enable auto-registration of + # external parameters + _inject_parameters(module, ZeROOrderedDict) + if self.overlap_comm: self.gpu_sum = torch.zeros(1, dtype=torch.float).cuda() - ######################cpu offload setup################################## - self.cpu_offload = cpu_offload_optimizer_state - self.cpu_offload_use_pin_memory = cpu_offload_use_pin_memory - - if cpu_offload_params: - assert cpu_offload_optimizer_state, "parameter offload is only available with optimizer state offload" - self.cpu_offload_params = cpu_offload_optimizer_state and cpu_offload_params + ###################### offload optimizer setup ################################## + self.optimizer_swapper = None + self.swap_optimizer = False + + self.offload_optimizer = False + self.offload_optimizer_pin_memory = False + self.offload_optimizer_fast_init = False + if offload_optimizer_config is not None: + self.offload_optimizer = True + self.offload_optimizer_pin_memory = offload_optimizer_config[ + OFFLOAD_OPTIMIZER_PIN_MEMORY] + self.swap_optimizer = offload_optimizer_config[ + OFFLOAD_OPTIMIZER_DEVICE] == OFFLOAD_NVME_DEVICE + self.offload_optimizer_fast_init = offload_optimizer_config[ + OFFLOAD_OPTIMIZER_FAST_INIT] + + ###################### offload param setup ################################## + self.offload_param = False + self.offload_param_pin_memory = False + self.params_in_nvme_and_cpu = False + self.max_params_in_cpu = 0 + if offload_param_config is not None: + assert self.offload_optimizer, "parameter offload is only available with optimizer state offload" + self.offload_param = True + self.offload_param_pin_memory = offload_param_config[ + OFFLOAD_PARAM_PIN_MEMORY] + self.params_in_nvme_and_cpu = offload_param_config[ + OFFLOAD_PARAM_DEVICE] == OFFLOAD_NVME_DEVICE + self.max_params_in_cpu = offload_param_config[OFFLOAD_PARAM_MAX_IN_CPU] + print_rank_0( + f"FP16 params swapping is {self.params_in_nvme_and_cpu}, Max params in CPU is {self.max_params_in_cpu}", + force=True) - self.deepspeed_adam_offload = (self.cpu_offload + self.deepspeed_adam_offload = (self.offload_optimizer and type(init_optimizer) == DeepSpeedCPUAdam) - self.device = torch.cuda.current_device() if not self.cpu_offload else 'cpu' + self.device = torch.cuda.current_device( + ) if not self.offload_optimizer else OFFLOAD_CPU_DEVICE ############################################################################ see_memory_usage("Before Partitioned Parameter Coordinator", force=False) @@ -661,10 
+765,18 @@ def __init__(self, # Holds a fused and flattened copy of the parameters self.fp16_partitioned_groups_flat = [] + self.fp16_partitioned_groups_flat_numel = [] + + #defragmented pinned memory + self.param_groups_fp16_flat_cpu_memory = [] + + #fp16 buffer for swapping out nvme params + self.param_group_fp16_flat_reuse_buffer = None #a single 32-bit partition of the parallel partitioned parameters #that this process will update self.fp32_partitioned_groups_flat = [] + self.next_swappable_fp32_partitioned_groups = [] # number of elements per partition in each group self.partition_size = [] @@ -680,20 +792,28 @@ def __init__(self, self.sub_group_to_group_id = {} - see_memory_usage("Before creating fp16 partitions", force=False) - #self._create_fp16_partitions() + see_memory_usage("Before creating fp16 partitions", force=True) self._create_fp16_partitions_with_defragmentation() num_fp16_subgroups = len(self.fp16_partitioned_groups_flat) see_memory_usage(f"After creating fp16 partitions: {num_fp16_subgroups}", force=False) + # Optimizer ensor swapping + if self.swap_optimizer: + self._configure_tensor_swapping(offload_optimizer_config, aio_config) + see_memory_usage("Before creating fp32 partitions", force=False) self._create_fp32_partitions() see_memory_usage("After creating fp32 partitions", force=False) + dist.barrier() + + # To support pipelined optimizer swapping + self._create_next_swappable_fp32_groups() see_memory_usage("Before initializing optimizer states", force=False) self.initialize_optimizer_states() see_memory_usage("After initializing optimizer states", force=False) + dist.barrier() if dist.get_rank() == 0: logger.info(f"optimizer state initialized") @@ -718,6 +838,7 @@ def __init__(self, self.params_in_ipg_bucket = [] self.elements_in_ipg_bucket = 0 self.params_already_reduced = [] + self.is_gradient_accumulation_boundary = True self._release_ipg_buffers() self.previous_reduced_grads = None @@ -734,7 +855,10 @@ def __init__(self, count = count + 1 #Largest partitioned param - largest_partitioned_param_numel = self._get_largest_partitioned_numel() + largest_partitioned_param_numel = max(self.fp16_partitioned_groups_flat_numel) + print_rank_0( + f'Largest partitioned param numel = {largest_partitioned_param_numel}', + force=True) see_memory_usage(f"Before Set Grad positions", force=False) @@ -744,7 +868,7 @@ def __init__(self, self.grads_in_partition = None - if self.cpu_offload: + if self.offload_optimizer: self.accumulated_grads_in_cpu = {} self.norm_for_param_grads = {} self.local_overflow = False @@ -789,14 +913,26 @@ def __init__(self, if dist.get_rank(group=self.dp_process_group) == 0: see_memory_usage(f"After initializing ZeRO optimizer", force=True) - def _get_largest_partitioned_numel(self): - largest_partitioned_param_numel = 0 - for partitioned_params_group in self.fp16_partitioned_groups: - for partitioned_param in partitioned_params_group: - if partitioned_param.numel() > largest_partitioned_param_numel: - largest_partitioned_param_numel = partitioned_param.numel() + def _configure_tensor_swapping(self, offload_optimizer_config, aio_config): + nvme_swap_folder = os.path.join( + offload_optimizer_config[OFFLOAD_OPTIMIZER_NVME_PATH], + 'zero_stage_3') + os.makedirs(nvme_swap_folder, exist_ok=True) + if torch.distributed.get_rank() == 0: + logger.info(f'Tensor Swapping: Adding optimizer tensors') + + swapper_type = PipelinedOptimizerSwapper if offload_optimizer_config[ + OFFLOAD_OPTIMIZER_PIPELINE] else PartitionedOptimizerSwapper - return 
largest_partitioned_param_numel + self.optimizer_swapper = swapper_type( + swap_config=offload_optimizer_config, + aio_config=aio_config, + base_folder=nvme_swap_folder, + optimizer=self.optimizer, + largest_numel=max(self.fp16_partitioned_groups_flat_numel), + device=self.device, + dtype=torch.float32, + timers=self.timers) def _create_fp16_partitions(self): dist.barrier() @@ -832,7 +968,7 @@ def _create_fp16_partitions(self): #removing cloning here see_memory_usage(f"Before Flattening param group {i}", force=False) - if not self.cpu_offload_params: + if not self.offload_param: see_memory_usage(f"Before moving param group {i} to CPU", force=False) #move all the parameters to cpu to free up GPU space for creating flat buffer @@ -868,31 +1004,86 @@ def _create_fp16_partitions(self): for partitioned_param, q in zip(self.fp16_partitioned_groups[i], updated_params): partitioned_param.data = q.data - def _move_to_flat_buffer(self, src_list, flat_buffer): + def _move_to_flat_buffer(self, param_list, flat_buffer, avoid_copy=False): + '''If flat buffer is None then the parameters in the param_list are + not copied to the flat buffer. This is because they excede the number of max_params_in_cpu + Some of these parameters may aready be in CPU in unflattened buffers + or they maybe in GPU, or they maybe in NVME. If they are in NVME, then + they will be marked as NOT_AVAILABLE, and will be moved to CPU when they are + needed during training.''' + if flat_buffer is None: + # this dst buffer is on NVMe, so skip this + return + start = 0 - for src in src_list: - dest = flat_buffer.narrow(0, start, src.numel()) - start = start + src.numel() - dest.data.copy_(src.data) - src.data = dest.data + for param in param_list: + src = param.ds_tensor + dest = flat_buffer.narrow(0, start, src.ds_numel) + start = start + src.ds_numel + '''if the parameter was initialized in nvme then bring it to the destination buffer directly''' + if src.status == PartitionedParamStatus.NOT_AVAILABLE: + print_rank_0( + f"Swapping in {param.ds_id} with partition size {param.ds_tensor.ds_numel} permanently to CPU" + ) + param.nvme_swapper.swap_in([param], + swap_in_buffers=[dest], + async_op=False) + else: + assert src.status == PartitionedParamStatus.AVAILABLE, "Partitioned Parm must be avialable here" + if not avoid_copy: + dest.data.copy_(src.data) + src.data = dest.data - def _create_fp16_partitions_with_defragmentation(self): - dist.barrier() - partition_id = dist.get_rank(group=self.dp_process_group) + # Final location must be gpu/cpu in this case + param.ds_tensor.final_location = 'not-nvme' + + def _create_param_groups_fp16_flat_cpu_memory(self): - if self.cpu_offload_params: - self.param_groups_fp16_flat_cpu_memory = [] - for j, param_group in enumerate(self.optimizer.param_groups): - total_params = sum([p.ds_tensor.numel() for p in param_group['params']]) + aggregate_params_count = 0 + + for j, param_group in enumerate(self.optimizer.param_groups): + params_in_group = sum([p.ds_tensor.ds_numel for p in param_group['params']]) + + flat_buffer_size = params_in_group + + if self.params_in_nvme_and_cpu and \ + aggregate_params_count + params_in_group > self.max_params_in_cpu: + + flat_buffer_size = max(0, + self.max_params_in_cpu - aggregate_params_count) + + aggregate_params_count += params_in_group + + if flat_buffer_size > 0: + print_rank_0(f"group {j} flat buffer size {flat_buffer_size}", + force=False) self.param_groups_fp16_flat_cpu_memory.append( - torch.empty(total_params, + torch.empty(int(flat_buffer_size), 
dtype=torch.half, pin_memory=True)) + else: + print_rank_0( + f"No flat buffer size. Param group size was {params_in_group}", + force=False) + + self.param_groups_fp16_flat_cpu_memory.append( + torch.empty(1, + dtype=torch.half)) + + def _create_fp16_partitions_with_defragmentation(self): + dist.barrier() + partition_id = dist.get_rank(group=self.dp_process_group) + + #create a flat CPU memory allocation for each param group + if self.offload_param: + self._create_param_groups_fp16_flat_cpu_memory() # loop to deal with groups for j, param_group in enumerate(self.optimizer.param_groups): sub_groups = self._create_fp16_sub_groups(param_group['params']) + print_rank_0(f'fp16 group {j} has {len(sub_groups)} subgroups', force=True) + flat_offset = 0 for sub_group in sub_groups: i = len(self.fp16_groups) @@ -905,6 +1096,10 @@ def _create_fp16_partitions_with_defragmentation(self): self.fp16_partitioned_groups.append( [param.ds_tensor for param in self.fp16_groups[i]]) + total_elements = sum( + [t.ds_numel for t in self.fp16_partitioned_groups[i]]) + self.fp16_partitioned_groups_flat_numel.append(total_elements) + print_rank_0( f"fp16 group {i} partitioned_param norms : {[param.ds_tensor.norm().item() for param in self.fp16_groups[i]]}" ) @@ -918,14 +1113,16 @@ def _create_fp16_partitions_with_defragmentation(self): #not sure why apex was cloning the weights before flattening #removing cloning here - see_memory_usage(f"Before Flattening param group {i}", force=False) + see_memory_usage(f"Before Flattening param subgroup {i}", force=False) - if not self.cpu_offload_params: - see_memory_usage(f"Before moving param group {i} to CPU", + #all partitioned parameters remain in GPU during training + if not self.offload_param: + see_memory_usage(f"Before moving param subgroup group {i} to CPU", force=False) #move all the parameters to cpu to free up GPU space for creating flat buffer move_to_cpu(self.fp16_partitioned_groups[i]) - see_memory_usage(f"After moving param group {i} to CPU", force=False) + see_memory_usage(f"After moving param subgroup {i} to CPU", + force=False) #create flat buffer in CPU and move to GPU self.fp16_partitioned_groups_flat.append( @@ -933,45 +1130,208 @@ def _create_fp16_partitions_with_defragmentation(self): self.fp16_partitioned_groups[i], 1).cuda(torch.cuda.current_device())) see_memory_usage( - f"After flattening and moving param group {i} to GPU", + f"After flattening and moving param subgroup {i} to GPU", force=False) + + #all partitioned parameters are in CPU during training else: - total_elements = sum( - [t.numel() for t in self.fp16_partitioned_groups[i]]) - fp16_partitioned_group_flat = self.param_groups_fp16_flat_cpu_memory[ - j].narrow(0, - flat_offset, - total_elements) + print_rank_0(f"Params in nvme and cpu {self.params_in_nvme_and_cpu}") + #Flat buffer may not be available for parameters that reside in NVME + if not self.params_in_nvme_and_cpu or flat_offset + total_elements <= self.param_groups_fp16_flat_cpu_memory[ + j].numel(): + fp16_partitioned_group_flat = self.param_groups_fp16_flat_cpu_memory[ + j].narrow(0, + flat_offset, + total_elements) + print_rank_0( + f"Creating a flat buffer for subgroup {i} requiring {total_elements} elements, and cumulative CPU elemets {flat_offset + total_elements}", + force=False) + #these parameters reside in NVME and + elif self.params_in_nvme_and_cpu: + fp16_partitioned_group_flat = None + print_rank_0( + f"No flat buffer for sub group {i} of {total_elements} elements", + force=False) + else: + assert False, "Either params 
are in nvme, or they are in CPU memory. This code path should not be triggered. Please see you max_params_in_cpu and params_in_nvme configs" + self.fp16_partitioned_groups_flat.append(fp16_partitioned_group_flat) flat_offset += total_elements # move param to flat buffer for both param offload on/off - self._move_to_flat_buffer(self.fp16_partitioned_groups[i], - self.fp16_partitioned_groups_flat[i]) + self._move_to_flat_buffer(self.fp16_groups[i], + self.fp16_partitioned_groups_flat[i], + avoid_copy=not self.offload_param) see_memory_usage(f"After Flattening param group {i}", force=False) + #create a pinned memory to be used for swapping out params to NVME after optimizer step + if self.fp16_partitioned_groups_flat[ + -1] is None and self.param_group_fp16_flat_reuse_buffer is None: + self.param_group_fp16_flat_reuse_buffer = torch.empty( + max(self.fp16_partitioned_groups_flat_numel), + dtype=torch.half, + device='cpu', + pin_memory=True) + + see_memory_usage(f"After Flattening param subgroup {i}", force=False) + + def _swap_in_sub_group_to_flat_buffer(self, flat_buffer, sub_group_id): + offset = 0 + elements_in_sub_group = sum( + [t.ds_numel for t in self.fp16_partitioned_groups[sub_group_id]]) + assert (flat_buffer.numel() == elements_in_sub_group) + for param, partitioned_param in zip(self.fp16_groups[sub_group_id], self.fp16_partitioned_groups[sub_group_id]): + dest = flat_buffer.narrow(0, offset, partitioned_param.ds_numel) + if partitioned_param.status == PartitionedParamStatus.NOT_AVAILABLE: + print_rank_0( + f"Swapping in {param.ds_id} with elements {param.ds_numel} and partition {param.ds_tensor.ds_numel}" + ) + param.nvme_swapper.swap_in([param], async_op=False) + dest.data.copy_(partitioned_param.data) + param.nvme_swapper.remove_partition_and_release_buffers([param]) + print_rank_0(f"Swapping in {param.ds_id} done") + else: + dest.data.copy_(partitioned_param.data) + offset += partitioned_param.ds_numel + + def _create_next_swappable_fp32_groups(self): + reverse_order_indices = [ + i for i in range(len(self.fp32_partitioned_groups_flat)) + ] + reverse_order_indices.reverse() + + next_group = None + for i in reverse_order_indices: + self.next_swappable_fp32_partitioned_groups.append(next_group) + if self._swappable_optimizer_subgroup(i): + next_group = self.fp32_partitioned_groups_flat[i] + + self.next_swappable_fp32_partitioned_groups.reverse() + + def _get_sub_group_partitions(self, sub_group_id): + sub_group_partitions = [] + for param, partitioned_param in zip(self.fp16_groups[sub_group_id], self.fp16_partitioned_groups[sub_group_id]): + if partitioned_param.status == PartitionedParamStatus.NOT_AVAILABLE: + swap_path = param.nvme_swapper.get_path(param, True) + sub_group_partitions.append((partitioned_param, + param.ds_tensor.ds_numel, + swap_path)) + else: + sub_group_partitions.append((partitioned_param, + partitioned_param.ds_numel, + None)) + + return sub_group_partitions + def _create_fp32_partitions(self): + cpu_memory_usage = 0 + cpu_memory_sub_groups = 0 + nvme_memory_usage = 0 + num_swappable_partitions = 0 + num_swap_from_nvme_partitions = 0 + num_swap_from_cpu_partitions = 0 + swap_from_nvme_memory_usage = 0 + swap_from_cpu_memory_usage = 0 + GIGA_BYTES = (1024**3) + + swappable_fp32_tensors = [] + swappable_fp16_src_tensors = [] + nvme_fp16_partitions_info = [] + nvme_fp16_num_elems = [] + nvme_fp32_dest_tensors = [] + fp32_element_size = torch.tensor([], dtype=torch.float32).element_size() + for i, tensor in enumerate(self.fp16_partitioned_groups_flat): - # a 
partition of the fp32 master weights that will be updated by this process + num_elements = self.fp16_partitioned_groups_flat_numel[i] - self.fp32_partitioned_groups_flat.append( - self.fp16_partitioned_groups_flat[i].to( - self.device).clone().float().detach()) - element_size = self.fp32_partitioned_groups_flat[i].element_size() - num_elements = self.fp32_partitioned_groups_flat[i].numel() + # a partition of the fp32 master weights that will be updated by this process + if self._swappable_optimizer_subgroup(i): + self.fp32_partitioned_groups_flat.append(torch.Tensor()) + nvme_memory_usage += (fp32_element_size * num_elements) + num_swappable_partitions += 1 + + if self.params_in_nvme_and_cpu and tensor is None: + num_swap_from_nvme_partitions += 1 + swap_from_nvme_memory_usage += (fp32_element_size * num_elements) + if self.offload_optimizer_fast_init: + sub_group_partitions = self._get_sub_group_partitions(i) + nvme_fp16_partitions_info.append(sub_group_partitions) + nvme_fp16_num_elems.append(num_elements) + nvme_fp32_dest_tensors.append( + self.fp32_partitioned_groups_flat[i]) + else: + unpinned_fp32_buffer = torch.empty(num_elements, + device=self.device, + dtype=torch.float) + self._swap_in_sub_group_to_flat_buffer(unpinned_fp32_buffer, i) + self.optimizer_swapper.initialize_parameters( + parameters=[self.fp32_partitioned_groups_flat[i]], + src_tensors=[unpinned_fp32_buffer]) + else: + num_swap_from_cpu_partitions += 1 + swap_from_cpu_memory_usage += (fp32_element_size * num_elements) + swappable_fp32_tensors.append(self.fp32_partitioned_groups_flat[i]) + swappable_fp16_src_tensors.append( + self.fp16_partitioned_groups_flat[i]) + else: + cpu_memory_usage += (fp32_element_size * num_elements) + cpu_memory_sub_groups += 1 + + if self.params_in_nvme_and_cpu and tensor is None: + unpinned_fp32_buffer = torch.empty(num_elements, + device=self.device, + dtype=torch.float) + self._swap_in_sub_group_to_flat_buffer(unpinned_fp32_buffer, i) + self.fp32_partitioned_groups_flat.append(unpinned_fp32_buffer) + else: + self.fp32_partitioned_groups_flat.append( + self.fp16_partitioned_groups_flat[i].to( + self.device).clone().float().detach()) self.fp32_partitioned_groups_flat[ i].requires_grad = True # keep this in case internal optimizer uses it + if len(swappable_fp32_tensors) > 0: + self.optimizer_swapper.initialize_parameters( + parameters=swappable_fp32_tensors, + src_tensors=swappable_fp16_src_tensors) + + if len(nvme_fp32_dest_tensors) > 0: + fp16_pinned_buffers = self.fp16_groups[0][ + 0].nvme_swapper.reserve_available_buffers() + assert len(fp16_pinned_buffers) > 0 + self.optimizer_swapper.initialize_from_swapped_fp16_params( + fp16_partitions_info=nvme_fp16_partitions_info, + fp16_num_elems=nvme_fp16_num_elems, + fp16_pinned_buffers=fp16_pinned_buffers, + fp32_parameters=nvme_fp32_dest_tensors) + self.fp16_groups[0][0].nvme_swapper.release_reserved_buffers() + + nvme_gigabytes = nvme_memory_usage / GIGA_BYTES + print_rank_0( + f'Swappable FP32 Partitions: count={num_swappable_partitions} size={nvme_gigabytes:5.2f} GB', + force=True) + if self.params_in_nvme_and_cpu: + print_rank_0( + f'Swap from NVMe Partitions: count = {num_swap_from_nvme_partitions}, size = {swap_from_nvme_memory_usage/GIGA_BYTES:5.2f}GB', + force=True) + print_rank_0( + f'Swap from CPU Partitions: count = {num_swap_from_cpu_partitions}, size = {swap_from_cpu_memory_usage/GIGA_BYTES:5.2f}GB', + force=True) + + cpu_memory_gigabytes = cpu_memory_usage / GIGA_BYTES + print_rank_0( + f'In-Memory FP32 Partitions: 
count={cpu_memory_sub_groups} size={cpu_memory_gigabytes:5.2f} GB', + force=True) + # Clear for on-the-fly population before the optimizer step for param_group in self.optimizer.param_groups: param_group['params'] = [] def _create_fp16_sub_groups(self, params_group): - params_group_numel = sum([param.ds_tensor.numel() for param in params_group]) - + params_group_numel = sum([param.partitioned_size() for param in params_group]) sub_group_size = self.sub_group_size if sub_group_size is None or sub_group_size >= params_group_numel: @@ -983,7 +1343,7 @@ def _create_fp16_sub_groups(self, params_group): for param in params_group: sub_group.append(param) - local_sub_group_size += param.ds_tensor.numel() + local_sub_group_size += param.partitioned_size() if local_sub_group_size >= sub_group_size or id(param) == id( params_group[-1]): @@ -1019,6 +1379,10 @@ def _end_of_forward_hook(module, *args): self.module.register_forward_hook(_end_of_forward_hook) self.module.register_forward_pre_hook(_pre_forward_hook) + # Add top todule to stack trace + global FWD_MODULE_STACK + FWD_MODULE_STACK.append(self.module) + def persistent_parameters(self): persistent_params = [] total_persistent_parameters = 0 @@ -1046,7 +1410,41 @@ def _register_hooks_recursively(self, module, count=[0]): def _pre_forward_module_hook(module, *args): self.pre_sub_module_forward_function(module) - def _post_forward_module_hook(module, *args): + def _post_forward_module_hook(module, input, output): + global FWD_MODULE_STACK + FWD_MODULE_STACK.pop() + + if not isinstance(output, (list, tuple)): + if torch.is_tensor(output): + output = [output] + else: + print(f'got UNKNOWN type {type(output)}') + outputs = [] + for name, val in vars(output).items(): + if not name.startswith('__') and torch.is_tensor(val): + outputs.append(val) + output = outputs + print(f'convert output to {output}') + + for item in filter(lambda item: is_zero_param(item), output): + if not any(id(item) in m._external_params for m in FWD_MODULE_STACK): + item.ds_active_sub_modules += 1 + module_to_register = FWD_MODULE_STACK[-1] + print_rank_0( + f'Registering dangling parameter for module {module_to_register.__class__.__name__}.', + force=False) + register_external_parameter(module_to_register, item) + + # It's possible that the parameter was already external to the completed module. If so, remove it the + # registration as it will be covered by the outer module instead. 
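To make concrete the case this hook is handling, here is a hypothetical module pair (the names `TiedEmbedding` and `Decoder` are invented for this example) in which a parameter owned by one submodule appears in that submodule's output and is consumed by a sibling. With this change the post-forward hook detects such a parameter in the output and registers it as an external parameter on the enclosing module automatically, instead of requiring a manual `register_external_parameter` call.

```python
import torch


class TiedEmbedding(torch.nn.Module):
    def __init__(self, vocab_size, hidden_dim):
        super().__init__()
        self.emb = torch.nn.Embedding(vocab_size, hidden_dim)

    def forward(self, tokens):
        # Returning the weight itself hands a ZeRO-partitioned parameter to
        # modules that do not own it ("dangling" from ZeRO-3's point of view).
        return self.emb(tokens), self.emb.weight


class Decoder(torch.nn.Module):
    def forward(self, hidden, tied_weight):
        # Consumes a parameter owned by TiedEmbedding; it must still be fully
        # gathered, not re-partitioned, when this forward runs under ZeRO-3.
        return hidden @ tied_weight.t()
```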
+ if id(item) in module._external_params: + print_rank_0( + f' Unregistering nested dangling parameter from module {module.__class__.__name__}', + force=False) + unregister_external_parameter(module, item) + + item.all_gather() + self.post_sub_module_forward_function(module) def _pre_backward_module_hook(module, inputs, output): @@ -1110,6 +1508,9 @@ def pre_sub_module_forward_function(self, sub_module): see_memory_usage(f"Before sub module function {sub_module.__class__.__name__}", force=False) + global FWD_MODULE_STACK + FWD_MODULE_STACK.append(sub_module) + self.param_coordinator.record_trace(sub_module) self.param_coordinator.fetch_sub_module(sub_module) @@ -1117,8 +1518,10 @@ def pre_sub_module_forward_function(self, sub_module): f"Before sub module function {sub_module.__class__.__name__} after fetch", force=False) - self.param_coordinator.prefetch_next_sub_modules(sub_module, - numel=self.prefetch_elements) + self.param_coordinator.prefetch_next_sub_modules( + sub_module, + numel=self.prefetch_elements, + nvme=self.params_in_nvme_and_cpu) see_memory_usage( f"Before sub module function {sub_module.__class__.__name__} after prefetch", force=False) @@ -1127,11 +1530,13 @@ def pre_sub_module_forward_function(self, sub_module): def post_sub_module_forward_function(self, sub_module): see_memory_usage( - f"After sub module function {sub_module.__class__.__name__} before release", + f"After sub module function {sub_module.__class__.__name__} {sub_module.id} before release", force=False) + self.param_coordinator.release_sub_module(sub_module) + see_memory_usage( - f"After sub module function {sub_module.__class__.__name__} after release", + f"After sub module function {sub_module.__class__.__name__} {sub_module.id} after release", force=False) def pre_sub_module_backward_function(self, sub_module): @@ -1146,17 +1551,17 @@ def pre_sub_module_backward_function(self, sub_module): def post_sub_module_backward_function(self, sub_module): see_memory_usage( - f"After sub module backward function {sub_module.__class__.__name__} before release", + f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} before release", force=False) self.param_coordinator.release_sub_module(sub_module) see_memory_usage( - f"After sub module backward function {sub_module.__class__.__name__} after release", + f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} after release", force=False) def _release_ipg_buffers(self): if self.contiguous_gradients: self.ipg_buffer = None - if not self.cpu_offload: + if not self.offload_optimizer and self.is_gradient_accumulation_boundary: self.grads_in_partition = None self.grads_in_partition_offset = 0 @@ -1166,35 +1571,93 @@ def _optimizer_step(self, sub_group_id): fp32_param = self.fp32_partitioned_groups_flat[sub_group_id] fp16_param = self.fp16_partitioned_groups_flat[sub_group_id] self.optimizer.param_groups[param_group_id]['params'] = [fp32_param] + self.optimizer.step() self.optimizer.param_groups[param_group_id]['params'] = [] - fp16_param.data.copy_(fp32_param.data) + + if fp16_param is not None: + fp16_param.data.copy_(fp32_param.data) + else: + #synchronize incase there is a previous write going on the reuse buffer + self.fp16_groups[sub_group_id][0].nvme_swapper.synchronize_writes() + self.param_group_fp16_flat_reuse_buffer.narrow( + 0, + 0, + fp32_param.numel()).data.copy_(fp32_param.data) + + def _swappable_optimizer_subgroup(self, sub_group_id): + if not self.swap_optimizer: + return False + + return 
self.optimizer_swapper.swappable_tensor( + None, + numel=self.fp16_partitioned_groups_flat_numel[sub_group_id]) + + def _partitioned_params_swap_out(self, i): + swap_out_params = [] + offset = 0 + for param, partitioned_param in zip(self.fp16_groups[i], self.fp16_partitioned_groups[i]): + src = self.param_group_fp16_flat_reuse_buffer.narrow( + 0, + offset, + partitioned_param.ds_numel) + if partitioned_param.status == PartitionedParamStatus.AVAILABLE: + partitioned_param.data.copy_(src.data) + else: + partitioned_param.data = src.data + #Setting it to available just for good practice. It will be released at the end of the call + #by swap out and release + partitioned_param.status = PartitionedParamStatus.AVAILABLE + swap_out_params.append(param) + offset += partitioned_param.ds_numel + + if len(swap_out_params) > 0: + #The write synchronize will happen before the buffer is reused in _optimizer_step so the buffer can be released + swap_out_params[0].nvme_swapper.swap_out_and_release( + swap_out_params, + async_op=True, + force_buffer_release=True) def initialize_optimizer_states(self): num_subgroups = len(self.fp16_groups) - largest_numel = max([t.numel() for t in self.fp16_partitioned_groups_flat]) + largest_numel = max( + [sum([p.ds_numel for p in psg]) for psg in self.fp16_partitioned_groups]) gradient_dtype = self.fp32_partitioned_groups_flat[0].dtype gradient_buffer = torch.zeros(int(largest_numel), dtype=gradient_dtype, device=self.device) + timers = self.timers + timer_names = set() + + if self.swap_optimizer: + self.optimizer_swapper.init_timers() + + INIT_OPTIMIZER_TIMER = 'init_optimizer_state' + timer_names.add(INIT_OPTIMIZER_TIMER) + self.start_timers([INIT_OPTIMIZER_TIMER]) + for i, group in enumerate(self.fp16_groups): + swappable_optimizer_subgroup = self._swappable_optimizer_subgroup(i) + swappable_param_subgroup = self.fp16_partitioned_groups_flat[i] is None + + num_elements = int(self.fp16_partitioned_groups_flat_numel[i]) + see_memory_usage( - f'[Begin] Initialize optimizer states {i} / {num_subgroups} subgroups', + f'[Begin] Initialize optimizer states {i} / {num_subgroups} subgroups, num_elems: {num_elements}, swappable opt/param:{swappable_optimizer_subgroup}/{swappable_param_subgroup}', force=False) - num_elements = int(self.fp16_partitioned_groups_flat[i].numel()) - if self.cpu_offload and not self.cpu_offload_use_pin_memory: - self.fp32_partitioned_groups_flat[i].grad = torch.zeros( - num_elements, - dtype=gradient_dtype, - device=self.device) - elif self.cpu_offload_use_pin_memory: - self.fp32_partitioned_groups_flat[i].grad = torch.zeros( - num_elements, - dtype=gradient_dtype, - device=self.device).pin_memory() + if swappable_optimizer_subgroup: + self._optimizer_states_and_gradient_swap_in(i, timer_names) + + if self.offload_optimizer and not swappable_optimizer_subgroup: + subgroup_gradient_buffer = torch.zeros(num_elements, + dtype=gradient_dtype, + device=self.device) + if self.offload_optimizer_pin_memory: + subgroup_gradient_buffer = subgroup_gradient_buffer.pin_memory() + self.fp32_partitioned_groups_flat[i].grad = subgroup_gradient_buffer else: self.fp32_partitioned_groups_flat[i].grad = gradient_buffer.narrow( 0, @@ -1203,14 +1666,27 @@ def initialize_optimizer_states(self): self._optimizer_step(i) + if swappable_optimizer_subgroup: + self._optimizer_states_and_gradient_swap_out(i, timer_names) + + if swappable_param_subgroup: + self._partitioned_params_swap_out(i) + see_memory_usage( - f'[End] Initialize optimizer states {i} / {num_subgroups} 
subgroups', + f'[End] Initialize optimizer states {i} / {num_subgroups} subgroups, num_elems: {num_elements}, swappable opt/param:{swappable_optimizer_subgroup}/{swappable_param_subgroup}', force=False) - if not self.cpu_offload: + self.stop_timers([INIT_OPTIMIZER_TIMER]) + self.log_timers(timer_names) + + if self.swap_optimizer: + self.optimizer_swapper.log_timers() + + if not self.offload_optimizer: for group in self.fp32_partitioned_groups_flat: group.grad = None + # Reset steps return ######################################################################### @@ -1269,7 +1745,7 @@ def independent_gradient_partition_epilogue(self): #in case of cpu offload, averaged gradients are already in fp32_partitioned_groups_flat.grad #TODO: use a similar code path for both cpu_offload and non-cpu offload - if not self.cpu_offload: + if not self.offload_optimizer: for i, sub_group in enumerate(self.fp16_groups): self.averaged_gradients[i] = [ torch.zeros_like(param.ds_tensor) if param.grad is None else @@ -1491,7 +1967,7 @@ def set_grad_positions(self): current_offset = 0 for param in group: param_id = self.get_param_id(param) - num_elements = param.ds_tensor.numel() + num_elements = param.ds_tensor.ds_numel self.grad_position[param_id] = [ int(i), @@ -1507,7 +1983,7 @@ def async_accumulate_grad_in_cpu_via_gpu(self, param, acc_grad_cpu_partition): dest_buffer = self.temp_grad_buffer_for_gpu_offload.view(-1).narrow( 0, 0, - param.ds_tensor.numel()) + param.ds_tensor.ds_numel) if self.micro_step_id > 0: dest_buffer.copy_(acc_grad_cpu_partition.view(-1), non_blocking=True) @@ -1581,7 +2057,7 @@ def partition_previous_reduced_grads(self): if not self.previous_reduced_grads: return - if self.cpu_offload: + if self.offload_optimizer: allocate_grads_in_partition = self.grads_in_partition is None\ and self.gradient_accumulation_steps > 1 else: @@ -1593,12 +2069,12 @@ def partition_previous_reduced_grads(self): for i, group in enumerate(self.fp16_groups): total_size = 0 for param_in_partition in group: - total_size += param_in_partition.ds_tensor.numel() + total_size += param_in_partition.ds_tensor.ds_numel see_memory_usage( f"group {i} before creating {total_size} reduced gradients into partition", force=False) - if self.cpu_offload_use_pin_memory: + if self.offload_param_pin_memory: self.grads_in_partition.append( torch.zeros(int(total_size), dtype=torch.half, @@ -1612,51 +2088,73 @@ def partition_previous_reduced_grads(self): f"group {i} after creating {total_size} reduced gradients into partition", force=False) - for param in self.previous_reduced_grads: - - [i, dest_offset, num_elements] = self.grad_position[self.get_param_id(param)] - - # self.debug_fp16_grads[i][self.get_param_id(param)] = ( - # float(param.data.float().norm(2)), - # float(param.grad.data.float().norm(2))) - - if self.cpu_offload: + if self.offload_optimizer: + offload_fp32_gradients = {} + offload_fp32_offsets = {} - param.partition_gradients(partition_buffers=self.temp_grad_gpu_buffer) - with torch.cuda.stream(self.copy_grad_stream): - self.reduction_stream.synchronize() - - if self.gradient_accumulation_steps > 1: + with torch.cuda.stream(self.copy_grad_stream): + self.reduction_stream.synchronize() + for param in self.previous_reduced_grads: + + [i, + dest_offset, + num_elements] = self.grad_position[self.get_param_id(param)] + + if self.offload_optimizer: + param.partition_gradients( + partition_buffers=self.temp_grad_gpu_buffer) + #with torch.cuda.stream(self.copy_grad_stream): + # self.reduction_stream.synchronize() + + if 
self.gradient_accumulation_steps > 1: + # The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer + fp16_grad_tensor = self.grads_in_partition[i].narrow( + 0, + dest_offset, + num_elements) + self.async_accumulate_grad_in_cpu_via_gpu( + param, + fp16_grad_tensor) + + if self.is_gradient_accumulation_boundary: + + self.set_norm_for_param_grad_in_gpu(param) + + self.update_overflow_tracker_for_param_grad(param) + + if self._swappable_optimizer_subgroup(i): + if not i in offload_fp32_gradients.keys(): + offload_fp32_gradients[i] = [] + offload_fp32_offsets[i] = [] + + offload_fp32_gradients[i].append(param.grad.view(-1).float()) + param.grad = None + offload_fp32_offsets[i].append(dest_offset) + else: + fp32_grad_tensor = self.fp32_partitioned_groups_flat[ + i].grad.narrow(0, + dest_offset, + num_elements) + + self.async_inplace_copy_grad_to_fp32_buffer_from_gpu( + param, + fp32_grad_tensor) + else: # The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer fp16_grad_tensor = self.grads_in_partition[i].narrow( 0, dest_offset, num_elements) - self.async_accumulate_grad_in_cpu_via_gpu(param, fp16_grad_tensor) - - if self.is_gradient_accumulation_boundary: - - self.set_norm_for_param_grad_in_gpu(param) - - self.update_overflow_tracker_for_param_grad(param) - - fp32_grad_tensor = self.fp32_partitioned_groups_flat[i].grad.narrow( - 0, - dest_offset, - num_elements) + param.partition_gradients( + partition_buffers=fp16_grad_tensor, + accumulate=True if self.micro_step_id > 0 else False) - self.async_inplace_copy_grad_to_fp32_buffer_from_gpu( - param, - fp32_grad_tensor) - else: - # The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer - fp16_grad_tensor = self.grads_in_partition[i].narrow( - 0, - dest_offset, - num_elements) - param.partition_gradients( - partition_buffers=fp16_grad_tensor, - accumulate=True if self.micro_step_id > 0 else False) + if self.offload_optimizer and self.swap_optimizer: + for i in offload_fp32_gradients.keys(): + self.optimizer_swapper.swap_out_gradients( + parameter=self.fp32_partitioned_groups_flat[i], + gradient_offsets=offload_fp32_offsets[i], + gradient_tensors=offload_fp32_gradients[i]) self.previous_reduced_grads = [] @@ -2046,164 +2544,7 @@ def stop_timers(self, timer_names): for name in timer_names: self.timers(name).stop() - def old_step(self, closure=None): - """ - Not supporting closure. 
- """ - - self.micro_step_id = INITIAL_MICRO_STEP_ID - - # if self.cpu_offload: - # torch.cuda.current_stream().wait_stream(self.migration_stream) - - print_rank_0(f"Inside Step function") - see_memory_usage(f"In step before checking overflow", force=False) - - print_rank_0("Finished Tracing at Beginning of Step") - self.param_coordinator.hierarchy = 0 - self.param_coordinator.finish_tracing(print_trace=True) - - self.param_coordinator.reset_step() - - print_rank_0("Finished Tracing at Beginning of Step") - - # First compute norm for all group so we know if there is overflow - self.check_overflow() - - timers = self.timers - - OPTIMIZER_STEP = 'optimizer_step' - OPTIMIZER_FP16_UPDATE = 'optimizer_fp16_update' - OPTIMIZER_FP32_GRADIENT = 'optimizer_fp32_gradient' - timer_names = [OPTIMIZER_STEP, OPTIMIZER_FP16_UPDATE, OPTIMIZER_FP32_GRADIENT] - - prev_scale = self.loss_scale - self._update_scale(self.overflow) - if self.overflow: - see_memory_usage('After overflow before clearing gradients', force=False) - self.zero_grad() - - if self.cpu_offload: - self.reset_cpu_buffers() - else: - self.averaged_gradients = {} - - see_memory_usage('After overflow after clearing gradients', force=False) - - logger.info( - "[deepscale] OVERFLOW! Rank {} Skipping step. Attempted loss scale: {}, " - "reducing to {}".format(dist.get_rank(), - prev_scale, - self.loss_scale)) - self.start_timers(timer_names) - self.stop_timers(timer_names) - return - - norm_groups = [] - single_partition_grad_groups = [] - skip = False - partition_id = dist.get_rank(group=self.dp_process_group) - - debug_fp32_grads = [{} for _ in self.fp16_groups] - - self.start_timers([OPTIMIZER_FP32_GRADIENT]) - for i, group in enumerate(self.fp16_groups): - - if self.cpu_offload: - norm_groups.append( - self.complete_grad_norm_calculation_for_cpu_offload( - self.fp16_groups[i])) - - single_grad_partition = self.fp32_partitioned_groups_flat[i].grad - else: - norm_groups.append( - self.get_grad_norm_direct(self.averaged_gradients[i], - self.fp16_groups[i])) - - # free gradients for all the prameters that are not updated by this process - # self.free_grad_in_param_list(self.params_not_in_partition[i]) - - # create a flat gradients for parameters updated by this process - - # If we are last partition, ensure we have same size grads and partition size, if not pad with zero tensors - single_grad_partition = self.flatten(self.averaged_gradients[i]).to( - self.fp32_partitioned_groups_flat[i].dtype) - - assert single_grad_partition.numel() == self.fp32_partitioned_groups_flat[i].numel(), \ - "averaged gradients have different number of elements that partition size {} {} {} {}".format( - single_grad_partition.numel(), self.partition_size[i], i, partition_id) - - self.fp32_partitioned_groups_flat[i].grad = single_grad_partition - - # release all the gradient since we have already created a necessary copy in dp_grad_partition - self.zero_grad() - - self.averaged_gradients[i] = None - - single_partition_grad_groups.append(single_grad_partition) - debug_fp32_grads[i] = [(t.clone().detach(), - t) - for t in self.unflatten(single_grad_partition, - group)] - - self.stop_timers([OPTIMIZER_FP32_GRADIENT]) - - print(f"Norm groups: {norm_groups}") - - self.unscale_and_clip_grads(single_partition_grad_groups, norm_groups) - - #self.dump_pre_step_gradients(debug_fp32_grads) - - self.start_timers([OPTIMIZER_STEP]) - self.optimizer.step() - self.stop_timers([OPTIMIZER_STEP]) - - # get rid of the fp32 gradients. 
Not needed anymore - if not self.cpu_offload: - for group in self.fp32_partitioned_groups_flat: - group.grad = None - - self.start_timers([OPTIMIZER_FP16_UPDATE]) - for fp16_partitions, fp32_partition in zip(self.fp16_partitioned_groups_flat, self.fp32_partitioned_groups_flat): - fp16_partitions.data.copy_(fp32_partition.data) - self.stop_timers([OPTIMIZER_FP16_UPDATE]) - - print( - f"fp16 groups norm : {[group_flat.norm() for group_flat in self.fp16_partitioned_groups_flat]}" - ) - if self.cpu_offload: - self.reset_cpu_buffers() - - # TODO: we probably don't need this? just to be safe - for i in range(len(norm_groups)): - #for p in self.fp16_groups[i]: - # p.data=p.ds_tensor - - updated_params = self.unflatten(self.fp16_partitioned_groups_flat[i], - self.fp16_partitioned_groups[i]) - for partitioned_param, q in zip(self.fp16_partitioned_groups[i], updated_params): - # print(f"Grad fn: {p.grad_fn}") - # p.data = torch.ones(1).half().cuda() - partitioned_param.data = q.data - - #Gathering persisting parameters - self.persistent_parameters[0].all_gather(self.persistent_parameters) - - #self.dump_post_step_gradients() - self.debug_fp16_grads = [{} for _ in self.fp16_groups] - - if self.cpu_offload: - self.reset_cpu_buffers() - - self.log_timers(timer_names) - - see_memory_usage('After zero_optimizer step', force=False) - print_rank_0(f"------------------Finishing Step-----------------------", - force=False) - return - def _pre_step(self): - self.micro_step_id = INITIAL_MICRO_STEP_ID print_rank_0(f"Inside Step function") @@ -2220,7 +2561,7 @@ def _pre_step(self): def _get_norm_groups(self): norm_groups = [] for i, group in enumerate(self.fp16_groups): - if self.cpu_offload: + if self.offload_optimizer: norm_groups.append( self.complete_grad_norm_calculation_for_cpu_offload( self.fp16_groups[i])) @@ -2231,7 +2572,6 @@ def _get_norm_groups(self): return norm_groups def _prepare_fp32_grad_for_sub_group(self, sub_group_id): - partition_id = dist.get_rank(group=self.dp_process_group) single_grad_partition = self.flatten(self.averaged_gradients[sub_group_id]).to( @@ -2251,18 +2591,42 @@ def _prepare_fp32_grad_for_sub_group(self, sub_group_id): def _prepare_sub_group(self, sub_group_id, timer_names=set()): see_memory_usage(f'Before prepare optimizer sub group {sub_group_id}', force=False) - if not self.cpu_offload: + if self._swappable_optimizer_subgroup(sub_group_id): + self._optimizer_states_and_gradient_swap_in(sub_group_id, timer_names) + elif not self.offload_optimizer: self._prepare_fp32_grad_for_sub_group(sub_group_id) see_memory_usage(f'After prepare optimizer sub group {sub_group_id}', force=False) + def _optimizer_states_and_gradient_swap_in(self, sub_group_id, timer_names=set()): + param_length = self.fp16_partitioned_groups_flat_numel[sub_group_id] + fp32_param_id = id(self.fp32_partitioned_groups_flat[sub_group_id]) + assert self._swappable_optimizer_subgroup(sub_group_id), \ + f'Parameter {fp32_param_id} of numel={param_length} is not swappable' + + OPTIMIZER_SWAP_IN_STATE = 'optimizer_swap_in_state' + see_memory_usage(f'pre-step Before swapping in optimizer tensors {sub_group_id}', + force=False) + self.start_timers([OPTIMIZER_SWAP_IN_STATE]) + + self.optimizer_swapper.swap_in_optimizer_state( + parameter=self.fp32_partitioned_groups_flat[sub_group_id], + async_parameter=self.next_swappable_fp32_partitioned_groups[sub_group_id]) + + self.stop_timers([OPTIMIZER_SWAP_IN_STATE]) + timer_names.add(OPTIMIZER_SWAP_IN_STATE) + see_memory_usage(f'pre-step After swapping in optimizer tensors 
{sub_group_id}', + force=False) + def _release_sub_group(self, sub_group_id, timer_names=set()): see_memory_usage(f'Before release optimizer sub group {sub_group_id}', force=False) # get rid of the fp32 gradients. Not needed anymore - if not self.cpu_offload: + if not self.offload_optimizer: self.fp32_partitioned_groups_flat[sub_group_id].grad = None + if self._swappable_optimizer_subgroup(sub_group_id): + self._optimizer_states_and_gradient_swap_out(sub_group_id, timer_names) see_memory_usage(f'After release optimizer sub group {sub_group_id}', force=False) @@ -2287,6 +2651,32 @@ def flatten_dense_tensors_aligned(self, tensor_list, alignment): return self.flatten(padded_tensor_list) + def _optimizer_states_and_gradient_swap_out(self, sub_group_id, timer_names=set()): + param_length = self.fp16_partitioned_groups_flat_numel[sub_group_id] + fp32_param_id = id(self.fp32_partitioned_groups_flat[sub_group_id]) + assert self._swappable_optimizer_subgroup(sub_group_id), \ + f'Parameter {fp32_param_id} of numel={param_length} is not swappable' + + OPTIMIZER_SWAP_OUT_STATE = 'optimizer_swap_out_state' + see_memory_usage( + f'post-step Before swapping out optimizer tensors {sub_group_id}', + force=False) + self.start_timers([OPTIMIZER_SWAP_OUT_STATE]) + + self.optimizer_swapper.swap_out_optimizer_state( + parameter=self.fp32_partitioned_groups_flat[sub_group_id], + async_swap=self.next_swappable_fp32_partitioned_groups[sub_group_id] is + not None) + + self.stop_timers([OPTIMIZER_SWAP_OUT_STATE]) + see_memory_usage( + f'post-step After swapping out optimizer tensors {sub_group_id}', + force=False) + timer_names.add(OPTIMIZER_SWAP_OUT_STATE) + + # get rid of the fp32 gradients. Not needed anymore + self.fp32_partitioned_groups_flat[sub_group_id].grad = None + def _unflatten_partitioned_parameters(self, sub_group_id): updated_params = self.unflatten(self.fp16_partitioned_groups_flat[sub_group_id], self.fp16_partitioned_groups[sub_group_id]) @@ -2298,7 +2688,7 @@ def _overflow_clean_up(self, prev_scale): see_memory_usage('After overflow before clearing gradients', force=False) self.zero_grad() - if self.cpu_offload: + if self.offload_optimizer: self.reset_cpu_buffers() else: self.averaged_gradients = {} @@ -2327,17 +2717,28 @@ def _overflow_check_and_loss_scale_update(self): return self.overflow def _post_step(self, timer_names=set()): - if self.cpu_offload: + if self.offload_optimizer: self.reset_cpu_buffers() #Gathering persisting parameters - self.persistent_parameters[0].all_gather(self.persistent_parameters) + if len(self.persistent_parameters) > 0: + self.persistent_parameters[0].all_gather(self.persistent_parameters) + + if self.swap_optimizer: + self.optimizer_swapper.log_timers() self.log_timers(timer_names) see_memory_usage('After zero_optimizer step', force=False) print_rank_0(f"------------------Finishing Step-----------------------") + def _reassign_or_swap_out_partitioned_parameters(self, sub_group_id): + if self.fp16_partitioned_groups_flat[sub_group_id] is not None: + #unflatten fp16 parameter subgroup + self._unflatten_partitioned_parameters(sub_group_id) + else: + self._partitioned_params_swap_out(sub_group_id) + def step(self, closure=None): """ Not supporting closure. 
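A condensed, illustrative sketch of the per-sub-group sequence that the reworked `step()` drives. The method names below are the ones introduced in this patch, but the loop paraphrases the control flow rather than copying it: the overflow check, gradient norm and unscale/clip handling, and timer bookkeeping that surround this loop in the actual `step()` are omitted.

```python
# Sketch only: mirrors the ordering of the sub-group helpers added in this patch.
def step_sketch(zero_stage3_optimizer, timer_names=set()):
    for sub_group_id in range(len(zero_stage3_optimizer.fp16_groups)):
        # 1. Bring this sub-group's fp32 optimizer state (and gradients) into
        #    memory, swapping in from NVMe if the sub-group is swappable.
        zero_stage3_optimizer._prepare_sub_group(sub_group_id, timer_names)

        # 2. Run the wrapped optimizer on just this sub-group's fp32 partition.
        zero_stage3_optimizer._optimizer_step(sub_group_id)

        # 3. Release or swap out the fp32 state and gradients again.
        zero_stage3_optimizer._release_sub_group(sub_group_id, timer_names)

        # 4. Either unflatten the updated fp16 partition in place, or swap the
        #    fp16 parameters back out to NVMe via the pinned reuse buffer.
        zero_stage3_optimizer._reassign_or_swap_out_partitioned_parameters(sub_group_id)
```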
@@ -2346,6 +2747,8 @@ def step(self, closure=None): #checks for overflow, adjust the loss scale accordingly if self._overflow_check_and_loss_scale_update(): + if self.swap_optimizer: + self.optimizer_swapper.log_timers() return norm_groups = self._get_norm_groups() @@ -2370,8 +2773,8 @@ def step(self, closure=None): #release memory or swap out optimizer states of fp32 parameters self._release_sub_group(sub_group_id, timer_names) - #unflatten fp16 parameter subgroup - self._unflatten_partitioned_parameters(sub_group_id) + #put fp16 parameters in appropriate location + self._reassign_or_swap_out_partitioned_parameters(sub_group_id) self.stop_timers(['optimizer_step']) @@ -2460,7 +2863,7 @@ def has_overflow(self, partition_gradients=True): self.local_overflow = self._has_inf_or_nan(self.gpu_sum) self.gpu_sum = torch.zeros(1, dtype=torch.float).cuda() - overflow = self.local_overflow if self.cpu_offload else self.has_overflow_partitioned_grads_serial( + overflow = self.local_overflow if self.offload_optimizer else self.has_overflow_partitioned_grads_serial( ) #overflow = self.has_overflow_partitioned_grads_serial() overflow_gpu = torch.cuda.ByteTensor([overflow]) @@ -2519,6 +2922,10 @@ def backward(self, loss, retain_graph=False): print_rank_0( f"Total fully available parameters {self.param_coordinator.total_available_parameter_numel}" ) + + if self.swap_optimizer: + self.optimizer_swapper.pre_backward() + see_memory_usage(f"Before backward", force=False) if self.contiguous_gradients: self.ipg_buffer = [] @@ -2541,6 +2948,9 @@ def backward(self, loss, retain_graph=False): grad computation do not trigger post call and will therefore will remain unpartitioned ''' self._partition_all_parameters() + if self.swap_optimizer: + self.optimizer_swapper.post_backward() + def _partition_all_parameters(self): for name, param in self.module.named_parameters(recurse=True): self.param_coordinator.release_and_reset_parameter(param) @@ -2664,6 +3074,11 @@ def state_dict(self): "ZeRO-3 does not yet support elastic checkpointing, please disable for now." ) + if self.swap_optimizer or self.params_in_nvme_and_cpu: + raise NotImplementedError( + "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now." + ) + return self._rigid_state_dict() @@ -2806,19 +3221,26 @@ def load_state_dict(self, raise NotImplementedError( "ZeRO-3 does not yet support elastic checkpointing, please disable for now." ) - else: - self._rigid_load_state_dict( - state_dict_list[dist.get_rank(group=self.dp_process_group)], - load_optimizer_states=load_optimizer_states) - self.persistent_parameters[0].partition(self.persistent_parameters) - self.persistent_parameters[0].all_gather(self.persistent_parameters) + if self.swap_optimizer or self.params_in_nvme_and_cpu: + raise NotImplementedError( + "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now." 
+ ) + + self._rigid_load_state_dict( + state_dict_list[dist.get_rank(group=self.dp_process_group)], + load_optimizer_states=load_optimizer_states) + + if len(self.persistent_parameters) > 0: + self.persistent_parameters[0].partition(self.persistent_parameters) + self.persistent_parameters[0].all_gather(self.persistent_parameters) def save_checkpoint_prologue(self): self._partition_all_parameters() def save_checkpoint_epilogue(self): - self.persistent_parameters[0].all_gather(self.persistent_parameters) + if len(self.persistent_parameters) > 0: + self.persistent_parameters[0].all_gather(self.persistent_parameters) def _handle_overflow(cpu_sum, x, i): diff --git a/deepspeed/runtime/zero/tiling.py b/deepspeed/runtime/zero/tiling.py new file mode 100644 index 000000000000..c6f912500741 --- /dev/null +++ b/deepspeed/runtime/zero/tiling.py @@ -0,0 +1,293 @@ +import torch +import deepspeed +from deepspeed.runtime.utils import partition_uniform as partition + + +def split_tensor_along_last_dim(tensor, partitions, contiguous_split_chunks=False): + """Split a tensor along its last dimension. Adapted from Megatron-LM. + + Arguments: + tensor: input tensor. + partitions: list of partition sizes to supply to torch.split + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + # Split. + tensor_list = torch.split(tensor, partitions, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +class TiledLinear(torch.nn.Module): + def __init__(self, + in_features, + out_features, + bias=True, + in_splits=1, + out_splits=1, + input_is_already_split=False, + combine_out_splits=True, + linear_cls=torch.nn.Linear, + init_linear=None, + **kwargs): + """A replacement for ``torch.nn.Linear`` that works with ZeRO-3 to reduce + memory requirements via tiling. + + TiledLinear breaks the input and output dimensions of a linear layer + into tiles that are processed in sequence. This class enables huge + linear layers when combined with ZeRO-3 because inactive tiles can be + partitioned and offloaded. + + .. note:: + We recommend using as few tiles as necessary. Tiling + significantly reduces memory usage, but can reduce throughput + for inexpensive layers. This due to the smaller kernels having + less parallelism and lower arithmetic intensity, while + introducing more frequent synchronization and communication. + + Args: + in_features (int): See ``torch.nn.Linear`` + out_features (int): See ``torch.nn.Linear`` + bias (bool, optional): See ``torch.nn.Linear`` + in_splits (int, optional): The number of tiles along the input dimension. Defaults to 1. + out_splits (int, optional): The number of tiles along the output dimension. Defaults to 1. + input_is_already_split (bool, optional): If set to ``True``, assume that the ``input_`` in + to ``forward()`` is already split into ``in_splits`` chunks. Defaults to ``False``. + combine_out_splits (bool, optional): If set to ``False``, do not combine the ``out_splits`` outputs + into a single tensor. Defaults to ``True``. + linear_cls (class, optional): The underlying class to build individual tiles. + Defaults to ``torch.nn.Linear``. + init_linear (``torch.nn.Linear``, optional): If set, copy the parameters of + ``init_linear``. Useful for debugging. Defaults to ``None``. 
+ kwargs (dict, optional): additional keyword arguments to provide to ``linear_cls()``. + + Raises: + RuntimeError: ``in_splits`` must be within the range [1, in_features). + RuntimeError: ``out_splits`` must be within the range of [1, out_features). + """ + + super().__init__() + + if (in_splits < 1) or (in_splits > in_features): + raise RuntimeError('in splits must be in range [1, in_features].') + if (out_splits < 1) or (out_splits > out_features): + raise RuntimeError('out splits must be in range [1, out_features].') + + # global, not necessarily local + self.in_features = in_features + self.out_features = out_features + self.use_bias = bias + + self.out_splits = out_splits + self.in_splits = in_splits + self.input_is_already_split = input_is_already_split + self.combine_out_splits = combine_out_splits + + # Build partition-lists. These are CSR-style splits [0, part0, part1, ..., features] + # For example, row_parts[p] gives the start of partition p and row_parts[p+1] + # is the exclusive end. + self.in_parts = partition(num_items=in_features, num_parts=in_splits) + self.out_parts = partition(num_items=out_features, num_parts=out_splits) + + assert len(self.out_parts) == out_splits + 1 + assert len(self.in_parts) == in_splits + 1 + assert self.out_parts[0] == 0 + assert self.out_parts[out_splits] == out_features + assert self.in_parts[in_splits] == in_features + + self.linears = torch.nn.ModuleList() + for out_id in range(out_splits): + self.linears.append(torch.nn.ModuleList()) + + local_out_dim = self.out_parts[out_id + 1] - self.out_parts[out_id] + + for in_id in range(in_splits): + #if input_size is split, we only need one bias + local_bias = bias if in_id == (in_splits - 1) else False + + local_in_dim = self.in_parts[in_id + 1] - self.in_parts[in_id] + local = linear_cls(local_in_dim, + local_out_dim, + bias=local_bias, + **kwargs) + self.linears[out_id].append(local) + + # Optionally initialize with a known tensor + if init_linear is not None: + self.copy_params_from(init_linear) + + def forward(self, input_): + if self.in_splits > 1 and not self.input_is_already_split: + split_sizes = [ + self.in_parts[p + 1] - self.in_parts[p] for p in range(self.in_splits) + ] + inputs = self._split_global_input(input_, split_sizes) + elif self.in_splits > 1: + inputs = input_ + assert len(inputs) == self.in_splits, f"Col splits {self.in_splits} does not match input splits {len(inputs)}" + else: + # no splits + inputs = [input_] + + outputs = [None] * self.out_splits + for out_id in range(self.out_splits): + for in_id in range(self.in_splits): + local_output = self.linears[out_id][in_id](inputs[in_id]) + + outputs[out_id] = self._reduce_local_output(in_id=in_id, + out_id=out_id, + current_out=outputs[out_id], + new_out=local_output) + + if self.combine_out_splits: + return self._combine_output_splits(outputs) + + return outputs + + def _split_global_input(self, input, split_sizes): + """Partition an input tensor along the last dimension, aligned with given splits. + + Subclasses should override this method to account for new input types. + + Args: + input (List[Tensor]): The tensor to partition along the last dimension. + split_sizes (List[int]): The size of each partition. + + Returns: + List[Any]: A list of the chunks of ``input``. + """ + return split_tensor_along_last_dim(input, split_sizes) + + def _reduce_local_output(self, in_id, out_id, current_out, new_out): + """Reduce (sum) a new local result into the existing local results. + + Subclasses should override this method. 
+
+        For a given ``out_id``, this method is called once per input split, i.e.
+        ``in_splits`` times. The first input split is a simple assignment.
+
+        Args:
+            in_id (int): The input split that produced ``new_out``.
+            out_id (int): The output split that produced ``new_out``.
+            current_out (Any): The reduced form of all previous ``out_id`` results.
+            new_out (Any): The local result from forward (``in_id``, ``out_id``).
+
+        Returns:
+            Any: The combined result of ``current_out`` and ``new_out``.
+        """
+
+        if current_out is None:
+            # this clone is necessary to preserve autograd;
+            # there is an issue with in-place updates for outputs that are views
+            return new_out.clone()
+        else:
+            return current_out + new_out
+
+    def _combine_output_splits(self, outputs):
+        """Join the splits of the output into a single result.
+
+        Args:
+            outputs (List[Any]): The reduced outputs for each output split.
+
+        Returns:
+            Any: The combined outputs.
+        """
+        assert len(outputs) == self.out_splits
+        return torch.cat(outputs, dim=-1)
+
+    @torch.no_grad()
+    def copy_params_from(self, other):
+        """Copy the weight and bias data from ``other``.
+
+        This is especially useful for reproducible initialization and testing.
+
+        Equivalent to:
+
+        .. code-block:: python
+
+            with torch.no_grad():
+                self.weight.copy_(other.weight)
+                if self.bias is not None:
+                    self.bias.copy_(other.bias)
+
+        .. note::
+            If ZeRO-3 is enabled, this is a collective operation and the updated parameters of
+            data-parallel rank 0 will be visible on all ranks. See
+            :class:`deepspeed.zero.GatheredParameters` for more information.
+
+
+        Args:
+            other (``torch.nn.Linear``): the linear layer to copy from.
+        """
+        assert hasattr(other, 'weight')
+        assert other.weight.size() == (self.out_features, self.in_features)
+        if self.use_bias:
+            assert hasattr(other, 'bias')
+            assert other.bias is not None
+            assert other.bias.size() == (self.out_features, )
+        else:
+            assert other.bias is None
+
+        for row in range(self.out_splits):
+            rstart = self.out_parts[row]
+            rstop = self.out_parts[row + 1]
+
+            for col in range(self.in_splits):
+                cstart = self.in_parts[col]
+                cstop = self.in_parts[col + 1]
+
+                local = self.linears[row][col]
+                global_weight = other.weight[rstart:rstop, cstart:cstop]
+                with deepspeed.zero.GatheredParameters(local.weight, modifier_rank=0):
+                    local.weight.copy_(global_weight)
+
+                if local.bias is not None:
+                    with deepspeed.zero.GatheredParameters(local.bias, modifier_rank=0):
+                        local.bias.data.copy_(other.bias[rstart:rstop].data)
+
+
+class TiledLinearReturnBias(TiledLinear):
+    """Wrapper for a Linear class that returns its own bias parameter, such as
+    used by Megatron-LM.
+    """
+    def _reduce_local_output(self, in_id, out_id, current_out, new_out):
+        """Reduces output tensors, but not the returned bias.
""" + if current_out is not None: + old_tensor, old_bias = current_out + else: + old_tensor, old_bias = None, None + + assert isinstance(new_out, tuple) + assert len(new_out) == 2 + + tensor, bias = new_out + assert tensor is not None + + tensor = super()._reduce_local_output(in_id=in_id, + out_id=out_id, + current_out=old_tensor, + new_out=tensor) + + if bias is None: + bias = old_bias + + return tensor, bias + + def _combine_output_splits(self, outputs): + # stack output tensors + tensors = [o[0] for o in outputs] + tensor = super()._combine_output_splits(tensors) + + # stack biases if applicable + biases = [o[1] for o in outputs if o[1] is not None] + if len(biases) > 0: + bias = super()._combine_output_splits(biases) + else: + bias = None + + return tensor, bias diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml index 318cb2213404..8b41df6a79f6 100755 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -43,6 +43,10 @@ lnav: url: /docs/config-json/#gradient-clipping - title: "ZeRO optimizations" url: /docs/config-json/#zero-optimizations-for-fp16-training + - title: "Parameter Offloading" + url: /docs/config-json/#parameter-offloading + - title: "Optimizer Offloading" + url: /docs/config-json/#optimizer-offloading - title: "Logging" url: /docs/config-json/#logging - title: "Flops Profiler" diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index 4ec491e1de3f..b7df0c47a464 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -250,9 +250,12 @@ Enabling and configuring ZeRO memory optimizations "reduce_scatter": [true|false], "reduce_bucket_size": 5e8, "contiguous_gradients" : [true|false], - "cpu_offload": [true|false], - "cpu_offload_params" : [true|false], - "cpu_offload_use_pin_memory" : [true|false], + "offload_param": { + ... + }, + "offload_optimizer": { + ... + }, "stage3_max_live_parameters" : 1e9, "stage3_max_reuse_distance" : 1e9, "stage3_prefetch_bucket_size" : 5e8, @@ -281,7 +284,7 @@ Enabling and configuring ZeRO memory optimizations | ------------------------------------------------------------------------------------------------------------------------------------------------ | ------- | | Chooses between allgather collective or a series of broadcast collectives to gather updated parameters from all the GPUs at the end of each step | `true` | -***allgather_bucket_size***: [boolean] +***allgather_bucket_size***: [integer] | Description | Default | | ------------------------------------------------------------------------------------------------------------ | ------- | @@ -299,7 +302,7 @@ Enabling and configuring ZeRO memory optimizations | ----------------------------------------------------------------------- | ------- | | Uses reduce or reduce scatter instead of allreduce to average gradients | `true` | -***reduce_bucket_size***: [boolean] +***reduce_bucket_size***: [integer] | Description | Default | | ------------------------------------------------------------------------------------------------------------------- | ------- | @@ -311,23 +314,18 @@ Enabling and configuring ZeRO memory optimizations | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward pass. Only useful when running very large models. 
| `False` | -***cpu_offload***: [boolean] - -| Description | Default | -| ------------------------------------------------------------------------------------------------------------------------ | ------- | -| Enable offloading of optimizer memory and computation to CPU. This frees up GPU memory for larger models or batch sizes. | `False` | -***cpu_offload_params***: [boolean] +***offload_param***: [dictionary] | Description | Default | | --------------------------------------------------------------------------------------------------------------------------------- | ------- | -| Enable offloading of model parameters to CPU. This frees up GPU memory for larger models or batch sizes. Valid only with stage 3. | `False` | +| Enable offloading of model parameters to CPU or NVMe. This frees up GPU memory for larger models or batch sizes. Valid only with stage 3. See [here](#parameter-offloading) for more details. | `False` | -***cpu_offload_use_pin_memory***: [boolean] +***offload_optimizer***: [dictionary] -| Description | Default | -| ---------------------------------------------------------------------------------------- | ------- | -| Use pinned CPU memory when offloading. Can improve performance. Valid only with stage 3. | `False` | +| Description | Default | +| ----------------------------------------------------------------------------------------- | ------- | +| Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. This frees up GPU memory for larger models or batch sizes. Valid only with stage 3. See [here](#optimizer-offloading) for more details. | `False` | ***stage3_max_live_parameters***: [integer] @@ -349,16 +347,112 @@ Enabling and configuring ZeRO memory optimizations ***stage3_param_persistence_threshold***: [integer] + | Description | Default | | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially latency-bound messages). | `1e6` | ***stage3_gather_fp16_weights_on_model_save***: [boolean] + | Description | Default | | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | Consolidate the weights before saving the model by `save_fp16_model()`. Since the weights are partitioned across GPUs, they aren't part of `state_dict`, so this function automatically gather the weights when this option is enabled and then saves the fp16 model weights. | `False` | +***cpu_offload***: [boolean] + +**Deprecated:** **cpu_offload** is disabled and will be removed in future, please use `offload_optimizer` instead. +{: .notice--warning} + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------ | ------- | +| Enable offloading of optimizer memory and computation to CPU. This frees up GPU memory for larger models or batch sizes. Valid only with stage 2.| `False` | + + +### Parameter offloading +Enabling and configuring ZeRO optimization of parameter offloading to CPU/NVMe. Available only with ZeRO stage 3. 
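+For example, a minimal (illustrative) stage 3 configuration that offloads parameters to CPU is shown below; to offload to NVMe instead, set `device` to `nvme` and provide an `nvme_path`.
+```json
+{
+  "zero_optimization": {
+    "stage": 3,
+    "offload_param": {
+      "device": "cpu"
+    }
+  }
+}
+```
+The full set of `offload_param` fields is described below.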
+```json
+  "offload_param": {
+    "device": "[none|cpu|nvme]",
+    "nvme_path": "/local_nvme",
+    "buffer_count": 5,
+    "buffer_size": 1e8,
+    "max_in_cpu": 1e9
+  }
+```
+***device***: [string]
+
+| Description | Default |
+| ----------------------------------------------------------------------------------------------------------------------------- | ------- |
+| Device memory to offload model parameters. Supported options are `cpu` and `nvme`. | `cpu` |
+
+***nvme_path***: [string]
+
+| Description | Default |
+| ----------------------------------------------------------------------------------------------------------------------------- | ------- |
+| Filesystem path for NVMe device for parameter offloading. | `/local_nvme` |
+
+***buffer_count***: [integer]
+
+| Description | Default |
+| ----------------------------------------------------------------------------------------------------------------------------- | ------- |
+| Number of buffers in buffer pool for parameter offloading to NVMe. | 5 |
+
+
+***buffer_size***: [integer]
+
+| Description | Default |
+| ----------------------------------------------------------------------------------------------------------------------------- | ------- |
+| Size of buffers in buffer pool for parameter offloading to NVMe. | 1e8 |
+
+***max_in_cpu***: [integer]
+
+| Description | Default |
+| ----------------------------------------------------------------------------------------------------------------------------- | ------- |
+| Number of parameter elements to maintain in CPU memory when offloading to NVMe is enabled. | 1e9 |
+
+### Optimizer offloading
+Enabling and configuring ZeRO optimization of offloading optimizer computation to CPU and optimizer state to CPU/NVMe. CPU offloading is available with ZeRO stage 2 or 3. NVMe offloading is available only with ZeRO stage 3.
+```json
+  "offload_optimizer": {
+    "device": "[none|cpu|nvme]",
+    "nvme_path": "/local_nvme",
+    "buffer_count": 4,
+    "pin_memory": [true|false],
+    "fast_init": false
+  }
+```
+***device***: [string]
+
+| Description | Default |
+| ----------------------------------------------------------------------------------------------------------------------------- | ------- |
+| Device memory to offload optimizer state. Supported options are `cpu` and `nvme`. Optimizer computation is offloaded to CPU regardless of the device option. | `cpu` |
+
+***nvme_path***: [string]
+
+| Description | Default |
+| ----------------------------------------------------------------------------------------------------------------------------- | ------- |
+| Filesystem path for NVMe device for optimizer state offloading. | `/local_nvme` |
+
+***buffer_count***: [integer]
+
+| Description | Default |
+| ----------------------------------------------------------------------------------------------------------------------------- | ------- |
+| Number of buffers in buffer pool for optimizer state offloading to NVMe. This should be at least the number of states maintained per parameter by the optimizer. For example, the Adam optimizer maintains 4 states per parameter (parameter, gradient, momentum, and variance). | 4 |
+
+
+***pin_memory***: [boolean]
+
+| Description | Default |
+| ----------------------------------------------------------------------------------------------------------------------------- | ------- |
+| Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead.
| `false` | + +***fast_init***: [boolean] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Enable fast optimizer initialization when offloading to NVMe. | `false` | + ### Logging ***steps\_per\_print***: [integer] diff --git a/docs/_tutorials/pipeline.md b/docs/_tutorials/pipeline.md index 1751846830ef..529da7880f94 100644 --- a/docs/_tutorials/pipeline.md +++ b/docs/_tutorials/pipeline.md @@ -276,9 +276,15 @@ For example, a machine with 16 GPUs must have as much local CPU memory as 16 tim DeepSpeed provides a `LayerSpec` class that delays the construction of modules until the model layers have been partitioned across workers. +<<<<<<< HEAD Then each worker will allocate only the layers it's assigned to. So, comparing to the example from the previous paragraph, using `LayerSpec` a machine with 16 GPUs will need to allocate a total of 1x model size on its CPU memory and not 16x. +======= +Then each worker will allocate only the layers it's assigned to. So, continuing the +example from the previous paragraph, a machine with 16 GPUs will need to allocate a +total of 1x model size on its CPU, compared to 16x in the LayerSpec example. +>>>>>>> [squash] Staging zero infinity v1 (#168) Here is an example of the abbreviated AlexNet model, but expressed only with `LayerSpec`s. Note that the syntax is almost unchanged: `nn.ReLU(inplace=True)` diff --git a/docs/_tutorials/zero.md b/docs/_tutorials/zero.md index 8f506d25babe..82c3414ff44f 100644 --- a/docs/_tutorials/zero.md +++ b/docs/_tutorials/zero.md @@ -106,121 +106,36 @@ Here is a screenshot of nvidia-smi showing GPU activity during training: -### Training trillion-scale models with ZeRO-3 Offload +### Training trillion-scale models with ZeRO-Infinity Stage 3 can be enabled in the JSON configuration. A full description of these configurations is available [here](/docs/config-json/#zero-optimizations-for-fp16-training). ```json -{ "zero_optimization": { "stage": 3, "cpu_offload": true, "cpu_offload_params": true, - "overlap_comm": true, "contiguous_gradients": true, - "stage3_max_live_parameters": 6000000, - "stage3_max_reuse_distance": 100000000, - "stage3_prefetch_bucket_size": 200000, - "stage3_param_persistence_threshold": 100000, - "reduce_bucket_size": 3000000, - "sub_group_size": 1e6 + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_prefetch_bucket_size": 1e7, + "stage3_param_persistence_threshold": 1e5, + "reduce_bucket_size": 1e7, + "sub_group_size": 1e9 } } ``` -ZeRO-3 will automatically collect and partition the parameters as they are -needed during the forward and backward passes. However, in some cases a -parameter may be used outside of its module's forward pass. We call these -*external parameters*. ZeRO-3 can coordinate these parameters if they are -registered. Please see our [ZeRO-3 docs](https://deepspeed.readthedocs.io/en/latest/zero3.html) for more -information and examples of external parameters. - -The Megatron-LM model has three external parameters that must be registered -with ZeRO-3. External parameters are those that are accessed outside of the -owning module's forward pass. - -1. `megatron/model/gpt2_model.py:GPT2Model`: register the word embedding for both uses in forward. - -```python - class GPT2Model(MegatronModule): - def __init__(self, num_tokentypes=0, parallel_output=True): - ... 
- deepspeed.zero.register_external_parameter(self, - self.language_model.embedding.word_embeddings.weight) - - - def forward(self, input_ids, position_ids, attention_mask, labels=None, - tokentype_ids=None, layer_past=None, get_key_value=False, - forward_method_parallel_output=None): - # self.embeddings will compute its forward pass here - lm_output = self.language_model(input_ids, - position_ids, - attention_mask, - tokentype_ids=tokentype_ids, - layer_past=layer_past, - get_key_value=get_key_value) - ... - - # Accesses word_embeddings.weight outside of the embedding's forward pass. - output = parallel_lm_logits( - lm_output, - self.language_model.embedding.word_embeddings.weight, - parallel_output) -``` - -2. `megatron/model/transformer.py:ParallelMLP`: register a bias that is -returned from a submodule forward and used in this forward. - -```python -class ParallelMLP(MegatronModule): - def __init__(self, init_method, output_layer_init_method): - ... - if self.dense_h_to_4h.bias is not None: - deepspeed.zero.register_external_parameter(self, self.dense_h_to_4h.bias) - def forward(self, hidden_states): - # bias_parallel is a parameter of dense_h_to_4h +#### Registering external parameters with ZeRO-3 - # [s, b, 4hp] - intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states) - ... -``` - -3. `megatron/model/transformer.py:ParallelTransformerLayer`: register two biases that -are returned from submodules and used in forward. - -```python -class ParallelTransformerLayer(MegatronModule): - ... - def __init__(self, attention_mask_func, init_method, - output_layer_init_method, layer_number): - ... - if self.attention.dense.bias is not None: - deepspeed.zero.register_external_parameter(self, self.attention.dense.bias) - if self.mlp.dense_4h_to_h.bias is not None: - deepspeed.zero.register_external_parameter(self, self.mlp.dense_4h_to_h.bias) - - def forward(self, hidden_states, attention_mask, layer_past=None, - get_key_value=False): - ... - # attention_bias is a parameter returned from attention - - # Self attention. - attention_output, attention_bias = \ - self.attention(layernorm_output, - attention_mask, - layer_past=layer_past, - get_key_value=get_key_value) - - ... - - # mlp_bias is a parameter returned from mlp - mlp_output, mlp_bias = self.mlp(layernorm_output) - ... -``` +**Deprecated:** +DeepSpeed version `0.3.15` introduced automatic external parameter +registration and this step is no longer needed. +{: .notice--info} @@ -231,7 +146,7 @@ that exceed *local* system memory, but not *total* system memory. 1. Allocate the model in a memory-scalable fashion. The model parameters will be allocated and immediately partitioned across the data parallel group. If -`remote_device="cpu"`, the model will also be allocated in CPU memory +`remote_device` is `"cpu"` or `"nvme"`, the model will also be allocated in CPU/NVMe memory instead of GPU memory. Please see the full [ZeRO-3 Init docs](https://deepspeed.readthedocs.io/en/latest/zero3.html#deepspeed.zero.Init) for more details. diff --git a/docs/code-docs/source/optimizers.rst b/docs/code-docs/source/optimizers.rst index d7b338561b96..53024d161b3e 100755 --- a/docs/code-docs/source/optimizers.rst +++ b/docs/code-docs/source/optimizers.rst @@ -17,4 +17,8 @@ FusedLamb (GPU) OneBitAdam (GPU) ---------------------------- +<<<<<<< HEAD .. autoclass:: deepspeed.runtime.fp16.onebit.adam.OneBitAdam +======= +.. 
autoclass:: deepspeed.runtime.fp16.OneBitAdam +>>>>>>> [squash] Staging zero infinity v1 (#168) diff --git a/docs/code-docs/source/zero3.rst b/docs/code-docs/source/zero3.rst index c986990444f3..0192a69b5bb3 100644 --- a/docs/code-docs/source/zero3.rst +++ b/docs/code-docs/source/zero3.rst @@ -16,12 +16,13 @@ For more information on our algorithms, please see our papers on `ZeRO `_ and `ZeRO-Offload `_. + Getting Started --------------- If you are new to DeepSpeed, check out our `Getting Started `_ page. -Once you are training with DeepSpeed, enabling ZeRO-3 Offload is as simple as enabling it +Once you are training with DeepSpeed, enabling ZeRO-3 offload is as simple as enabling it in your DeepSpeed configuration! Below are a few examples of ZeRO-3 configurations. Please see our `config guide `_ for a complete list of options for configuration and performance tuning. @@ -46,6 +47,7 @@ Example ZeRO-3 Offload Configurations "zero_optimization": { "stage": 3, "overlap_comm": true + }, "fp16": { "enabled": true @@ -69,13 +71,14 @@ Example ZeRO-3 Offload Configurations #. Additionally offload the optimizer states and computations to the CPU. .. code-block:: python - :emphasize-lines: 4 { "zero_optimization": { "stage": 3, - "cpu_offload": true, "overlap_comm": true + "offload_optimizer": { + "device": "cpu" + } }, ... } @@ -84,14 +87,38 @@ Example ZeRO-3 Offload Configurations #. Save even more memory by offloading parameters to the CPU memory. .. code-block:: python - :emphasize-lines: 5 { "zero_optimization": { "stage": 3, - "cpu_offload": true, - "cpu_offload_params": true, "overlap_comm": true + "offload_optimizer": { + "device": "cpu" + } + "offload_param": { + "device": "cpu" + } + }, + ... + } + + +#. Save even MORE memory by offloading to NVMe (if available): + + .. code-block:: python + + { + "zero_optimization": { + "stage": 3, + "overlap_comm": true + "offload_optimizer": { + "device": "nvme", + "nvme_path": "/nvme_data" + } + "offload_param": { + "device": "nvme", + "nvme_path": "/nvme_data" + } }, ... } @@ -126,8 +153,6 @@ you can simply allocate your model in our context: model = MyLargeModel() - -.. autoclass:: deepspeed.zero.Init :members: @@ -179,6 +204,35 @@ because it is used in the training loop outside of its owning module's forward pass. DeepSpeed will coordinate external parameters if they are registered prior to the first forward pass. +Consider the following pattern common in language models such as GPT: + +.. code-block:: python + + class LanguageModel(torch.nn.Module): + ... + def forward(self, inputs): + embeds = self.embeddings(inputs) + ... + logits = compute_logits(output, self.embeddings.weight) + ... + + +The tensor ``embeddings.weight`` is used in both ``embeddings.forward()`` and +``compute_logits()``. We call ``embeddings.weight`` an *external* parameter +because it is used in the training loop outside of its owning module's +forward pass. DeepSpeed will coordinate external parameters if they are +registered prior to the first forward pass. + +.. note:: + Most models should not need to manually register parameters. + .. autofunction:: deepspeed.zero.register_external_parameter .. autofunction:: deepspeed.zero.unregister_external_parameter + + +Memory-Centric Tiling +--------------------- + +.. autoclass:: deepspeed.zero.TiledLinear + :members: diff --git a/docs/index.md b/docs/index.md index 497f88bab5c3..2642285a3f45 100755 --- a/docs/index.md +++ b/docs/index.md @@ -236,6 +236,7 @@ comments. 3. Minjia Zhang, Yuxiong He. 
(2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html). 4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840). 5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. (2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888). +6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857). # Videos 1. DeepSpeed KDD 2020 Tutorial diff --git a/op_builder/__init__.py b/op_builder/__init__.py index 38f27a9897ce..72e09b293811 100644 --- a/op_builder/__init__.py +++ b/op_builder/__init__.py @@ -8,6 +8,7 @@ from .transformer import TransformerBuilder from .stochastic_transformer import StochasticTransformerBuilder from .utils import UtilsBuilder +from .async_io import AsyncIOBuilder from .builder import get_default_compute_capatabilities # TODO: infer this list instead of hard coded @@ -19,6 +20,7 @@ SparseAttnBuilder(), TransformerBuilder(), StochasticTransformerBuilder(), - UtilsBuilder() + UtilsBuilder(), + AsyncIOBuilder() ] ALL_OPS = {op.name: op for op in __op_builders__} diff --git a/op_builder/async_io.py b/op_builder/async_io.py new file mode 100644 index 000000000000..749ac8725906 --- /dev/null +++ b/op_builder/async_io.py @@ -0,0 +1,56 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +""" +from .builder import OpBuilder + + +class AsyncIOBuilder(OpBuilder): + BUILD_VAR = "DS_BUILD_AIO" + NAME = "async_io" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.aio.{self.NAME}_op' + + def sources(self): + return [ + 'csrc/aio/py_lib/deepspeed_py_copy.cpp', + 'csrc/aio/py_lib/py_ds_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', + 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', + 'csrc/aio/common/deepspeed_aio_utils.cpp', + 'csrc/aio/common/deepspeed_aio_common.cpp', + 'csrc/aio/common/deepspeed_aio_types.cpp' + ] + + def include_paths(self): + return ['csrc/aio/py_lib', 'csrc/aio/common'] + + def cxx_args(self): + return [ + '-g', + '-Wall', + '-O0', + '-std=c++14', + '-shared', + '-fPIC', + '-Wno-reorder', + '-march=native', + '-fopenmp', + '-laio', + self.simd_width() + ] + + def extra_ldflags(self): + return ['-laio'] + + def is_compatible(self): + aio_libraries = ['libaio-dev'] + aio_compatible = self.libraries_installed(aio_libraries) + if not aio_compatible: + self.warning( + f"{self.NAME} requires the libraries: {aio_libraries} but are missing.") + return super().is_compatible() and aio_compatible diff --git a/tests/unit/modelingpreln.py b/tests/unit/modelingpreln.py index 8fcae8bcca18..015e8c508cee 100755 --- a/tests/unit/modelingpreln.py +++ b/tests/unit/modelingpreln.py @@ -132,18 +132,15 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): return model -@torch.jit.script def f_gelu(x): return x * 0.5 * (1.0 + torch.erf(x / 1.41421)) -@torch.jit.script def 
bias_gelu(bias, y): x = bias + y return x * 0.5 * (1.0 + torch.erf(x / 1.41421)) -@torch.jit.script def bias_tanh(bias, y): x = bias + y return torch.tanh(x) diff --git a/tests/unit/test_pipe_module.py b/tests/unit/test_pipe_module.py index 61f07a196971..a29d22a2a954 100644 --- a/tests/unit/test_pipe_module.py +++ b/tests/unit/test_pipe_module.py @@ -96,6 +96,6 @@ def _helper(): base_output = base_output.to('cpu') pipe_output = pipe_output.to('cpu') - assert torch.allclose(base_output, pipe_output) + assert torch.allclose(base_output, pipe_output, atol=1e-4) _helper() diff --git a/tests/unit/test_zero_context.py b/tests/unit/test_zero_context.py index 0e5b2e0696e6..9c45b58abf66 100644 --- a/tests/unit/test_zero_context.py +++ b/tests/unit/test_zero_context.py @@ -1,4 +1,7 @@ import os +import sys +from types import SimpleNamespace + import torch import pytest @@ -62,55 +65,59 @@ def test_gather_update(): assert torch.equal(l.weight, torch.zeros_like(l.weight)) -@pytest.mark.skip('WIP') -def test_external_param(): +config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 138. + }, + "zero_optimization": { + "stage": 3, + "stage3_param_persistence_threshold": 1, + } +} + + +def test_ext_param_getattr(): setup_serial_env() - print() - class ExtLinear(torch.nn.Module): - def __init__(self, dim=10, copycat=None): + def __init__(self, dim=16): super().__init__() self.dim = dim - self.linear = torch.nn.Linear(dim, dim) - if copycat is not None: - with deepspeed.zero.GatheredParameters(self.linear.weight, - modifier_rank=0), \ - torch.no_grad(): - self.linear.weight.copy_(copycat.linear.weight) - - if hasattr(self.linear.weight, 'ds_id'): - print('registering') - super().ds_register_external_parameter('samyam', self.linear.weight) + self.linear1 = torch.nn.Linear(dim, dim) + self.linear2 = torch.nn.Linear(dim, dim) def forward(self, input): - yamsam = self.linear(input) - if hasattr(self.linear.weight, 'ds_status'): - assert self.linear.weight.ds_status == ZeroParamStatus.AVAILABLE - jeff = torch.nn.functional.linear(yamsam, self.linear.weight) - return jeff + A = self.linear1(input) + B = self.linear2(A) - l1_base = ExtLinear().half().cuda() - l2_base = ExtLinear().half().cuda() + # external use of self.linear1.weight + C = torch.nn.functional.linear(B, self.linear1.weight) + return C.sum() - input = torch.rand(10).half().cuda() + net = ExtLinear() - l1_base_out = l1_base(input.clone().detach()) - l2_base_out = l2_base(input.clone().detach()) + args = SimpleNamespace(local_rank=0) + engine, optim, _, _ = deepspeed.initialize(args=args, + model=net, + model_parameters=net.parameters(), + config_params=config_dict) - with deepspeed.zero.Init(): - l1_test = ExtLinear(copycat=l1_base).cuda() - #l2_test = ExtLinear(copycat=l2_base).cuda() - assert l1_test.linear.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE - - # XXX l1 and l2 share their external parameter (l2.linear.weight) - - assert l1_test.linear.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE - l1_test_out = l1_test(input.clone().detach()) - #assert torch.allclose(l1_base_out, l1_test_out) + with deepspeed.zero.GatheredParameters(net.linear1.weight): + assert net.linear1.weight.numel() == net.dim**2 - #l2_test_out = l2_test(input.clone().detach()) - #assert torch.allclose(l2_base_out, l2_test_out) + input = torch.rand(net.dim).to(engine.device).half() + loss = engine(input) + engine.backward(loss) + engine.step() 
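+
+    # Note: ``forward`` above uses ``self.linear1.weight`` outside of ``linear1``'s own
+    # forward pass (an "external parameter" in ZeRO-3 terms); DeepSpeed registers such
+    # parameters automatically, so no explicit register_external_parameter call is needed here.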
def test_scatter_halftype(): @@ -122,3 +129,117 @@ def test_scatter_halftype(): y = torch.LongTensor([3, 3]) assert y.dtype == torch.long + + +class DanglingBias(torch.nn.Linear): + def forward(self, *inputs): + out = super().forward(*inputs) + # return the bias to trigger a dangling external param + return out, self.bias + + +class DataClass: + """Just wraps data in an object. """ + def __init__(self, out=None, bias=None): + self.out = out + self.bias = bias + + +class DanglingBiasClass(DanglingBias): + def forward(self, *inputs): + out, bias = super().forward(*inputs) + return DataClass(out=out, bias=bias) + + +class DanglingAttention(torch.nn.Linear): + def __init__(self, dim=16, return_obj=False): + super().__init__(dim, dim) + self.dim = dim + self.return_obj = return_obj + if return_obj: + self.d_linear = DanglingBiasClass(dim, dim) + else: + self.d_linear = DanglingBias(dim, dim) + + def forward(self, input): + out = super().forward(input) + if self.return_obj: + out_obj = self.d_linear(out) + assert out_obj.bias.ds_status == ZeroParamStatus.AVAILABLE + # forward the external param + return out_obj.out, out_obj.bias + else: + out, bias = self.d_linear(out) + assert bias.ds_status == ZeroParamStatus.AVAILABLE + return out, bias + + +class ModelContainer(torch.nn.Module): + def __init__(self, dim=16, return_obj=False): + super().__init__() + self.dim = dim + self.linear1 = torch.nn.Linear(dim, dim) + self.dangler = DanglingAttention(dim, return_obj=return_obj) + + def forward(self, input): + act1 = self.linear1(input) + # bias is actually dangler.d_linear1.bias + act2, bias = self.dangler(act1) + assert bias.ds_status == ZeroParamStatus.AVAILABLE + return (act2 + bias).sum() + + +class DanglingExt(torch.nn.Module): + def __init__(self, dim=16): + super().__init__() + self.dim = dim + self.container = ModelContainer(dim) + + def forward(self, input): + out = self.container(input) + + # Make sure it's at the right level of the stack + assert len(self._external_params) == 0 + assert len(self.container._external_params) == 1 + assert len(self.container.dangler._external_params) == 0 + return out + + +def test_ext_param_return(): + setup_serial_env() + + net = DanglingExt() + + args = SimpleNamespace(local_rank=0) + engine, optim, _, _ = deepspeed.initialize(args=args, + model=net, + model_parameters=net.parameters(), + config_params=config_dict) + + for _ in range(5): + input = torch.rand(net.dim).to(engine.device).half() + loss = engine(input) + engine.backward(loss) + engine.step() + + +@pytest.mark.skip('WIP') +def test_ext_param_returnobj(): + setup_serial_env() + print() + + net = ModelContainer(return_obj=True) + + args = SimpleNamespace(local_rank=0) + engine, optim, _, _ = deepspeed.initialize(args=args, + model=net, + model_parameters=net.parameters(), + config_params=config_dict) + + for _ in range(5): + input = torch.rand(net.dim).to(engine.device).half() + loss = engine(input) + assert len(net._external_params) == 1 + assert len(net.dangler._external_params) == 0 + engine.backward(loss) + engine.step() diff --git a/tests/unit/test_zero_tiled.py b/tests/unit/test_zero_tiled.py new file mode 100644 index 000000000000..a8b63b11d32a --- /dev/null +++ b/tests/unit/test_zero_tiled.py @@ -0,0 +1,169 @@ +import copy + +import torch +import deepspeed +from deepspeed.runtime.zero.tiling import TiledLinear, TiledLinearReturnBias + +import pytest + + +@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2), (5, 5), (32, 32)]) +def test_tiled_init(in_splits, out_splits): + in_f = 
32 + out_f = 40 + base = torch.nn.Linear(in_f, out_f, bias=True) + l = TiledLinear(in_f, + out_f, + bias=True, + init_linear=copy.deepcopy(base), + out_splits=out_splits, + in_splits=in_splits) + + for out_id in range(out_splits): + for in_id in range(in_splits): + local_l = l.linears[out_id][in_id] + assert isinstance(local_l, torch.nn.Linear) + + rstart = l.out_parts[out_id] + rstop = l.out_parts[out_id + 1] + cstart = l.in_parts[in_id] + cstop = l.in_parts[in_id + 1] + + local_out = rstop - rstart + local_in = cstop - cstart + assert local_l.weight.size()[1] == local_in, f'local[{out_id}][{in_id}].size {local_l.weight.size()}' + assert local_l.weight.size()[0] == local_out + + test = base.weight[rstart:rstop, cstart:cstop] + + assert local_l.weight.size() == test.size() + assert torch.equal(local_l.weight.data, test.data) + + if in_id == in_splits - 1: + assert local_l.bias is not None + assert local_l.bias.size()[0] == local_out + else: + assert local_l.bias is None + + +@pytest.mark.parametrize('in_splits,out_splits', [(0, 0), (33, 33)]) +def test_tiled_baddim(in_splits, out_splits): + dim = 32 + with pytest.raises(RuntimeError): + l = TiledLinear(dim, dim, out_splits=out_splits, in_splits=in_splits) + + +@pytest.mark.parametrize('bias', [False, True]) +@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2)]) +@pytest.mark.parametrize('in_f,out_f', [(32, 32), (23, 29), (29, 23)]) +def test_tiled_forward(in_splits, out_splits, bias, in_f, out_f): + base = torch.nn.Linear(in_f, out_f, bias=bias) + test = TiledLinear(in_f, + out_f, + bias=bias, + init_linear=copy.deepcopy(base), + out_splits=out_splits, + in_splits=in_splits) + + inp = torch.rand(in_f) + + base_out = base(copy.deepcopy(inp)) + test_out = test(copy.deepcopy(inp)) + + assert torch.allclose(base_out, test_out, rtol=1e-4) + + +@pytest.mark.parametrize('bias', [False, True]) +@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2)]) +@pytest.mark.parametrize('in_f,out_f', [(32, 32), (23, 29), (29, 23)]) +def test_tiled_backward(in_splits, out_splits, bias, in_f, out_f): + base = torch.nn.Linear(in_f, out_f, bias=bias) + test = TiledLinear(in_f, + out_f, + bias=bias, + init_linear=copy.deepcopy(base), + out_splits=out_splits, + in_splits=in_splits) + + inp = torch.rand(in_f) + + base_out = base(copy.deepcopy(inp)) + test_out = test(copy.deepcopy(inp)) + assert torch.allclose(base_out, test_out, rtol=1e-4) + + base_out.sum().backward() + test_out.sum().backward() + + # compare grads + for row in range(out_splits): + rstart = test.out_parts[row] + rstop = test.out_parts[row + 1] + + for col in range(in_splits): + cstart = test.in_parts[col] + cstop = test.in_parts[col + 1] + + local = test.linears[row][col] + base_grad = base.weight.grad[rstart:rstop, cstart:cstop] + assert torch.allclose(base_grad, local.weight.grad, rtol=1e-4) + + if local.bias is not None: + base_grad = base.bias.grad[rstart:rstop] + assert torch.allclose(base_grad, local.bias.grad, rtol=1e-4) + + +class LinearWrapper(torch.nn.Linear): + """Returns its own bias to simulate Megatron-LM's behavior. + + Megatron-LM optionally delays the bias addition to fuse with a proceeding kernel. 
+ """ + def forward(self, input): + out = super().forward(input) + return out, self.bias + + +@pytest.mark.parametrize('bias', [False, True]) +@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2)]) +@pytest.mark.parametrize('in_f,out_f', [(32, 32), (23, 29), (29, 23)]) +def test_tiled_returnbias_backward(in_splits, out_splits, bias, in_f, out_f): + base = LinearWrapper(in_f, out_f, bias=bias) + test = TiledLinearReturnBias(in_f, + out_f, + bias=bias, + linear_cls=LinearWrapper, + init_linear=copy.deepcopy(base), + out_splits=out_splits, + in_splits=in_splits) + + inp = torch.rand(in_f) + + base_out_t, base_out_b = base(copy.deepcopy(inp)) + test_out_t, test_out_b = test(copy.deepcopy(inp)) + assert torch.allclose(base_out_t, test_out_t, rtol=1e-4) + if base_out_b is None: + assert test_out_b is None + base_out_b = torch.zeros_like(base_out_t) + test_out_b = torch.zeros_like(test_out_t) + else: + assert test_out_b is not None + assert torch.allclose(base_out_b, test_out_b, rtol=1e-4) + + (base_out_t + base_out_b).sum().backward() + (test_out_t + test_out_b).sum().backward() + + # compare grads + for row in range(out_splits): + rstart = test.out_parts[row] + rstop = test.out_parts[row + 1] + + for col in range(in_splits): + cstart = test.in_parts[col] + cstop = test.in_parts[col + 1] + + local = test.linears[row][col] + base_grad = base.weight.grad[rstart:rstop, cstart:cstop] + assert torch.allclose(base_grad, local.weight.grad, rtol=1e-4) + + if local.bias is not None: + base_grad = base.bias.grad[rstart:rstop] + assert torch.allclose(base_grad, local.bias.grad, rtol=1e-4)