From c61a25f0ed83bd2cf0df67c9cce8901f1b04ed49 Mon Sep 17 00:00:00 2001
From: mozga-intel
Date: Thu, 4 Nov 2021 09:00:39 +0100
Subject: [PATCH 01/10] [SRC] Re-format .cc .h files

---
 src/api/_api_internal/_api_internal.cc        |   4 +-
 src/api/operator/numpy/np_tri_op.cc           |   6 +-
 src/c_api/c_api.cc                            |   6 +-
 src/c_api/c_api_ndarray.cc                    |   3 +-
 src/c_api/c_api_symbolic.cc                   |   6 +-
 src/common/cuda/nvtx.h                        |  19 +-
 src/common/cuda/utils.h                       |   4 +-
 src/common/utils.h                            |   4 +-
 src/engine/naive_engine.cc                    |   4 +-
 src/engine/threaded_engine.h                  |   2 +-
 src/engine/threaded_engine_perdevice.cc       |  10 +-
 src/engine/threaded_engine_pooled.cc          |   6 +-
 src/imperative/attach_op_resource_pass.cc     |   5 +-
 src/imperative/exec_pass.h                    |   2 +-
 src/imperative/imperative.cc                  |  19 +-
 src/imperative/imperative_utils.h             |  10 +-
 src/io/iter_prefetcher.h                      |   6 +-
 src/kvstore/comm.h                            |  26 +-
 src/kvstore/gpu_topology.h                    |   4 +-
 src/kvstore/kvstore_dist.h                    |  12 +-
 src/kvstore/p3store_dist.h                    |   2 +-
 src/ndarray/ndarray.cc                        |  16 +-
 src/nnvm/gradient.cc                          |  10 +-
 src/nnvm/plan_memory.cc                       |   6 +-
 src/operator/contrib/adamw.cu                 |   2 +-
 src/operator/contrib/bilinear_resize-inl.h    |  16 +-
 src/operator/contrib/bounding_box-inl.h       |  30 +-
 src/operator/contrib/bounding_box.cu          |   6 +-
 .../contrib/deformable_psroi_pooling.cc       |  40 +-
 .../contrib/deformable_psroi_pooling.cu       |  40 +-
 .../contrib/intgemm/prepare_weight_op.cc      |   6 +-
 src/operator/contrib/multi_lamb.cc            |   8 +-
 src/operator/contrib/multi_lamb.cu            |  12 +-
 src/operator/contrib/multi_lans.cc            |   8 +-
 src/operator/contrib/multi_lans.cu            |  12 +-
 src/operator/contrib/multi_lars-inl.h         |   8 +-
 src/operator/control_flow.cc                  |   6 +-
 src/operator/correlation.cc                   |  12 +-
 src/operator/leaky_relu.cc                    |  12 +-
 src/operator/mxnet_op.h                       |  56 +-
 src/operator/nn/batch_norm-inl.h              |  10 +-
 src/operator/nn/batch_norm.cu                 |  32 +-
 src/operator/nn/concat.cc                     |   4 +-
 src/operator/nn/convolution.cc                |  36 +-
 src/operator/nn/cudnn/cudnn_batch_norm.cu     | 186 ++--
 src/operator/nn/cudnn/cudnn_batch_norm.h      |  14 +-
 src/operator/nn/cudnn/cudnn_convolution-inl.h | 831 +++++++++++++++++
 .../nn/cudnn/cudnn_deconvolution-inl.h        | 852 ++++++++++++++++++
 src/operator/nn/cudnn/cudnn_pooling-inl.h     |  48 +-
 src/operator/nn/dnnl/dnnl_base-inl.h          |   6 +-
 src/operator/nn/dnnl/dnnl_base.cc             |  41 +-
 src/operator/nn/dnnl/dnnl_convolution.cc      |   4 +-
 src/operator/nn/dnnl/dnnl_deconvolution-inl.h |  10 +-
 src/operator/nn/dnnl/dnnl_fully_connected.cc  |   6 +-
 src/operator/nn/dnnl/dnnl_rnn.cc              |  41 +-
 src/operator/nn/pooling-inl.h                 |  12 +-
 src/operator/nn/pooling.cc                    |  33 +-
 src/operator/nn/softmax-inl.h                 |  30 +-
 src/operator/nn/softmax.cc                    |   6 +-
 src/operator/npx_control_flow.cc              |   6 +-
 src/operator/numpy/linalg/np_lstsq.cc         |   6 +-
 src/operator/numpy/linalg/np_norm.cc          |   4 +-
 src/operator/numpy/np_bincount_op.cc          |   6 +-
 src/operator/numpy/np_boolean_mask_assign.cc  |   6 +-
 .../numpy/np_broadcast_reduce_op_value.h      |   6 +-
 src/operator/numpy/np_delete_op-inl.h         |   8 +-
 src/operator/numpy/np_delete_op.cc            |   6 +-
 src/operator/numpy/np_einsum_op-inl.h         |   4 +-
 .../numpy/np_elemwise_broadcast_logic_op.h    |  21 +-
 src/operator/numpy/np_elemwise_broadcast_op.h |   4 +-
 .../numpy/np_elemwise_broadcast_op_add.cc     |  37 +-
 .../numpy/np_elemwise_broadcast_op_add.cu     |   5 +-
 .../numpy/np_elemwise_broadcast_op_mod.cc     |  37 +-
 .../numpy/np_elemwise_broadcast_op_mod.cu     |   5 +-
 .../numpy/np_elemwise_broadcast_op_mul.cc     |  37 +-
 .../numpy/np_elemwise_broadcast_op_mul.cu     |   4 +-
 .../numpy/np_elemwise_broadcast_op_pow.cc     |  38 +-
 .../numpy/np_elemwise_broadcast_op_pow.cu     |   5 +-
 .../numpy/np_elemwise_broadcast_op_scalar.cc  |  32 +-
 .../numpy/np_elemwise_broadcast_op_scalar.cu  |  16 +-
 .../numpy/np_elemwise_broadcast_op_sub.cc     |  37 +-
 .../numpy/np_elemwise_broadcast_op_sub.cu     |   4 +-
 src/operator/numpy/np_insert_op_scalar-inl.h  |   6 +-
 src/operator/numpy/np_insert_op_slice-inl.h   |   6 +-
 src/operator/numpy/np_insert_op_tensor-inl.h  |   6 +-
 src/operator/numpy/np_interp_op.cc            |   6 +-
 src/operator/numpy/np_moments_op.cc           |   6 +-
 src/operator/numpy/np_percentile_op.cc        |   6 +-
 src/operator/numpy/np_true_divide.cc          |   6 +-
 src/operator/numpy/np_unique_op.cc            |   7 +-
 src/operator/numpy/random/np_bernoulli_op.cc  |   6 +-
 .../numpy/random/np_exponential_op.cc         |   6 +-
 src/operator/numpy/random/np_pareto_op.cc     |   6 +-
 src/operator/numpy/random/np_power_op.cc      |   6 +-
 src/operator/numpy/random/np_rayleigh_op.cc   |   6 +-
 src/operator/numpy/random/np_weibull_op.cc    |   6 +-
 src/operator/optimizer_op-inl.h               |   6 +-
 src/operator/optimizer_op.cc                  |   4 +-
 src/operator/optimizer_op.cu                  |   4 +-
 src/operator/random/sampler.h                 |   8 +-
 src/operator/random/shuffle_op.cu             |   4 +-
 src/operator/sequence_last-inl.h              |  12 +-
 src/operator/subgraph/build_subgraph.cc       |   6 +-
 src/operator/subgraph/dnnl/dnnl_conv.cc       |  15 +-
 src/operator/subgraph/dnnl/dnnl_fc.cc         |   4 +-
 .../subgraph/tensorrt/nnvm_to_onnx.cc         |   2 +-
 .../subgraph/tensorrt/onnx_to_tensorrt.h      |  10 +-
 src/operator/subgraph/tensorrt/tensorrt-inl.h |   2 +-
 src/operator/tensor/amp_cast.cc               |  12 +-
 src/operator/tensor/broadcast_reduce-inl.h    |   6 +-
 src/operator/tensor/dot-inl.h                 |  12 +-
 src/operator/tensor/elemwise_binary_op-inl.h  |  16 +-
 .../tensor/elemwise_binary_scalar_op.h        |   4 +-
 src/operator/tensor/histogram.cc              |   6 +-
 src/operator/tensor/la_op-inl.h               |  20 +-
 src/operator/tensor/la_op.h                   |  12 +-
 src/operator/tensor/matrix_op.cu              |  10 +-
 src/operator/tensor/reduce_rtc.cc             |  12 +-
 src/operator/tensor/square_sum.cc             |   2 +-
 src/operator/tensor/square_sum.cu             |   2 +-
 src/profiler/aggregate_stats.cc               |  12 +-
 src/runtime/container.cc                      |   4 +-
 src/serialization/cnpy.cc                     |   8 +-
 src/storage/pooled_storage_manager.h          |   2 +-
 124 files changed, 2520 insertions(+), 784 deletions(-)
 create mode 100644 src/operator/nn/cudnn/cudnn_convolution-inl.h
 create mode 100644 src/operator/nn/cudnn/cudnn_deconvolution-inl.h

diff --git a/src/api/_api_internal/_api_internal.cc b/src/api/_api_internal/_api_internal.cc
index dc0dac811037..82d86d105065 100644
--- a/src/api/_api_internal/_api_internal.cc
+++ b/src/api/_api_internal/_api_internal.cc
@@ -62,8 +62,8 @@ MXNET_REGISTER_GLOBAL("_ADT").set_body([](runtime::MXNetArgs args, runtime::MXNe
       ObjectRef input = NDArrayHandle(array);
       data.push_back(input);
     } else if (args[i].type_code() != kNull) {
-      ObjectRef input = String::CanConvertFrom(args[i]) ? args[i].operator String()
-                                                        : args[i].operator ObjectRef();
+      ObjectRef input = String::CanConvertFrom(args[i]) ? args[i].operator String() :
+                                                          args[i].operator ObjectRef();
       data.push_back(input);
     } else {
       data.emplace_back(nullptr);
diff --git a/src/api/operator/numpy/np_tri_op.cc b/src/api/operator/numpy/np_tri_op.cc
index 915c68ca4eb0..dacc8953bc43 100644
--- a/src/api/operator/numpy/np_tri_op.cc
+++ b/src/api/operator/numpy/np_tri_op.cc
@@ -39,9 +39,9 @@ MXNET_REGISTER_API("_npi.tri").set_body([](runtime::MXNetArgs args, runtime::MXN
     param.M = args[1].operator nnvm::dim_t();
   }
   param.k = args[2].operator int();
-  param.dtype = args[3].type_code() == kNull
-                    ? mshadow::kFloat32
-                    : String2MXNetTypeWithBool(args[3].operator std::string());
+  param.dtype = args[3].type_code() == kNull ?
+                    mshadow::kFloat32 :
+                    String2MXNetTypeWithBool(args[3].operator std::string());
   if (args[4].type_code() != kNull) {
     attrs.dict["ctx"] = args[4].operator std::string();
   }
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 8bb2b54bcc8d..d69db4eebe23 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -2822,8 +2822,8 @@ int MXDataIterGetLabel(DataIterHandle handle, NDArrayHandle* out) {
   // TODO(tianjun) make label 1D when label_width=0
   mxnet::TShape shape = no_label ? TShape({
                                        1,
-                                   })
-                                   : db.data[1].shape();
+                                   }) :
+                                   db.data[1].shape();
   if (no_label || shape.Size() < 1) {
     // it's possible that label is not available and not required
     // but we need to bypass the invalid copy
@@ -3947,7 +3947,7 @@ int MXShallowCopyNDArray(NDArrayHandle src_handle, NDArrayHandle* out) {
   API_END_HANDLE_ERROR(delete ret);
 }
 
-int MXNVTXRangePush(const char * name, mx_uint color) {
+int MXNVTXRangePush(const char* name, mx_uint color) {
   API_BEGIN();
 #if MXNET_USE_CUDA && MXNET_USE_NVTX
   mxnet::common::cuda::nvtx::gpuRangeStart(color, name);
diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc
index 13c200cd0dd6..2e9c0a373621 100644
--- a/src/c_api/c_api_ndarray.cc
+++ b/src/c_api/c_api_ndarray.cc
@@ -334,8 +334,7 @@ int MXAutogradMarkVariables(uint32_t num_var,
   API_END();
 }
 
-int MXAutogradDropGrads(uint32_t num_var,
-                        NDArrayHandle *var_handles) {
+int MXAutogradDropGrads(uint32_t num_var, NDArrayHandle* var_handles) {
   API_BEGIN();
   std::vector<NDArray*> variables;
   variables.reserve(num_var);
diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc
index 1e12b3f6b46d..82cccd879511 100644
--- a/src/c_api/c_api_symbolic.cc
+++ b/src/c_api/c_api_symbolic.cc
@@ -1195,9 +1195,9 @@ int MXGenBackendSubgraph(SymbolHandle sym_handle,
     const auto& subgraph_prop_list = backend->GetSubgraphProperties();
     for (auto property : subgraph_prop_list) {
       if (property->HasAttr("disable") && property->GetAttr<bool>("disable") == true) {
-        auto full_name = property->HasAttr("property_name")
-                             ? property->GetAttr<std::string>("property_name")
-                             : std::string();
+        auto full_name = property->HasAttr("property_name") ?
+ property->GetAttr("property_name") : + std::string(); LOG(INFO) << "subgraph property " << full_name << " from backend " << backend_name << " is disabled."; continue; diff --git a/src/common/cuda/nvtx.h b/src/common/cuda/nvtx.h index 4142ee112f1e..ae67c623fe41 100644 --- a/src/common/cuda/nvtx.h +++ b/src/common/cuda/nvtx.h @@ -34,8 +34,7 @@ namespace cuda { class NVTXDuration { public: - explicit NVTXDuration(const char *name) noexcept - : range_id_(0), name_(name) {} + explicit NVTXDuration(const char* name) noexcept : range_id_(0), name_(name) {} inline void start() { range_id_ = nvtxRangeStartA(name_); @@ -47,7 +46,7 @@ class NVTXDuration { private: nvtxRangeId_t range_id_; - const char *name_; + const char* name_; }; // Utility class for NVTX @@ -68,19 +67,19 @@ class nvtx { static void gpuRangeStart(const uint32_t rgb, const std::string& range_name) { nvtxEventAttributes_t att; - att.version = NVTX_VERSION; - att.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; - att.colorType = NVTX_COLOR_ARGB; - att.color = rgb | 0xff000000; - att.messageType = NVTX_MESSAGE_TYPE_ASCII; + att.version = NVTX_VERSION; + att.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + att.colorType = NVTX_COLOR_ARGB; + att.color = rgb | 0xff000000; + att.messageType = NVTX_MESSAGE_TYPE_ASCII; att.message.ascii = range_name.c_str(); nvtxRangePushEx(&att); } // Utility to map a range name prefix to a random color based on its hash static uint32_t nameToColor(const std::string& range_name, int prefix_len) { - static std::vector colors{kRed, kGreen, kBlue, kYellow, kOrange, kRed1, kMagenta, - kViolet, kBlue1, kCyan, kGreen1}; + static std::vector colors{ + kRed, kGreen, kBlue, kYellow, kOrange, kRed1, kMagenta, kViolet, kBlue1, kCyan, kGreen1}; std::string s(range_name, 0, prefix_len); std::hash hash_fn; return colors[hash_fn(s) % colors.size()]; diff --git a/src/common/cuda/utils.h b/src/common/cuda/utils.h index 0290fabe7aec..35330c445396 100644 --- a/src/common/cuda/utils.h +++ b/src/common/cuda/utils.h @@ -739,8 +739,8 @@ static inline __device__ void atomicAdd(mshadow::half::half_t* address, mshadow: mshadow::half::half_t hsum; hsum.half_ = reinterpret_cast(address) & 2 ? (old >> 16) : (old & 0xffff); hsum += val; - old = reinterpret_cast(address) & 2 ? (old & 0xffff) | (hsum.half_ << 16) - : (old & 0xffff0000) | hsum.half_; + old = reinterpret_cast(address) & 2 ? (old & 0xffff) | (hsum.half_ << 16) : + (old & 0xffff0000) | hsum.half_; old = atomicCAS(address_as_ui, assumed, old); } while (assumed != old); } diff --git a/src/common/utils.h b/src/common/utils.h index 15e676c816c9..180295a14902 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -711,8 +711,8 @@ FCompType GetFCompute(const nnvm::Op* op, const std::string& name, const Context */ template constexpr size_t MaxIntegerValue() { - return std::is_integral::value ? std::numeric_limits::max() - : size_t(2) << (std::numeric_limits::digits - 1); + return std::is_integral::value ? std::numeric_limits::max() : + size_t(2) << (std::numeric_limits::digits - 1); } template <> diff --git a/src/engine/naive_engine.cc b/src/engine/naive_engine.cc index ad24af1dabe9..25841e072cda 100644 --- a/src/engine/naive_engine.cc +++ b/src/engine/naive_engine.cc @@ -254,8 +254,8 @@ class NaiveEngine final : public Engine { #endif /*! * \brief Holding a shared_ptr to the object pool to prevent it from being destructed too early - * See also #309 (https://github.com/apache/mxnet/issues/309) and similar fix in threaded_engine.h. 
- * Without this, segfaults seen on CentOS7 in + * See also #309 (https://github.com/apache/mxnet/issues/309) and similar fix in + * threaded_engine.h. Without this, segfaults seen on CentOS7 in * test_operator_gpu.py:test_convolution_multiple_streams */ std::shared_ptr > objpool_opr_ref_; diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h index a9e08a80aadc..4aebd08a6efb 100644 --- a/src/engine/threaded_engine.h +++ b/src/engine/threaded_engine.h @@ -368,7 +368,7 @@ class ThreadedEngine : public Engine { new profiler::ProfileOperator(threaded_opr->opr_name.c_str(), attrs.release())); opr_block->opr_profile->startForDevice(ctx.dev_type, ctx.dev_id); } - const bool debug_info = (engine_info_ && debug_push_opr_ == opr_block); + const bool debug_info = (engine_info_ && debug_push_opr_ == opr_block); if (debug_info) { LOG(INFO) << "ExecuteOprBlock " << opr_block << "shutdown_phase=" << shutdown_phase_; } diff --git a/src/engine/threaded_engine_perdevice.cc b/src/engine/threaded_engine_perdevice.cc index b566e4417a41..79e8eaa53909 100644 --- a/src/engine/threaded_engine_perdevice.cc +++ b/src/engine/threaded_engine_perdevice.cc @@ -311,12 +311,10 @@ class ThreadedEnginePerDevice : public ThreadedEngine { while (task_queue->Pop(&opr_block)) { #if MXNET_USE_NVTX - auto nvtx_name = opr_block->opr->opr_name != "" ? opr_block->opr->opr_name : "Op"; - auto end_pos = nvtx_name.find('{'); - auto name_prefix_len = end_pos != std::string::npos - ? end_pos - : nvtx_name.size(); - auto color = common::cuda::nvtx::nameToColor(nvtx_name, name_prefix_len); + auto nvtx_name = opr_block->opr->opr_name != "" ? opr_block->opr->opr_name : "Op"; + auto end_pos = nvtx_name.find('{'); + auto name_prefix_len = end_pos != std::string::npos ? end_pos : nvtx_name.size(); + auto color = common::cuda::nvtx::nameToColor(nvtx_name, name_prefix_len); common::cuda::nvtx::gpuRangeStart(color, nvtx_name); #endif auto* info = ThreadedEngine::GPUWorkerSyncInfo::New(); diff --git a/src/engine/threaded_engine_pooled.cc b/src/engine/threaded_engine_pooled.cc index 0ec91b23e260..fd29f6daacc3 100644 --- a/src/engine/threaded_engine_pooled.cc +++ b/src/engine/threaded_engine_pooled.cc @@ -66,7 +66,7 @@ class ThreadedEnginePooled : public ThreadedEngine { thread_pool_ = nullptr; io_thread_pool_ = nullptr; streams_->Finalize(); - streams_ = nullptr; + streams_ = nullptr; } void Stop() override { @@ -154,8 +154,8 @@ class ThreadedEnginePooled : public ThreadedEngine { } bool is_copy = (opr_block->opr->prop == FnProperty::kCopyFromGPU || opr_block->opr->prop == FnProperty::kCopyToGPU); - auto&& rctx = is_copy ? streams_->GetIORunContext(opr_block->ctx) - : streams_->GetRunContext(opr_block->ctx); + auto&& rctx = is_copy ? streams_->GetIORunContext(opr_block->ctx) : + streams_->GetRunContext(opr_block->ctx); #if MXNET_USE_CUDA CallbackOnStart on_start; CallbackOnComplete callback; diff --git a/src/imperative/attach_op_resource_pass.cc b/src/imperative/attach_op_resource_pass.cc index f4ac4b1257bc..17d6d7a41dc3 100644 --- a/src/imperative/attach_op_resource_pass.cc +++ b/src/imperative/attach_op_resource_pass.cc @@ -52,8 +52,9 @@ void AttachOpResources(const Graph& g, const bool rsc_req = (fresource.count(op) != 0); const bool rsc_ex_req = (fresource_ex.count(op) != 0); if (rsc_req || rsc_ex_req) { - auto reqs = rsc_ex_req ? fresource_ex[op](inode.source->attrs, dev_masks[nid], vdispatch[nid]) - : fresource[op](inode.source->attrs); + auto reqs = rsc_ex_req ? 
+ fresource_ex[op](inode.source->attrs, dev_masks[nid], vdispatch[nid]) : + fresource[op](inode.source->attrs); // Get the resource of temporal space. for (const ResourceRequest& req : reqs) { switch (req.type) { diff --git a/src/imperative/exec_pass.h b/src/imperative/exec_pass.h index acecd7080d2b..7667d97632fc 100644 --- a/src/imperative/exec_pass.h +++ b/src/imperative/exec_pass.h @@ -287,7 +287,7 @@ inline Graph MXGradient( std::string copy_op_str = std::string(), mxnet::ShapeVector in_arg_shapes = mxnet::ShapeVector(), DTypeVector in_arg_dtypes = DTypeVector(), - std::vector us = std::vector() ) { + std::vector us = std::vector()) { graph.attrs["grad_ys"] = std::make_shared(std::move(ys)); graph.attrs["grad_xs"] = std::make_shared(std::move(xs)); graph.attrs["grad_ys_out_grad"] = std::make_shared(std::move(ys_out_grad)); diff --git a/src/imperative/imperative.cc b/src/imperative/imperative.cc index af1ee097ac1e..b9bdaac9476f 100644 --- a/src/imperative/imperative.cc +++ b/src/imperative/imperative.cc @@ -161,7 +161,7 @@ void Imperative::MarkVariables(const std::vector& variables, } else { AGInfo& info = AGInfo::Get(variables[i]->autograd_entry_.node); CHECK_EQ(info.out_grads.size(), 0) - <<"The node has already been marked. Cannot mark it again."; + << "The node has already been marked. Cannot mark it again."; info.out_grads.emplace_back(gradients[i]->Detach()); info.grad_req = static_cast(grad_reqs[i]); info.ctx = variables[i]->ctx(); @@ -175,7 +175,7 @@ void Imperative::DropGrads(const std::vector& variables) { if (variable->autograd_entry_.node) { AGInfo& info = AGInfo::Get(variable->autograd_entry_.node); CHECK_NE(info.out_grads.size(), 0) - <<"The node has empty out_grads already. Cannot DropGrads again."; + << "The node has empty out_grads already. 
Cannot DropGrads again."; for (auto grad : info.out_grads) { grad.ReInit(); } @@ -188,8 +188,8 @@ void Imperative::DropGrads(const std::vector& variables) { void Imperative::GetBackwardDependency(const nnvm::ObjectPtr& node, uint32_t num_inputs, uint32_t num_outputs, - std::vector *p_save_inputs, - std::vector *p_save_outputs) { + std::vector* p_save_inputs, + std::vector* p_save_outputs) { static auto& fgradient = nnvm::Op::GetAttr("FGradient"); std::vector& save_inputs = *p_save_inputs; std::vector& save_outputs = *p_save_outputs; @@ -609,12 +609,11 @@ std::vector Imperative::Backward(const std::vector& outputs, arrays[eid] = x_grads[i - num_forward_outputs]; ref_count[eid] = 1; } - const std::vector& us_grads = - g_graph.GetAttr>("nleaf_grads"); + const std::vector& us_grads = g_graph.GetAttr>("nleaf_grads"); CHECK_EQ(us_grads.size(), us.size()) - << "Size of queried nleaf_vars and size of their gradients don't match."; + << "Size of queried nleaf_vars and size of their gradients don't match."; for (size_t i = 0; i < us_grads.size(); i++) { - size_t eid = idx.entry_id(us_grads[i]); + size_t eid = idx.entry_id(us_grads[i]); AGInfo& info = AGInfo::Get(us[i].node); if (arrays[eid]->dtype_ == -1) { arrays[eid] = &info.out_grads[0]; @@ -676,8 +675,8 @@ std::vector Imperative::Backward(const std::vector& outputs, array_reqs[eid] = x_reqs[i - num_forward_outputs]; } for (size_t i = 0; i < us_grads.size(); i++) { - size_t eid = idx.entry_id(us_grads[i]); - AGInfo& info = AGInfo::Get(us[i].node); + size_t eid = idx.entry_id(us_grads[i]); + AGInfo& info = AGInfo::Get(us[i].node); array_reqs[eid] = info.grad_req; } diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index b649958fa534..ce1a60fb2b20 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -353,8 +353,8 @@ inline void SetDependency(const nnvm::NodeAttrs& attrs, if (rsc_req || rsc_ex_req) { int ntmp = 0; auto resource_reqs = rsc_ex_req ? ftmp_resource_ex[attrs.op]( - attrs, static_cast(ctx.dev_mask()), dispatch_mode) - : ftmp_resource[attrs.op](attrs); + attrs, static_cast(ctx.dev_mask()), dispatch_mode) : + ftmp_resource[attrs.op](attrs); for (const auto& req : resource_reqs) { switch (req.type) { case ResourceRequest::kTempSpace: @@ -1318,9 +1318,9 @@ inline void CreateEngineOpSeg(const nnvm::IndexedGraph& idx, const auto& inode = idx[nid]; opr_names += op_name; opr_names += "{name=" + inode.source->attrs.name + ";"; - const std::unordered_map &dict = inode.source->attrs.dict; - auto num_dict_entries = dict.size(); - for (auto &k : dict) { + const std::unordered_map& dict = inode.source->attrs.dict; + auto num_dict_entries = dict.size(); + for (auto& k : dict) { opr_names += k.first + "=" + k.second; if (--num_dict_entries != 0) opr_names += ";"; diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h index 91d70576bc9d..5f859b3d2bfe 100644 --- a/src/io/iter_prefetcher.h +++ b/src/io/iter_prefetcher.h @@ -87,9 +87,9 @@ class PrefetcherIter : public IIterator { (*dptr)->index.resize(batch.batch_size); for (size_t i = 0; i < batch.data.size(); ++i) { auto dtype = param_.dtype ? param_.dtype.value() : batch.data[i].type_flag_; - auto ctx = ((param_.ctx == PrefetcherParam::kCPUPinned) && (param_.device_id >= 0)) - ? Context::CPUPinned(param_.device_id) - : Context::CPU(); + auto ctx = ((param_.ctx == PrefetcherParam::kCPUPinned) && (param_.device_id >= 0)) ? 
+ Context::CPUPinned(param_.device_id) : + Context::CPU(); (*dptr)->data.at(i) = NDArray(batch.data[i].shape_, ctx, false, dtype); } } diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 5a1df937f6eb..5fdb0e912103 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -206,9 +206,9 @@ class CommCPU : public Comm { Engine::CallbackOnComplete on_complete) { on_start(); NDArray out = buf_merged; - is_serial_push_ - ? ReduceSumCPUExSerial(reduce, &out) - : mxnet::ndarray::ElementwiseSum(rctx.get_stream(), rsc, reduce, &out); + is_serial_push_ ? + ReduceSumCPUExSerial(reduce, &out) : + mxnet::ndarray::ElementwiseSum(rctx.get_stream(), rsc, reduce, &out); on_complete(); }, Context::CPU(), @@ -263,10 +263,10 @@ class CommCPU : public Comm { const bool is_same_ctx = out->ctx() == src.ctx(); const bool is_diff_var = out->var() != src.var(); NDArray retained_cpu = - (is_same_ctx && is_diff_var) - ? *out - : NDArray( - kRowSparseStorage, src.shape(), src.ctx(), true, src.dtype(), src.aux_types()); + (is_same_ctx && is_diff_var) ? + *out : + NDArray( + kRowSparseStorage, src.shape(), src.ctx(), true, src.dtype(), src.aux_types()); if (!is_diff_var) { common::LogOnce("The output of row_sparse_pull() on key " + std::to_string(key) + "refers to the same NDArray as the one stored in KVStore." @@ -670,13 +670,11 @@ class CommDevice : public Comm { // retain according to indices const bool is_same_ctx = out->ctx() == src.ctx(); const bool is_diff_var = out->var() != src.var(); - NDArray retained_gpu = (is_same_ctx && is_diff_var) ? *out - : NDArray(kRowSparseStorage, - out->shape(), - src.ctx(), - true, - out->dtype(), - out->aux_types()); + NDArray retained_gpu = + (is_same_ctx && is_diff_var) ? + *out : + NDArray( + kRowSparseStorage, out->shape(), src.ctx(), true, out->dtype(), out->aux_types()); if (!is_diff_var) { common::LogOnce("The output of row_sparse_pull() on key " + std::to_string(key) + "refers to the same NDArray as the one stored in KVStore." diff --git a/src/kvstore/gpu_topology.h b/src/kvstore/gpu_topology.h index 319b04000da7..b21cac3c6d48 100644 --- a/src/kvstore/gpu_topology.h +++ b/src/kvstore/gpu_topology.h @@ -588,8 +588,8 @@ inline int KLGenerateBinaryTree(const std::vector& W, parent = (parent == -1) ? GetRoot(P, color, *roots) : parent; int from_cluster = color; - int dest_cluster = (from_cluster == (*cluster_pairs)[i].first) ? (*cluster_pairs)[i].second - : (*cluster_pairs)[i].first; + int dest_cluster = (from_cluster == (*cluster_pairs)[i].first) ? (*cluster_pairs)[i].second : + (*cluster_pairs)[i].first; std::vector candidates; T weight; diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 09612a5aeb60..a80176494e1b 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -507,16 +507,16 @@ class KVStoreDist : public KVStoreLocal { size_t size = recv_buf.shape().Size(); const int dtype = recv_buf.dtype(); const int num_bytes = mshadow::mshadow_sizeof(dtype); - PSKV& pskv = (gradient_compression_->get_type() == CompressionType::kNone) - ? EncodeDefaultKey(key, size, num_bytes) - : EncodeCompressedKey(key, size, false, num_bytes); + PSKV& pskv = (gradient_compression_->get_type() == CompressionType::kNone) ? 
+ EncodeDefaultKey(key, size, num_bytes) : + EncodeCompressedKey(key, size, false, num_bytes); char* data = static_cast(recv_buf.data().dptr_); // false means not to delete data when SArray is deleted auto vals = new ps::SArray(data, size * num_bytes, false); // issue pull - RequestType mode = (gradient_compression_->get_type() != CompressionType::kNone) - ? RequestType::kCompressedPushPull - : RequestType::kDefaultPushPull; + RequestType mode = (gradient_compression_->get_type() != CompressionType::kNone) ? + RequestType::kCompressedPushPull : + RequestType::kDefaultPushPull; const int cmd = GetCommandType(mode, dtype); CHECK_NOTNULL(ps_worker_)->ZPull(pskv.keys, vals, &pskv.lens, cmd, [vals, cb]() { delete vals; diff --git a/src/kvstore/p3store_dist.h b/src/kvstore/p3store_dist.h index 56912cd7abcf..5b5a13f2e346 100644 --- a/src/kvstore/p3store_dist.h +++ b/src/kvstore/p3store_dist.h @@ -88,7 +88,7 @@ class P3StoreDist : public KVStoreDist { char* data = static_cast(send_buf.data().dptr_); // do push. false means no delete ps::SArray vals(data, size, false); - int cmd = GetCommandType(RequestType::kDefaultPushPull, dtype); + int cmd = GetCommandType(RequestType::kDefaultPushPull, dtype); size_t off = 0; auto counter = new std::atomic(pskv.keys.size()); for (size_t idx = 0; idx < pskv.keys.size(); idx++) { diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index cfcdab2e60cf..cdbb764bc535 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -62,13 +62,13 @@ void NDArray::ReInit(const NDArrayStorageType stype, if (!sparseStorage && stype != kCSRStorage) LOG(FATAL) << "Unknown storage type " << stype; - const auto& aux_types = (pAux_types && pAux_types->size()) - ? *pAux_types - : std::vector(sparseStorage ? 1 : 2, mshadow::kInt64); + const auto& aux_types = (pAux_types && pAux_types->size()) ? + *pAux_types : + std::vector(sparseStorage ? 1 : 2, mshadow::kInt64); - const auto& aux_shapes = (pAux_shapes && pAux_shapes->size()) - ? *pAux_shapes - : ShapeVector(sparseStorage ? 1 : 2, TShape(mshadow::Shape1(0))); + const auto& aux_shapes = (pAux_shapes && pAux_shapes->size()) ? + *pAux_shapes : + ShapeVector(sparseStorage ? 
                                 1 : 2, TShape(mshadow::Shape1(0)));
 
   mxnet::TShape storage_shape;
   if (!pStorage_shapes || !pStorage_shapes->Size()) {
@@ -2435,9 +2435,7 @@ void NDArray::SyncCheckFormat(const bool full_check) const {
   } else {
 #if MXNET_USE_CUDA
     Engine::Get()->PushSync(
-        [&](RunContext rctx) {
-          common::CheckFormatWrapper<gpu>(rctx, *this, err_cpu, full_check);
-        },
+        [&](RunContext rctx) { common::CheckFormatWrapper<gpu>(rctx, *this, err_cpu, full_check); },
         this->ctx(),
         {this->var()},
         {},
diff --git a/src/nnvm/gradient.cc b/src/nnvm/gradient.cc
index 038d287a83d2..f0f48f625a40 100644
--- a/src/nnvm/gradient.cc
+++ b/src/nnvm/gradient.cc
@@ -88,8 +88,7 @@ Graph Gradient(Graph src) {
   const std::vector<NodeEntry>& ys_out_grad =
       src.GetAttr<std::vector<NodeEntry> >("grad_ys_out_grad");
   CHECK_EQ(ys.size(), ys_out_grad.size());
-  const std::vector<NodeEntry>& us =
-      src.GetAttr<std::vector<NodeEntry> >("grad_us");
+  const std::vector<NodeEntry>& us = src.GetAttr<std::vector<NodeEntry> >("grad_us");
 
   // initialize a topological order of the graph nodes and `output_grads`
   // that maps every operator node to its gradient entries
@@ -506,7 +505,6 @@ inline bool CheckGradAllZero(const std::vector<NodeEntry>& grads,
   return true;
 }
 
-
 Graph BuildGradientGraph(const Graph& src,
                          const std::vector<NodeEntry>& xs,
                          const std::vector<ObjectPtr>& topo_order,
@@ -546,9 +544,9 @@ Graph BuildGradientGraph(const Graph& src,
   if (src.attrs.count("zero_ops") != 0) {
     zero_ops = src.GetAttr<std::vector<const Op*> >("zero_ops");
   }
-  const Op* copy_op = (src.attrs.count("copy_op_str") != 0)
-                          ? Op::Get(src.GetAttr<std::string>("copy_op_str"))
-                          : nullptr;
+  const Op* copy_op = (src.attrs.count("copy_op_str") != 0) ?
+                          Op::Get(src.GetAttr<std::string>("copy_op_str")) :
+                          nullptr;
 
   std::vector out_agg_grads;
   for (auto topo_order_rit = topo_order.rbegin(); topo_order_rit != topo_order.rend();
diff --git a/src/nnvm/plan_memory.cc b/src/nnvm/plan_memory.cc
index 73f494334854..3859497e466f 100644
--- a/src/nnvm/plan_memory.cc
+++ b/src/nnvm/plan_memory.cc
@@ -379,9 +379,9 @@ Graph MXPlanMemory(Graph ret) {
   size_t min_allocated_bytes = -1;
   size_t max_match_range     = dmlc::GetEnv("NNVM_EXEC_MATCH_RANGE", 16);
   size_t min_match_range =
-      dmlc::GetEnv("MXNET_MEMORY_OPT", 0) || dmlc::GetEnv("NNVM_AUTO_SEARCH_MATCH_RANGE", false)
-          ? 1
-          : max_match_range;
+      dmlc::GetEnv("MXNET_MEMORY_OPT", 0) || dmlc::GetEnv("NNVM_AUTO_SEARCH_MATCH_RANGE", false) ?
+      1 :
+      max_match_range;
   for (size_t match_range = min_match_range; match_range <= max_match_range; match_range *= 2) {
     // Make a copy of related fields
     StorageVector storage_vec(storage);
diff --git a/src/operator/contrib/adamw.cu b/src/operator/contrib/adamw.cu
index c3b83f412ae9..b67ea10e26a3 100644
--- a/src/operator/contrib/adamw.cu
+++ b/src/operator/contrib/adamw.cu
@@ -29,7 +29,7 @@ namespace op {
 namespace adamw {
 
 template <>
-void GetScaleFloat<gpu>(mshadow::Stream<gpu>* s, const TBlob& scale_blob, float* pScalef) {
+void GetScaleFloat<gpu>(mshadow::Stream<gpu>* s, const TBlob& scale_blob, float* pScalef) {
   MSHADOW_REAL_TYPE_SWITCH(
       scale_blob.type_flag_,
       DType,
diff --git a/src/operator/contrib/bilinear_resize-inl.h b/src/operator/contrib/bilinear_resize-inl.h
index acab01adf3d1..be57acc36ce1 100644
--- a/src/operator/contrib/bilinear_resize-inl.h
+++ b/src/operator/contrib/bilinear_resize-inl.h
@@ -132,8 +132,8 @@ static inline DType area_pixel_compute_scale(int64_t input_size,
    * src_idx + 0.5 = scale * (dst_index + 0.5)
    */
   if (output_size > 1) {
-    return align_corners ? static_cast<DType>(input_size - 1) / (output_size - 1)
-                         : static_cast<DType>(input_size) / output_size;
+    return align_corners ?
static_cast(input_size - 1) / (output_size - 1) : + static_cast(input_size) / output_size; } else { return DType(0); } @@ -270,12 +270,12 @@ static bool BilinearSampleOpInferShape(const nnvm::NodeAttrs& attrs, break; } case bilinear_resize::odd_scale: { - new_height = ((dshape[2] % 2) == 0) - ? (int16_t)(dshape[2] * param.scale_height.value()) - : (int16_t)((dshape[2] - 1) * param.scale_height.value()) + 1; - new_width = ((dshape[3] % 2) == 0) - ? (int16_t)(dshape[3] * param.scale_width.value()) - : (int16_t)((dshape[3] - 1) * param.scale_width.value()) + 1; + new_height = ((dshape[2] % 2) == 0) ? + (int16_t)(dshape[2] * param.scale_height.value()) : + (int16_t)((dshape[2] - 1) * param.scale_height.value()) + 1; + new_width = ((dshape[3] % 2) == 0) ? + (int16_t)(dshape[3] * param.scale_width.value()) : + (int16_t)((dshape[3] - 1) * param.scale_width.value()) + 1; break; } case bilinear_resize::like: { diff --git a/src/operator/contrib/bounding_box-inl.h b/src/operator/contrib/bounding_box-inl.h index 192605316fb7..1fc00e1b1483 100644 --- a/src/operator/contrib/bounding_box-inl.h +++ b/src/operator/contrib/bounding_box-inl.h @@ -943,21 +943,21 @@ struct box_encode { out_masks[a_index + 2] = valid; out_masks[a_index + 3] = valid; out_targets[a_index + 0] = - valid > static_cast(0.5) - ? ((ref_x - a_x) / a_width - static_cast(means[0])) / static_cast(stds[0]) - : static_cast(0.0); - out_targets[a_index + 1] = valid > static_cast(0.5) - ? ((ref_y - a_y) / a_height - static_cast(means[1])) / - static_cast(stds[1]) - : static_cast(0.0); - out_targets[a_index + 2] = valid > static_cast(0.5) - ? (log(ref_width / a_width) - static_cast(means[2])) / - static_cast(stds[2]) - : static_cast(0.0); - out_targets[a_index + 3] = valid > static_cast(0.5) - ? (log(ref_height / a_height) - static_cast(means[3])) / - static_cast(stds[3]) - : static_cast(0.0); + valid > static_cast(0.5) ? + ((ref_x - a_x) / a_width - static_cast(means[0])) / static_cast(stds[0]) : + static_cast(0.0); + out_targets[a_index + 1] = valid > static_cast(0.5) ? + ((ref_y - a_y) / a_height - static_cast(means[1])) / + static_cast(stds[1]) : + static_cast(0.0); + out_targets[a_index + 2] = valid > static_cast(0.5) ? + (log(ref_width / a_width) - static_cast(means[2])) / + static_cast(stds[2]) : + static_cast(0.0); + out_targets[a_index + 3] = valid > static_cast(0.5) ? + (log(ref_height / a_height) - static_cast(means[3])) / + static_cast(stds[3]) : + static_cast(0.0); } }; diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index 95fedde22491..ef2b7be50a37 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -489,9 +489,9 @@ __launch_bounds__(NMS::THRESHOLD) __global__ #pragma unroll for (int i = 0; i < n_threads / warp_size; ++i) { uint32_t my_mask = my_next_mask; - my_next_mask = (((i + 1) < n_threads / warp_size) && (my_element_in_batch < topk)) - ? nms_results[(i + 1) * topk * num_batches + my_element] - : full_mask; + my_next_mask = (((i + 1) < n_threads / warp_size) && (my_element_in_batch < topk)) ? 
+ nms_results[(i + 1) * topk * num_batches + my_element] : + full_mask; if (my_warp == i && !__all_sync(full_mask, my_mask == full_mask)) { my_mask = my_mask | earlier_threads_mask; // Loop over warp_size - 1 because the last diff --git a/src/operator/contrib/deformable_psroi_pooling.cc b/src/operator/contrib/deformable_psroi_pooling.cc index 411802c031fa..ea878998dc19 100644 --- a/src/operator/contrib/deformable_psroi_pooling.cc +++ b/src/operator/contrib/deformable_psroi_pooling.cc @@ -94,17 +94,17 @@ inline void DeformablePSROIPoolForwardCPU(const index_t count, index_t part_w = floor(static_cast(pw) / pooled_width * part_size); index_t class_id = ctop / channels_each_class; DType trans_x = - no_trans - ? static_cast(0) - : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + - part_w] * - trans_std; + no_trans ? + static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + + part_w] * + trans_std; DType trans_y = - no_trans ? static_cast(0) - : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * - part_size + - part_w] * - trans_std; + no_trans ? + static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + + part_w] * + trans_std; DType wstart = static_cast(pw) * bin_size_w + roi_start_w; wstart += trans_x * roi_width; @@ -246,17 +246,17 @@ inline void DeformablePSROIPoolBackwardAccCPU(const index_t count, index_t part_w = floor(static_cast(pw) / pooled_width * part_size); index_t class_id = ctop / channels_each_class; DType trans_x = - no_trans - ? static_cast(0) - : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + - part_w] * - trans_std; + no_trans ? + static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + + part_w] * + trans_std; DType trans_y = - no_trans ? static_cast(0) - : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * - part_size + - part_w] * - trans_std; + no_trans ? + static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + + part_w] * + trans_std; DType wstart = static_cast(pw) * bin_size_w + roi_start_w; wstart += trans_x * roi_width; diff --git a/src/operator/contrib/deformable_psroi_pooling.cu b/src/operator/contrib/deformable_psroi_pooling.cu index b629fb90887c..82f53a03e0fd 100644 --- a/src/operator/contrib/deformable_psroi_pooling.cu +++ b/src/operator/contrib/deformable_psroi_pooling.cu @@ -94,17 +94,17 @@ __global__ void DeformablePSROIPoolForwardKernel(const index_t count, index_t part_w = floor(static_cast(pw) / pooled_width * part_size); index_t class_id = ctop / channels_each_class; DType trans_x = - no_trans - ? static_cast(0) - : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + - part_w] * - trans_std; + no_trans ? + static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + + part_w] * + trans_std; DType trans_y = - no_trans ? static_cast(0) - : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * - part_size + - part_w] * - trans_std; + no_trans ? 
+ static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + + part_w] * + trans_std; DType wstart = static_cast(pw) * bin_size_w + roi_start_w; wstart += trans_x * roi_width; @@ -248,17 +248,17 @@ __global__ void DeformablePSROIPoolBackwardAccKernel(const index_t count, index_t part_w = floor(static_cast(pw) / pooled_width * part_size); index_t class_id = ctop / channels_each_class; DType trans_x = - no_trans - ? static_cast(0) - : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + - part_w] * - trans_std; + no_trans ? + static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + + part_w] * + trans_std; DType trans_y = - no_trans ? static_cast(0) - : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * - part_size + - part_w] * - trans_std; + no_trans ? + static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + + part_w] * + trans_std; DType wstart = static_cast(pw) * bin_size_w + roi_start_w; wstart += trans_x * roi_width; diff --git a/src/operator/contrib/intgemm/prepare_weight_op.cc b/src/operator/contrib/intgemm/prepare_weight_op.cc index 798fe7621711..a7c3583193f8 100644 --- a/src/operator/contrib/intgemm/prepare_weight_op.cc +++ b/src/operator/contrib/intgemm/prepare_weight_op.cc @@ -160,9 +160,9 @@ The internal representation depends on register length. So AVX512, AVX2, and SS [](const NodeAttrs& attrs) { const PrepareWeightParam& params = nnvm::get(attrs.parsed); - return params.already_quantized - ? std::vector{"weight"} - : std::vector{"weight", "maxabs"}; + return params.already_quantized ? + std::vector{"weight"} : + std::vector{"weight", "maxabs"}; }) .set_attr("FInferShape", PrepareWeightOpShape) .set_attr("FInferType", PrepareWeightOpType) diff --git a/src/operator/contrib/multi_lamb.cc b/src/operator/contrib/multi_lamb.cc index 9afb6503abfb..866567d6aa21 100644 --- a/src/operator/contrib/multi_lamb.cc +++ b/src/operator/contrib/multi_lamb.cc @@ -44,8 +44,8 @@ struct MultiLAMBKernelStep1 { using namespace mshadow_op; for (size_t index = 0; index < kernel_params.ntensors; ++index) { if ((size_t)i < kernel_params.sizes[index]) { - MPDType w = has_mixed_precision ? kernel_params.weights32[index][i] - : MPDType(kernel_params.weights[index][i]); + MPDType w = has_mixed_precision ? kernel_params.weights32[index][i] : + MPDType(kernel_params.weights[index][i]); MPDType scaled_grad = static_cast(kernel_params.grads[index][i]) * rescale_grad; if (clip_gradient >= 0.0f) scaled_grad = mshadow_op::clip::Map(scaled_grad, static_cast(clip_gradient)); @@ -91,8 +91,8 @@ struct MultiLAMBKernelStep2 { const OpReqType req) { for (size_t index = 0; index < kernel_params.ntensors; ++index) { if ((size_t)i < kernel_params.sizes[index]) { - MPDType w = has_mixed_precision ? kernel_params.weights32[index][i] - : MPDType(kernel_params.weights[index][i]); + MPDType w = has_mixed_precision ? 
kernel_params.weights32[index][i] : + MPDType(kernel_params.weights[index][i]); float r1 = sqrt(sum_sq_weigths[index]); float r2 = sqrt(sum_sq_temp_g[index]); if (lower_bound >= 0) diff --git a/src/operator/contrib/multi_lamb.cu b/src/operator/contrib/multi_lamb.cu index 24525f8d8f2d..118ec6348ed7 100644 --- a/src/operator/contrib/multi_lamb.cu +++ b/src/operator/contrib/multi_lamb.cu @@ -72,9 +72,9 @@ __global__ void KernelStep1(const MultiLAMBKernelParam kernel_pa for (int ii = 0; ii < ILP_LAMB; ii++) { int load_pos = i + ii * blockDim.x; if (load_pos < stop_pos && load_pos < kernel_params.sizes[tensor_id]) { - r_weight[ii] = has_mixed_precision - ? kernel_params.weights32[tensor_id][load_pos] - : static_cast(kernel_params.weights[tensor_id][load_pos]); + r_weight[ii] = has_mixed_precision ? + kernel_params.weights32[tensor_id][load_pos] : + static_cast(kernel_params.weights[tensor_id][load_pos]); r_grad[ii] = static_cast(kernel_params.grads[tensor_id][load_pos]); r_mean[ii] = kernel_params.mean[tensor_id][load_pos]; r_var[ii] = kernel_params.var[tensor_id][load_pos]; @@ -145,9 +145,9 @@ __global__ void KernelStep2(const MultiLAMBKernelParam kernel_pa for (int ii = 0; ii < ILP_LAMB; ii++) { int load_pos = i + ii * blockDim.x; if (load_pos < stop_pos && load_pos < kernel_params.sizes[tensor_id]) { - r_weight[ii] = has_mixed_precision - ? kernel_params.weights32[tensor_id][load_pos] - : static_cast(kernel_params.weights[tensor_id][load_pos]); + r_weight[ii] = has_mixed_precision ? + kernel_params.weights32[tensor_id][load_pos] : + static_cast(kernel_params.weights[tensor_id][load_pos]); r_g[ii] = temp_g[kernel_params.tensor2temp_g[tensor_id] + load_pos]; } } diff --git a/src/operator/contrib/multi_lans.cc b/src/operator/contrib/multi_lans.cc index 154a4ce8fb4e..a7bb3ab69a77 100644 --- a/src/operator/contrib/multi_lans.cc +++ b/src/operator/contrib/multi_lans.cc @@ -45,8 +45,8 @@ struct MultiLANSKernelStep1 { using namespace mshadow_op; for (size_t index = 0; index < kernel_params.ntensors; ++index) { if ((size_t)i < kernel_params.sizes[index]) { - MPDType w = has_mixed_precision ? kernel_params.weights32[index][i] - : MPDType(kernel_params.weights[index][i]); + MPDType w = has_mixed_precision ? kernel_params.weights32[index][i] : + MPDType(kernel_params.weights[index][i]); float g_norm = sqrt(g_sq_norm[index]); MPDType scaled_grad = static_cast(kernel_params.grads[index][i]) * rescale_grad; scaled_grad /= g_norm; @@ -95,8 +95,8 @@ struct MultiLANSKernelStep2 { const OpReqType req) { for (size_t index = 0; index < kernel_params.ntensors; ++index) { if ((size_t)i < kernel_params.sizes[index]) { - MPDType w = has_mixed_precision ? kernel_params.weights32[index][i] - : MPDType(kernel_params.weights[index][i]); + MPDType w = has_mixed_precision ? kernel_params.weights32[index][i] : + MPDType(kernel_params.weights[index][i]); float r1 = sqrt(sum_sq_weigths[index]); float r2_m = sqrt(sum_sq_temp_m[index]); float r2_g = sqrt(sum_sq_temp_g[index]); diff --git a/src/operator/contrib/multi_lans.cu b/src/operator/contrib/multi_lans.cu index a57a99e25854..a9f59478cca1 100644 --- a/src/operator/contrib/multi_lans.cu +++ b/src/operator/contrib/multi_lans.cu @@ -72,9 +72,9 @@ __global__ void KernelStep1(const MultiLANSKernelParam kernel_pa for (int ii = 0; ii < ILP_LAMB; ii++) { int load_pos = i + ii * blockDim.x; if (load_pos < stop_pos && load_pos < kernel_params.sizes[tensor_id]) { - r_weight[ii] = has_mixed_precision - ? 
kernel_params.weights32[tensor_id][load_pos] - : static_cast(kernel_params.weights[tensor_id][load_pos]); + r_weight[ii] = has_mixed_precision ? + kernel_params.weights32[tensor_id][load_pos] : + static_cast(kernel_params.weights[tensor_id][load_pos]); r_grad[ii] = static_cast(kernel_params.grads[tensor_id][load_pos]); r_mean[ii] = kernel_params.mean[tensor_id][load_pos]; r_var[ii] = kernel_params.var[tensor_id][load_pos]; @@ -160,9 +160,9 @@ __global__ void KernelStep2(const MultiLANSKernelParam kernel_pa for (int ii = 0; ii < ILP_LAMB; ii++) { int load_pos = i + ii * blockDim.x; if (load_pos < stop_pos && load_pos < kernel_params.sizes[tensor_id]) { - r_weight[ii] = has_mixed_precision - ? kernel_params.weights32[tensor_id][load_pos] - : static_cast(kernel_params.weights[tensor_id][load_pos]); + r_weight[ii] = has_mixed_precision ? + kernel_params.weights32[tensor_id][load_pos] : + static_cast(kernel_params.weights[tensor_id][load_pos]); r_m[ii] = temp_m[kernel_params.tensor2temp_g[tensor_id] + load_pos]; r_g[ii] = temp_g[kernel_params.tensor2temp_g[tensor_id] + load_pos]; } diff --git a/src/operator/contrib/multi_lars-inl.h b/src/operator/contrib/multi_lars-inl.h index c5fd528c57f1..884e090f759e 100644 --- a/src/operator/contrib/multi_lars-inl.h +++ b/src/operator/contrib/multi_lars-inl.h @@ -68,10 +68,10 @@ struct MultiLARSKernel { bool is_lars_valid = w_norm > 0. && grads_sum_sq[i] > 0.; KERNEL_ASSIGN(out_data[i], req, - is_lars_valid - ? lrs[i] * eta * w_norm / - (sqrtf(grads_sum_sq[i]) * rescale_grad + wds[i] * w_norm + eps) - : lrs[i]); + is_lars_valid ? + lrs[i] * eta * w_norm / + (sqrtf(grads_sum_sq[i]) * rescale_grad + wds[i] * w_norm + eps) : + lrs[i]); } }; diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index 8d52b1aae1ff..4c663206031e 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -726,9 +726,9 @@ static void WhileLoopGradComputeExCPU(const OpStatePtr& state_ptr, } if (i < (size_t)params.num_args - 2U) { // a var - igrads[i] = (step == 0) - ? outputs[i] - : NDArray(outputs[i].shape(), outputs[i].ctx(), true, outputs[i].dtype()); + igrads[i] = (step == 0) ? + outputs[i] : + NDArray(outputs[i].shape(), outputs[i].ctx(), true, outputs[i].dtype()); iter_req[i] = (step == 0 || req[i] == kNullOp) ? req[i] : kWriteTo; ++i; diff --git a/src/operator/correlation.cc b/src/operator/correlation.cc index 582dd28925a2..b57ce86b1a8c 100644 --- a/src/operator/correlation.cc +++ b/src/operator/correlation.cc @@ -135,18 +135,18 @@ inline void CorrelationBackward(const Tensor& out_grad, if ((y1 + h - pad_size_ >= 0) && (x1 + w - pad_size_ >= 0) && (y1 + h < height + pad_size_) && (x1 + w < width + pad_size_)) { Dtype sign = (tmp1[nbatch][y1 + h][x1 + w][channel] >= - tmp2[nbatch][y2 + h][x2 + w][channel]) - ? Dtype(1.0) - : Dtype(-1.0); + tmp2[nbatch][y2 + h][x2 + w][channel]) ? + Dtype(1.0) : + Dtype(-1.0); in_grad1[nbatch][channel][y1 + h - pad_size_][x1 + w - pad_size_] += out_grad[nbatch][top_channel][i][j] * sign / sumelems; } if ((y2 + h - pad_size_ >= 0) && (x2 + w - pad_size_ >= 0) && (y2 + h < height + pad_size_) && (x2 + w < width + pad_size_)) { Dtype sign = (tmp1[nbatch][y1 + h][x1 + w][channel] >= - tmp2[nbatch][y2 + h][x2 + w][channel]) - ? Dtype(-1.0) - : Dtype(1.0); + tmp2[nbatch][y2 + h][x2 + w][channel]) ? 
+ Dtype(-1.0) : + Dtype(1.0); in_grad2[nbatch][channel][y2 + h - pad_size_][x2 + w - pad_size_] += out_grad[nbatch][top_channel][i][j] * sign / sumelems; } diff --git a/src/operator/leaky_relu.cc b/src/operator/leaky_relu.cc index dd331ade231c..ff2ce4aae2a4 100644 --- a/src/operator/leaky_relu.cc +++ b/src/operator/leaky_relu.cc @@ -181,17 +181,17 @@ The following modified ReLU Activation functions are supported: [](const NodeAttrs& attrs) { const LeakyReLUParam& param = nnvm::get(attrs.parsed); - return param.act_type == leakyrelu::kPReLU - ? std::vector{"data", "gamma"} - : std::vector{"data"}; + return param.act_type == leakyrelu::kPReLU ? + std::vector{"data", "gamma"} : + std::vector{"data"}; }) .set_attr("FListOutputNames", [](const NodeAttrs& attrs) { const LeakyReLUParam& param = nnvm::get(attrs.parsed); - return param.act_type == leakyrelu::kRReLU - ? std::vector{"output", "mask"} - : std::vector{"output"}; + return param.act_type == leakyrelu::kRReLU ? + std::vector{"output", "mask"} : + std::vector{"output"}; }) .set_attr("FInferShape", LeakyReLUShape) .set_attr("FInferType", LeakyReLUType) diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index 09e42481a66b..72f7b294b9f9 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -580,36 +580,34 @@ struct AccType { .add_enum("int64", mshadow::kInt64) \ .add_enum("bool", mshadow::kBool) -#define MXNET_ADD_ALL_TYPES_EXT \ - .add_enum("float32", mshadow::kFloat32) \ - .add_enum("float64", mshadow::kFloat64) \ - .add_enum("float16", mshadow::kFloat16) \ - .add_enum("bfloat16", mshadow::kBfloat16) \ - .add_enum("uint8", mshadow::kUint8) \ - .add_enum("int8", mshadow::kInt8) \ - .add_enum("int32", mshadow::kInt32) \ - .add_enum("int64", mshadow::kInt64) \ - .add_enum("int16", mshadow::kInt16) \ - .add_enum("uint16", mshadow::kUint16) \ - .add_enum("uint32", mshadow::kUint32) \ - .add_enum("uint64", mshadow::kUint64) - - -#define MXNET_ADD_ALL_TYPES_EXT_WITH_BOOL \ - .add_enum("float32", mshadow::kFloat32) \ - .add_enum("float64", mshadow::kFloat64) \ - .add_enum("float16", mshadow::kFloat16) \ - .add_enum("bfloat16", mshadow::kBfloat16) \ - .add_enum("uint8", mshadow::kUint8) \ - .add_enum("int8", mshadow::kInt8) \ - .add_enum("int32", mshadow::kInt32) \ - .add_enum("int64", mshadow::kInt64) \ - .add_enum("bool", mshadow::kBool) \ - .add_enum("int16", mshadow::kInt16) \ - .add_enum("uint16", mshadow::kUint16) \ - .add_enum("uint32", mshadow::kUint32) \ - .add_enum("uint64", mshadow::kUint64) +#define MXNET_ADD_ALL_TYPES_EXT \ + .add_enum("float32", mshadow::kFloat32) \ + .add_enum("float64", mshadow::kFloat64) \ + .add_enum("float16", mshadow::kFloat16) \ + .add_enum("bfloat16", mshadow::kBfloat16) \ + .add_enum("uint8", mshadow::kUint8) \ + .add_enum("int8", mshadow::kInt8) \ + .add_enum("int32", mshadow::kInt32) \ + .add_enum("int64", mshadow::kInt64) \ + .add_enum("int16", mshadow::kInt16) \ + .add_enum("uint16", mshadow::kUint16) \ + .add_enum("uint32", mshadow::kUint32) \ + .add_enum("uint64", mshadow::kUint64) +#define MXNET_ADD_ALL_TYPES_EXT_WITH_BOOL \ + .add_enum("float32", mshadow::kFloat32) \ + .add_enum("float64", mshadow::kFloat64) \ + .add_enum("float16", mshadow::kFloat16) \ + .add_enum("bfloat16", mshadow::kBfloat16) \ + .add_enum("uint8", mshadow::kUint8) \ + .add_enum("int8", mshadow::kInt8) \ + .add_enum("int32", mshadow::kInt32) \ + .add_enum("int64", mshadow::kInt64) \ + .add_enum("bool", mshadow::kBool) \ + .add_enum("int16", mshadow::kInt16) \ + .add_enum("uint16", mshadow::kUint16) \ + 
.add_enum("uint32", mshadow::kUint32) \ + .add_enum("uint64", mshadow::kUint64) /* \brief Compute flattened index given coordinates and shape. */ template diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h index 8b5ff3c76f04..92eded093d9c 100644 --- a/src/operator/nn/batch_norm-inl.h +++ b/src/operator/nn/batch_norm-inl.h @@ -369,8 +369,8 @@ class BNTensor3 { inline BNTensor3(const TBlob& blob, const int indexOfChannel) : dptr_(blob.dptr()), indexOfChannel_(static_cast( - indexOfChannel < 0 ? (static_cast(blob.shape_.ndim()) + indexOfChannel) - : indexOfChannel)) { + indexOfChannel < 0 ? (static_cast(blob.shape_.ndim()) + indexOfChannel) : + indexOfChannel)) { CHECK_EQ(blob.type_flag_, mshadow::DataType::kFlag); shape_[OUTER] = 1; for (size_t i = 0; i < indexOfChannel_; ++i) { @@ -385,9 +385,9 @@ class BNTensor3 { inline BNTensor3(DType* p, const mxnet::TShape& shape, const int indexOfChannel) : dptr_(p), - indexOfChannel_(static_cast(indexOfChannel < 0 - ? (static_cast(shape.ndim()) + indexOfChannel) - : indexOfChannel)) { + indexOfChannel_(static_cast(indexOfChannel < 0 ? + (static_cast(shape.ndim()) + indexOfChannel) : + indexOfChannel)) { shape_[OUTER] = 1; for (size_t i = 0; i < indexOfChannel_; ++i) { shape_[OUTER] *= shape[i]; diff --git a/src/operator/nn/batch_norm.cu b/src/operator/nn/batch_norm.cu index 195423bd1419..29f3f61b6808 100644 --- a/src/operator/nn/batch_norm.cu +++ b/src/operator/nn/batch_norm.cu @@ -280,13 +280,13 @@ __launch_bounds__(inference_forward_threads) __global__ my_channel = my_channel % num_channels; AType current_input = static_cast(scratch.separate[j]); - AType invstd = small_num_channels ? saved_invstd[my_channel] - : variance_to_invstd(runningVar[my_channel], epsilon); + AType invstd = small_num_channels ? saved_invstd[my_channel] : + variance_to_invstd(runningVar[my_channel], epsilon); AType mean = small_num_channels ? saved_mean[my_channel] : runningMean[my_channel]; AType gamma = - small_num_channels - ? saved_weight[my_channel] - : ((weight != nullptr && (flags & FIX_GAMMA_FLAG) == 0) ? weight[my_channel] : 1); + small_num_channels ? + saved_weight[my_channel] : + ((weight != nullptr && (flags & FIX_GAMMA_FLAG) == 0) ? weight[my_channel] : 1); AType beta = small_num_channels ? saved_bias[my_channel] : ((bias != nullptr) ? bias[my_channel] : 0); current_input = gamma * (current_input - mean) * invstd + beta; @@ -346,11 +346,11 @@ __global__ void BatchNormalizationUpdateOutputKernel(DeviceTensor input, } // Write normalized and update the output - const AccReal gamma = ((flags & FIX_GAMMA_FLAG) == 0 && weight.numElements() > 0) - ? ScalarConvert::to(weight[plane]) - : ScalarConvert::to(1); - const AccReal beta = bias.numElements() > 0 ? ScalarConvert::to(bias[plane]) - : ScalarConvert::to(0); + const AccReal gamma = ((flags & FIX_GAMMA_FLAG) == 0 && weight.numElements() > 0) ? + ScalarConvert::to(weight[plane]) : + ScalarConvert::to(1); + const AccReal beta = bias.numElements() > 0 ? ScalarConvert::to(bias[plane]) : + ScalarConvert::to(0); for (int batch = 0, nbatch = input.OuterSize(); batch < nbatch; ++batch) { for (int x = threadIdx.x, nx = input.InnerSize(); x < nx; x += blockDim.x) { const DType inp = input.get_ref(batch, plane, x); @@ -648,9 +648,9 @@ static __global__ void BatchNormalizationBackwardKernel(const DeviceTensor input mean = ScalarConvert::to(tensors.saveMean[plane]); invstd = tensors.saveInvStd[plane]; - const AccReal weightVal = ((flags & FIX_GAMMA_FLAG) == 0 && tensors.weight.numElements() > 0) - ? 
ScalarConvert::to(tensors.weight[plane]) - : AccReal(1); + const AccReal weightVal = ((flags & FIX_GAMMA_FLAG) == 0 && tensors.weight.numElements() > 0) ? + ScalarConvert::to(tensors.weight[plane]) : + AccReal(1); const AccReal norm = AccReal(1) / N; // Compute two values across (batch, x/y/z) in one pass: @@ -951,9 +951,9 @@ static void BatchNormalizationBackward(mshadow::Stream* s, if (tensors.gradBias.numElements() <= 0) { flags_copy = (flags_copy & ~WRITE_BETA_FLAG); } - AccReal* gamma = ((flags & FIX_GAMMA_FLAG) == 0 && tensors.weight.numElements() > 0) - ? tensors.weight.dptr_ - : nullptr; + AccReal* gamma = ((flags & FIX_GAMMA_FLAG) == 0 && tensors.weight.numElements() > 0) ? + tensors.weight.dptr_ : + nullptr; if (param.axis == -1 || param.axis == in_data[batchnorm::kData].shape_.ndim() - 1) { const int C = gradOutput.ChannelCount(); diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc index 6206c8e809bf..f5a6f7f52ca9 100644 --- a/src/operator/nn/concat.cc +++ b/src/operator/nn/concat.cc @@ -255,8 +255,8 @@ bool SupportDNNLConcat(const std::vector& arrs) { // DO not support zero-size tensors. if (arr.shape().Size() == 0) return false; - int ndim = arr.shape().ndim(); - const int dnnl_ndims = arr.GetDNNLData()->get_desc().data.ndims; + int ndim = arr.shape().ndim(); + const int dnnl_ndims = arr.GetDNNLData()->get_desc().data.ndims; if ((ndim != 2 && ndim != 4) || ndim != dnnl_ndims) { return false; } diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index 0e054c0ff07f..787fbc0ef497 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -126,9 +126,9 @@ static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, Shape<3> oshape; oshape[0] = dshape[0]; oshape[1] = param_.num_filter; - oshape[2] = dshape[2] != -1 - ? (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 - : -1; + oshape[2] = dshape[2] != -1 ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : + -1; SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); // Perform incomplete shape inference. Fill in the missing values in data shape. // 1) We can always fill in the batch_size. @@ -177,12 +177,12 @@ static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, Shape<4> oshape; oshape[0] = dshape[0]; oshape[1] = param_.num_filter; - oshape[2] = dshape[2] != -1 - ? (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 - : -1; - oshape[3] = dshape[3] != -1 - ? (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 - : -1; + oshape[2] = dshape[2] != -1 ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : + -1; + oshape[3] = dshape[3] != -1 ? + (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : + -1; SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); // Perform incomplete shape inference. Fill in the missing values in data shape. // 1) We can always fill in the batch_size. @@ -239,15 +239,15 @@ static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, Shape<5> oshape; oshape[0] = dshape[0]; oshape[1] = param_.num_filter; - oshape[2] = dshape[2] != -1 - ? (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 - : -1; - oshape[3] = dshape[3] != -1 - ? (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 - : -1; - oshape[4] = dshape[4] != -1 - ? 
(AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 - : -1; + oshape[2] = dshape[2] != -1 ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : + -1; + oshape[3] = dshape[3] != -1 ? + (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : + -1; + oshape[4] = dshape[4] != -1 ? + (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : + -1; SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value())); // Perform incomplete shape inference. Fill in the missing values in data shape. // 1) We can always fill in the batch_size. diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.cu b/src/operator/nn/cudnn/cudnn_batch_norm.cu index f9c387cebd20..ce3d1e1b1b9b 100644 --- a/src/operator/nn/cudnn/cudnn_batch_norm.cu +++ b/src/operator/nn/cudnn/cudnn_batch_norm.cu @@ -60,18 +60,18 @@ void SetDescriptors(const BatchNormParam& param, const TBlob& x) { CHECK(param.axis == 1 || param.axis == x.shape_.ndim() - 1); cudnnTensorFormat_t format = param.axis == 1 ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC; - int n = x.shape_[0]; - int c = x.shape_[param.axis]; - size_t last_spatial_i = param.axis == 1 ? x.shape_.ndim() - 1 : x.shape_.ndim() - 2; - int w = x.shape_[last_spatial_i]; + int n = x.shape_[0]; + int c = x.shape_[param.axis]; + size_t last_spatial_i = param.axis == 1 ? x.shape_.ndim() - 1 : x.shape_.ndim() - 2; + int w = x.shape_[last_spatial_i]; int h = x.shape_.ProdShape(last_spatial_i - (x.shape_.ndim() - 3), last_spatial_i); MSHADOW_REAL_TYPE_SWITCH(x.type_flag_, DType, { - CUDNN_CALL(cudnnSetTensor4dDescriptor(Globals::Get().io_desc, format, - mshadow::DataType::kCudnnFlag, n, c, h, w)); + CUDNN_CALL(cudnnSetTensor4dDescriptor( + Globals::Get().io_desc, format, mshadow::DataType::kCudnnFlag, n, c, h, w)); }) - CUDNN_CALL(cudnnDeriveBNTensorDescriptor(Globals::Get().mean_desc, Globals::Get().io_desc, - CUDNN_BATCHNORM_SPATIAL)); + CUDNN_CALL(cudnnDeriveBNTensorDescriptor( + Globals::Get().mean_desc, Globals::Get().io_desc, CUDNN_BATCHNORM_SPATIAL)); } mshadow::TypeFlag ParamType(int x_type) { @@ -86,8 +86,10 @@ bool CudnnBatchNormSupports(const BatchNormParam& param, const TBlob& x) { return n >= 3 && (param.axis == 1 || param.axis == n - 1); } -void CudnnBatchNormForward(const BatchNormParam& param, const OpContext& ctx, - const std::vector& inputs, const std::vector& req, +void CudnnBatchNormForward(const BatchNormParam& param, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, const std::vector& outputs) { CHECK_EQ(inputs.size(), 5); if (ctx.is_train) { @@ -106,13 +108,20 @@ void CudnnBatchNormForward(const BatchNormParam& param, const OpContext& ctx, MSHADOW_REAL_TYPE_SWITCH(ParamType(inputs[batchnorm::kData].type_flag_), DType, { DType a = 1.0f; DType b = 0.0f; - if (param.fix_gamma) inputs[batchnorm::kGamma].FlatTo1D(s) = 1.0f; + if (param.fix_gamma) + inputs[batchnorm::kGamma].FlatTo1D(s) = 1.0f; if (ctx.is_train) { size_t workspace_size = 0; CUDNN_CALL(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - s->dnn_handle_, CUDNN_BATCHNORM_SPATIAL_PERSISTENT, CUDNN_BATCHNORM_OPS_BN, - Globals::Get().io_desc, nullptr, Globals::Get().io_desc, Globals::Get().mean_desc, - nullptr, &workspace_size)); + s->dnn_handle_, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT, + CUDNN_BATCHNORM_OPS_BN, + Globals::Get().io_desc, + nullptr, + Globals::Get().io_desc, + Globals::Get().mean_desc, + nullptr, + &workspace_size)); auto workspace = 
ctx.requested[0].get_space_internal(workspace_size, "CudnnBatchNormForward"); // If the lock on the auxiliary states is set, then this implies that @@ -122,30 +131,50 @@ void CudnnBatchNormForward(const BatchNormParam& param, const OpContext& ctx, // the `momentum` to `1` (or `factor` to `0`). double factor = ((dmlc::GetEnv("MXNET_BACKWARD_DO_MIRROR", 0) || dmlc::GetEnv("MXNET_MEMORY_OPT", 0)) && - Globals::Get().internal_aux_states_lock) - ? 0 - : (1 - param.momentum); - CUDNN_CALL(cudnnBatchNormalizationForwardTrainingEx( - s->dnn_handle_, CUDNN_BATCHNORM_SPATIAL_PERSISTENT, CUDNN_BATCHNORM_OPS_BN, &a, &b, - Globals::Get().io_desc, inputs[batchnorm::kData].dptr_, - nullptr, nullptr, // zDesc, zData - Globals::Get().io_desc, outputs[batchnorm::kOut].dptr_, - Globals::Get().mean_desc, - inputs[batchnorm::kGamma].dptr_, inputs[batchnorm::kBeta].dptr_, - factor, inputs[batchnorm::kInMovingMean].dptr_, inputs[batchnorm::kInMovingVar].dptr_, - param.eps, outputs[batchnorm::kMean].dptr_, outputs[batchnorm::kVar].dptr_, - nullptr, // activation desc - workspace, workspace_size, - nullptr, 0)); // reserveSpace, reserveSpaceSizeInBytes + Globals::Get().internal_aux_states_lock) ? + 0 : + (1 - param.momentum); + CUDNN_CALL( + cudnnBatchNormalizationForwardTrainingEx(s->dnn_handle_, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT, + CUDNN_BATCHNORM_OPS_BN, + &a, + &b, + Globals::Get().io_desc, + inputs[batchnorm::kData].dptr_, + nullptr, + nullptr, // zDesc, zData + Globals::Get().io_desc, + outputs[batchnorm::kOut].dptr_, + Globals::Get().mean_desc, + inputs[batchnorm::kGamma].dptr_, + inputs[batchnorm::kBeta].dptr_, + factor, + inputs[batchnorm::kInMovingMean].dptr_, + inputs[batchnorm::kInMovingVar].dptr_, + param.eps, + outputs[batchnorm::kMean].dptr_, + outputs[batchnorm::kVar].dptr_, + nullptr, // activation desc + workspace, + workspace_size, + nullptr, + 0)); // reserveSpace, reserveSpaceSizeInBytes } else { - CUDNN_CALL(cudnnBatchNormalizationForwardInference( - s->dnn_handle_, CUDNN_BATCHNORM_SPATIAL, &a, &b, - Globals::Get().io_desc, inputs[batchnorm::kData].dptr_, - Globals::Get().io_desc, outputs[batchnorm::kOut].dptr_, - Globals::Get().mean_desc, - inputs[batchnorm::kGamma].dptr_, inputs[batchnorm::kBeta].dptr_, - inputs[batchnorm::kInMovingMean].dptr_, inputs[batchnorm::kInMovingVar].dptr_, - param.eps)); + CUDNN_CALL(cudnnBatchNormalizationForwardInference(s->dnn_handle_, + CUDNN_BATCHNORM_SPATIAL, + &a, + &b, + Globals::Get().io_desc, + inputs[batchnorm::kData].dptr_, + Globals::Get().io_desc, + outputs[batchnorm::kOut].dptr_, + Globals::Get().mean_desc, + inputs[batchnorm::kGamma].dptr_, + inputs[batchnorm::kBeta].dptr_, + inputs[batchnorm::kInMovingMean].dptr_, + inputs[batchnorm::kInMovingVar].dptr_, + param.eps)); } }) // Set the lock on the auxiliary states. 
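The `factor` handed to cudnnBatchNormalizationForwardTrainingEx above is the exponential-moving-average coefficient for the running statistics. A minimal sketch of the update it requests (the helper name is illustrative only, not part of this patch):

    // running <- (1 - factor) * running + factor * batch_stat, with
    // factor = 1 - momentum normally, and factor = 0 when the auxiliary-state
    // lock suppresses a duplicate update under mirroring / memory optimization.
    double UpdatedRunningStat(double running, double batch_stat, double momentum, bool locked) {
      const double factor = locked ? 0.0 : (1.0 - momentum);
      return (1.0 - factor) * running + factor * batch_stat;
    }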
@@ -154,23 +183,33 @@ void CudnnBatchNormForward(const BatchNormParam& param, const OpContext& ctx, Globals::Get().internal_aux_states_lock = true; } -void CudnnBatchNormBackward(const BatchNormParam& param, const OpContext& ctx, - const std::vector& inputs, const std::vector& req, +void CudnnBatchNormBackward(const BatchNormParam& param, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, const std::vector& outputs) { CHECK_EQ(inputs.size(), 8); CHECK_EQ(outputs.size(), 3); CHECK_EQ(req.size(), 3); SetDescriptors(param, inputs[3 + batchnorm::kData]); - auto s = ctx.get_stream(); + auto s = ctx.get_stream(); size_t workspace_size = 0; - CUDNN_CALL(cudnnGetBatchNormalizationBackwardExWorkspaceSize( - s->dnn_handle_, CUDNN_BATCHNORM_SPATIAL_PERSISTENT, CUDNN_BATCHNORM_OPS_BN, - Globals::Get().io_desc, Globals::Get().io_desc, Globals::Get().io_desc, nullptr, - Globals::Get().io_desc, Globals::Get().mean_desc, nullptr, &workspace_size)); + CUDNN_CALL(cudnnGetBatchNormalizationBackwardExWorkspaceSize(s->dnn_handle_, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT, + CUDNN_BATCHNORM_OPS_BN, + Globals::Get().io_desc, + Globals::Get().io_desc, + Globals::Get().io_desc, + nullptr, + Globals::Get().io_desc, + Globals::Get().mean_desc, + nullptr, + &workspace_size)); auto workspace = ctx.requested[0].get_space_internal(workspace_size, "CudnnBatchNormBackward"); MSHADOW_REAL_TYPE_SWITCH(ParamType(inputs[3 + batchnorm::kData].type_flag_), DType, { - if (param.fix_gamma) inputs[3 + batchnorm::kGamma].FlatTo1D(s) = 1.0f; + if (param.fix_gamma) + inputs[3 + batchnorm::kGamma].FlatTo1D(s) = 1.0f; bool grad_add_gamma_beta = req[batchnorm::kGamma] == kAddTo || req[batchnorm::kBeta] == kAddTo; if (grad_add_gamma_beta) { if (IsBNWriting(req[batchnorm::kGamma])) @@ -178,28 +217,43 @@ void CudnnBatchNormBackward(const BatchNormParam& param, const OpContext& ctx, if (IsBNWriting(req[batchnorm::kBeta])) outputs[batchnorm::kBeta].FlatTo1D(s) = 0.0f; } - DType a = 1.0f; - DType b = 0.0f; - DType b_add = 1.0f; + DType a = 1.0f; + DType b = 0.0f; + DType b_add = 1.0f; const bool global_stats = !ctx.is_train || param.use_global_stats; - CUDNN_CALL(cudnnBatchNormalizationBackwardEx( - s->dnn_handle_, CUDNN_BATCHNORM_SPATIAL_PERSISTENT, CUDNN_BATCHNORM_OPS_BN, - &a, req[batchnorm::kData] == kAddTo ? &b_add : &b, - &a, grad_add_gamma_beta ? &b_add : &b, - Globals::Get().io_desc, inputs[3 + batchnorm::kData].dptr_, - nullptr, nullptr, // yDesc, yData - Globals::Get().io_desc, inputs[batchnorm::kOut].dptr_, - nullptr, nullptr, // dzDesc, dzData - Globals::Get().io_desc, outputs[batchnorm::kData].dptr_, - Globals::Get().mean_desc, - inputs[3 + batchnorm::kGamma].dptr_, inputs[3 + batchnorm::kBeta].dptr_, - outputs[batchnorm::kGamma].dptr_, outputs[batchnorm::kBeta].dptr_, param.eps, - global_stats ? nullptr : inputs[batchnorm::kMean].dptr_, - global_stats ? nullptr : inputs[batchnorm::kVar].dptr_, - nullptr, // activationDesc - workspace, workspace_size, - nullptr, 0)); // reserveSpace, reserveSpaceSizeInBytes - if (param.fix_gamma) outputs[batchnorm::kGamma].FlatTo1D(s) = 0.0f; + CUDNN_CALL( + cudnnBatchNormalizationBackwardEx(s->dnn_handle_, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT, + CUDNN_BATCHNORM_OPS_BN, + &a, + req[batchnorm::kData] == kAddTo ? &b_add : &b, + &a, + grad_add_gamma_beta ? 
&b_add : &b, + Globals::Get().io_desc, + inputs[3 + batchnorm::kData].dptr_, + nullptr, + nullptr, // yDesc, yData + Globals::Get().io_desc, + inputs[batchnorm::kOut].dptr_, + nullptr, + nullptr, // dzDesc, dzData + Globals::Get().io_desc, + outputs[batchnorm::kData].dptr_, + Globals::Get().mean_desc, + inputs[3 + batchnorm::kGamma].dptr_, + inputs[3 + batchnorm::kBeta].dptr_, + outputs[batchnorm::kGamma].dptr_, + outputs[batchnorm::kBeta].dptr_, + param.eps, + global_stats ? nullptr : inputs[batchnorm::kMean].dptr_, + global_stats ? nullptr : inputs[batchnorm::kVar].dptr_, + nullptr, // activationDesc + workspace, + workspace_size, + nullptr, + 0)); // reserveSpace, reserveSpaceSizeInBytes + if (param.fix_gamma) + outputs[batchnorm::kGamma].FlatTo1D(s) = 0.0f; }) Globals::Get().internal_aux_states_lock = false; } diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.h b/src/operator/nn/cudnn/cudnn_batch_norm.h index 0f6bebce70b6..4a9905367763 100644 --- a/src/operator/nn/cudnn/cudnn_batch_norm.h +++ b/src/operator/nn/cudnn/cudnn_batch_norm.h @@ -21,7 +21,7 @@ * \file cudnn_batch_norm.h * \brief * \author Junyuan Xie -*/ + */ #ifndef MXNET_OPERATOR_NN_CUDNN_CUDNN_BATCH_NORM_H_ #define MXNET_OPERATOR_NN_CUDNN_CUDNN_BATCH_NORM_H_ @@ -39,12 +39,16 @@ STATIC_ASSERT_CUDNN_VERSION_GE(7401); bool CudnnBatchNormSupports(const BatchNormParam& param, const TBlob& x); -void CudnnBatchNormForward(const BatchNormParam& param, const OpContext& ctx, - const std::vector& inputs, const std::vector& req, +void CudnnBatchNormForward(const BatchNormParam& param, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, const std::vector& outputs); -void CudnnBatchNormBackward(const BatchNormParam& param, const OpContext& ctx, - const std::vector& inputs, const std::vector& req, +void CudnnBatchNormBackward(const BatchNormParam& param, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, const std::vector& outputs); #endif // MXNET_USE_CUDNN == 1 diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h new file mode 100644 index 000000000000..f295f144efe3 --- /dev/null +++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h @@ -0,0 +1,831 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file cudnn_convolution-inl.h + * \brief + * \author Bing Xu + */ +#ifndef MXNET_OPERATOR_NN_CUDNN_CUDNN_CONVOLUTION_INL_H_ +#define MXNET_OPERATOR_NN_CUDNN_CUDNN_CONVOLUTION_INL_H_ + +#include +#include +#include +#include +#include +#include "../convolution-inl.h" +#include "./cudnn_algoreg-inl.h" +#include "../../../common/cuda/utils.h" + +namespace mxnet { +namespace op { +#if MXNET_USE_CUDNN == 1 + +/*! 
+ * \brief The Operator used to perform convolution using cuDNN kernels. + */ +template +class CuDNNConvolutionOp { + STATIC_ASSERT_CUDNN_VERSION_GE(7000); + + public: + CuDNNConvolutionOp() { + CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); + CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); + parallelize_backward_kernels_ = Context::GetGPUStreamsPerWorker() >= 2; + } + + void Init(const ConvolutionParam& param, + int forward_compute_type, + int backward_compute_type, + const mxnet::ShapeVector& in_shape, + const mxnet::ShapeVector& out_shape, + const RunContext& rctx, + bool add_to_weight) { + using namespace mshadow; + this->param_ = param; + this->add_to_weight_ = add_to_weight; + InitBufferForParam(); + auto cudnn_forward_compute_type = convertToCuDNNDataType(forward_compute_type); + auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type); + // convert MB to words + param_.workspace = (param_.workspace << 20) / sizeof(DType); + dtype_ = DataType::kCudnnFlag; + // TensorCore algos only allowed on fp16-I/O convolutions if permitted by the global policy. + cudnn_tensor_core_ = DataType::kFlag == kFloat16 && GetEnvAllowTensorCore(); + + auto effective_layout = param_.layout.value(); + switch (effective_layout) { + // 1D convolutions will be executed as 2D convolutions with a height of 1. + case mshadow::kNCW: + effective_layout = mshadow::kNCHW; + break; + case mshadow::kNWC: + effective_layout = mshadow::kNHWC; + break; + case mshadow::kCWN: + effective_layout = mshadow::kCHWN; + break; + default: + break; + } + + MSHADOW_LAYOUT_SWITCH(effective_layout, Layout, { format_ = LayoutType::kCudnnFlag; }); + // Double check to make sure this class supports the operation + if (!Supports(param, forward_compute_type, backward_compute_type, rctx.ctx.dev_id)) + LOG(FATAL) << "Convolution parameters not supported by cuDNN implementation."; + + InitDescriptors(in_shape, out_shape, cudnn_forward_compute_type, cudnn_backward_compute_type); + + if (!param_.cudnn_tune) { + param_.cudnn_tune = dmlc::GetEnv("MXNET_CUDNN_AUTOTUNE_DEFAULT", 1); + } + // In cuDNN_v6, dilated convolution descriptors are compatible with only a + // single convolution algorithm. Despite this, we go through the algorithm + // selection process, which will return the only algorithm supported. This + // approach keeps the treatment of convolution cases uniform and will + // naturally respond to more algorithms supporting dilated convolutions in + // future cuDNN releases. 
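+    // One more note before selection: the "convert MB to words" step earlier in
+    // Init() means param_.workspace is already in DType elements at this point.
+    // Worked example (illustrative numbers only): a 1024 MB user limit becomes
+    //   (1024 << 20) == 1073741824 bytes, then / sizeof(float) == 268435456 words,
+    // and the selection/size routines below convert back to bytes when talking
+    // to cuDNN.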
+ SelectAlgo(rctx, in_shape, out_shape, cudnn_forward_compute_type, cudnn_backward_compute_type); + GetTempSize(rctx); + } + + ~CuDNNConvolutionOp() { + CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); + CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); + } + + void Forward(const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { + using namespace mshadow; + size_t expected = param_.no_bias ? 2 : 3; + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(out_data.size(), 1U); + Stream* s = ctx.get_stream(); + Tensor workspace = AllocateTempWorkspace(ctx, forward_workspace_byte_); + size_t workspace_size = TensorSizeBytes(workspace); + + // I/O's should have 2 more dims than the kernel dim + DType* data_ptr = GetNdPtr(in_data[conv::kData], param_.kernel.ndim() + 2, s); + DType* wmat_ptr = GetNdPtr(in_data[conv::kWeight], param_.kernel.ndim() + 2, s); + DType* out_ptr = GetNdPtr(out_data[conv::kOut], param_.kernel.ndim() + 2, s); + + typename DataType::ScaleType alpha = 1.0f; + typename DataType::ScaleType beta = 0.0f; + typename DataType::ScaleType beta_add = 1.0f; + CUDNN_CALL(cudnnConvolutionForward(s->dnn_handle_, + &alpha, + in_desc_, + data_ptr, + filter_desc_, + wmat_ptr, + forward_conv_desc_, + forward_algo_.AlgoNumber(), + workspace.dptr_, + workspace_size, + req[conv::kOut] == kAddTo ? &beta_add : &beta, + out_desc_, + out_ptr)); + + if (!param_.no_bias) { + Tensor bias = in_data[conv::kBias].get(s); + CUDNN_CALL(cudnnAddTensor( + s->dnn_handle_, &alpha, bias_desc_, bias.dptr_, &beta_add, out_desc_, out_ptr)); + } + } + + void Backward(const OpContext& ctx, + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& req, + const std::vector& in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + size_t expected = param_.no_bias == 0 ? 3 : 2; + CHECK_EQ(out_grad.size(), 1U); + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(in_grad.size(), expected); + Stream* s = ctx.get_stream(); + // RAII object to handle syncing of the underlying auxiliary stream with the primary stream + SyncedGPUAuxStream s_dgrad = ctx.get_gpu_aux_stream(); + + // I/O's should have 2 more dims than the kernel dim + DType* grad_ptr = GetNdPtr(out_grad[conv::kOut], param_.kernel.ndim() + 2, s); + DType* wmat_ptr = GetNdPtr(in_data[conv::kWeight], param_.kernel.ndim() + 2, s); + DType* gwmat_ptr = GetNdPtr(in_grad[conv::kWeight], param_.kernel.ndim() + 2, s); + DType* data_ptr = GetNdPtr(in_data[conv::kData], param_.kernel.ndim() + 2, s); + DType* gdata_ptr = GetNdPtr(in_grad[conv::kData], param_.kernel.ndim() + 2, s); + + size_t backward_workspace_byte = + parallelize_backward_kernels_ ? 
back_workspace_byte_dgrad_ + back_workspace_byte_wgrad_ :
+            std::max(back_workspace_byte_dgrad_, back_workspace_byte_wgrad_);
+    Tensor<gpu, 1, DType> workspace = AllocateTempWorkspace(ctx, backward_workspace_byte);
+    size_t workspace_size           = TensorSizeBytes(workspace);
+    DType* workspace_dptr_wgrad     = workspace.dptr_;
+    DType* workspace_dptr_dgrad     = workspace.dptr_;
+    if (parallelize_backward_kernels_) {
+      CHECK_LE(back_workspace_byte_dgrad_ + back_workspace_byte_wgrad_, workspace_size);
+      // Large allocations at some point will be given their own page.  Pass this alignment on to
+      // the larger of the two separate dgrad/wgrad workspaces.  This probably doesn't matter, but
+      // corresponds more closely to the workspace alignments used during cudnnFind.
+      if (back_workspace_byte_dgrad_ > back_workspace_byte_wgrad_)
+        workspace_dptr_wgrad = workspace.dptr_ + back_workspace_byte_dgrad_ / sizeof(DType);
+      else
+        workspace_dptr_dgrad = workspace.dptr_ + back_workspace_byte_wgrad_ / sizeof(DType);
+    } else {
+      CHECK_LE(back_workspace_byte_dgrad_, workspace_size);
+      CHECK_LE(back_workspace_byte_wgrad_, workspace_size);
+    }
+    typename DataType<DType>::ScaleType alpha    = 1.0f;
+    typename DataType<DType>::ScaleType beta     = 0.0f;
+    typename DataType<DType>::ScaleType beta_add = 1.0f;
+    if (req[conv::kWeight] != kNullOp) {
+      CHECK_EQ(add_to_weight_, req[conv::kWeight] == kAddTo);
+      CUDNN_CALL(cudnnConvolutionBackwardFilter(s->dnn_handle_,
+                                                &alpha,
+                                                in_desc_,
+                                                data_ptr,
+                                                out_desc_,
+                                                grad_ptr,
+                                                back_conv_desc_w_,
+                                                back_algo_w_.AlgoNumber(),
+                                                workspace_dptr_wgrad,
+                                                back_workspace_byte_wgrad_,
+                                                req[conv::kWeight] == kAddTo ? &beta_add : &beta,
+                                                filter_desc_,
+                                                gwmat_ptr));
+    }
+    if (!param_.no_bias && (req[conv::kBias] != kNullOp)) {
+      Tensor<gpu, 1, DType> gbias = in_grad[conv::kBias].get<gpu, 1, DType>(s);
+      CUDNN_CALL(cudnnConvolutionBackwardBias(s->dnn_handle_,
+                                              &alpha,
+                                              out_desc_,
+                                              grad_ptr,
+                                              req[conv::kBias] == kAddTo ? &beta_add : &beta,
+                                              bias_desc_,
+                                              gbias.dptr_));
+    }
+    if (req[conv::kData] != kNullOp) {
+      CUDNN_CALL(cudnnConvolutionBackwardData(s_dgrad.GetStream()->dnn_handle_,
+                                              &alpha,
+                                              filter_desc_,
+                                              wmat_ptr,
+                                              out_desc_,
+                                              grad_ptr,
+                                              back_conv_desc_,
+                                              back_algo_.AlgoNumber(),
+                                              workspace_dptr_dgrad,
+                                              back_workspace_byte_dgrad_,
+                                              req[conv::kData] == kAddTo ? &beta_add : &beta,
+                                              in_desc_,
+                                              gdata_ptr));
+    }
+  }
+
+  /*!
+   * \brief Returns whether the cuDNN library version supports the convolution
+   * operation described by `param`: cuDNN v5 and earlier does not support
+   * dilated convolutions; dilation is only enabled after v6.0.20.
+   */
+  static bool Supports(ConvolutionParam param,
+                       int forward_compute_type,
+                       int backward_compute_type,
+                       int dev_id) {
+    using namespace mshadow;
+
+    // NDHWC not supported, NHWC not supported in true fp16
+    auto layout_val = param.layout.value();
+    auto true_fp16  = DataType<DType>::kFlag == kFloat16 &&
+                     (forward_compute_type == kFloat16 || backward_compute_type == kFloat16);
+    if (layout_val == kNDHWC || layout_val == kNWC || (layout_val == kNHWC && true_fp16))
+      return false;
+
+    // Permits graceful fallback to pseudo-fp16 on heterogeneous systems
+    if (!SupportsFloat16Compute(dev_id) &&
+        (forward_compute_type == kFloat16 || backward_compute_type == kFloat16)) {
+      return false;
+    }
+
+    return true;
+  }
+
+ private:
+  /*!
+   * \brief Translate an mxnet datatype to the corresponding cudnnDataType_t.
+   */
+  cudnnDataType_t convertToCuDNNDataType(int dtype) {
+    cudnnDataType_t converted = CUDNN_DATA_FLOAT;
+    // The following will always assign to `converted` or throw an exception.
+ MSHADOW_REAL_TYPE_SWITCH( + dtype, mxDType, { converted = mshadow::DataType::kCudnnFlag; }) + return converted; + } + + void InitDescriptors(const mxnet::ShapeVector& in_shape, + const mxnet::ShapeVector& out_shape, + cudnnDataType_t cudnn_forward_compute_type, + cudnnDataType_t cudnn_backward_compute_type) { + using namespace mshadow; + size_t expected = param_.no_bias ? 2 : 3; + CHECK_EQ(in_shape.size(), expected); + CHECK_EQ(out_shape.size(), 1U); + + mxnet::TShape dshape = in_shape[conv::kData]; + mxnet::TShape wshape = in_shape[conv::kWeight]; + mxnet::TShape oshape = out_shape[conv::kOut]; + mxnet::TShape dstride, ostride; + + if (param_.kernel.ndim() == 1 || param_.kernel.ndim() == 2) { + // 1d or 2d conv + auto pad = param_.kernel.ndim() == 2 ? param_.pad : mxnet::TShape({0, param_.pad[0]}); + auto stride = + param_.kernel.ndim() == 2 ? param_.stride : mxnet::TShape({1, param_.stride[0]}); + auto dilate = + param_.kernel.ndim() == 2 ? param_.dilate : mxnet::TShape({1, param_.dilate[0]}); + CUDNN_CALL(cudnnSetConvolution2dDescriptor(forward_conv_desc_, + pad[0], + pad[1], + stride[0], + stride[1], + dilate[0], + dilate[1], + CUDNN_CROSS_CORRELATION, + cudnn_forward_compute_type)); + CUDNN_CALL(cudnnSetConvolution2dDescriptor(back_conv_desc_, + pad[0], + pad[1], + stride[0], + stride[1], + dilate[0], + dilate[1], + CUDNN_CROSS_CORRELATION, + cudnn_backward_compute_type)); + CUDNN_CALL(cudnnSetConvolution2dDescriptor(back_conv_desc_w_, + pad[0], + pad[1], + stride[0], + stride[1], + dilate[0], + dilate[1], + CUDNN_CROSS_CORRELATION, + cudnn_backward_compute_type)); + if (param_.kernel.ndim() == 2) { + wshape = ConvertLayout(wshape.get<4>(), param_.layout.value(), kNCHW); + dstride = ConvertLayout(Strides<4>(dshape), param_.layout.value(), kNCHW); + dshape = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW); + ostride = ConvertLayout(Strides<4>(oshape), param_.layout.value(), kNCHW); + oshape = ConvertLayout(oshape.get<4>(), param_.layout.value(), kNCHW); + } else { + wshape = ConvertLayout(wshape.get<3>(), param_.layout.value(), kNCW); + wshape = mxnet::TShape({wshape[0], wshape[1], 1, wshape[2]}); + dstride = ConvertLayout(Strides<3>(dshape), param_.layout.value(), kNCW); + dstride = mxnet::TShape({dstride[0], dstride[1], dstride[1], dstride[2]}); + dshape = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW); + dshape = mxnet::TShape({dshape[0], dshape[1], 1, dshape[2]}); + ostride = ConvertLayout(Strides<3>(oshape), param_.layout.value(), kNCW); + ostride = mxnet::TShape({ostride[0], ostride[1], ostride[1], ostride[2]}); + oshape = ConvertLayout(oshape.get<3>(), param_.layout.value(), kNCW); + oshape = mxnet::TShape({oshape[0], oshape[1], 1, oshape[2]}); + } + CUDNN_CALL(cudnnSetFilter4dDescriptor( + filter_desc_, dtype_, format_, wshape[0], wshape[1], wshape[2], wshape[3])); +#if CUDNN_VERSION >= 7301 && CUDNN_VERSION < 7500 + auto kernel_h = wshape[2]; + auto kernel_w = wshape[3]; + auto stride_h = stride[0]; + auto stride_w = stride[1]; + auto pad_h = pad[0]; + auto pad_w = pad[1]; + if (param_.layout.value() == kNCHW && + (((stride_h == 2) && (kernel_h % 2 == 0) && (pad_h % 2 == 0)) || + ((stride_w == 2) && (kernel_w % 2 == 0) && (pad_w % 2 == 0)))) { + exclude_dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING; + } +#endif + } else if (param_.kernel.ndim() == 3) { + // 3d conv + CHECK_EQ(param_.layout.value(), kNCDHW) << "CuDNN only support 3D conv with NCDHW layout"; + std::vector wshape_buffer(wshape.ndim()); + 
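+      // CastTShapeToIntPtr (defined below in this file) fills wshape_buffer with
+      // a narrowed int copy of the dim_t shape, since the Nd descriptor setters
+      // take plain int arrays -- roughly (sketch only):
+      //   for (int i = 0; i < wshape.ndim(); ++i)
+      //     wshape_buffer[i] = static_cast<int>(wshape[i]);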
CUDNN_CALL(cudnnSetFilterNdDescriptor(filter_desc_, + dtype_, + CUDNN_TENSOR_NCHW, + static_cast(wshape.ndim()), + CastTShapeToIntPtr(wshape, &wshape_buffer))); + CUDNN_CALL(cudnnSetConvolutionNdDescriptor(forward_conv_desc_, + 3, + param_pad_.data(), + param_stride_.data(), + param_dilate_.data(), + CUDNN_CROSS_CORRELATION, + cudnn_forward_compute_type)); + + CUDNN_CALL(cudnnSetConvolutionNdDescriptor(back_conv_desc_, + 3, + param_pad_.data(), + param_stride_.data(), + param_dilate_.data(), + CUDNN_CROSS_CORRELATION, + cudnn_backward_compute_type)); + + CUDNN_CALL(cudnnSetConvolutionNdDescriptor(back_conv_desc_w_, + 3, + param_pad_.data(), + param_stride_.data(), + param_dilate_.data(), + CUDNN_CROSS_CORRELATION, + cudnn_backward_compute_type)); + + dstride = ConvertLayout(Strides<5>(dshape), param_.layout.value(), kNCDHW); + dshape = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW); + ostride = ConvertLayout(Strides<5>(oshape), param_.layout.value(), kNCDHW); + oshape = ConvertLayout(oshape.get<5>(), param_.layout.value(), kNCDHW); + } + // Set "allow tensor core" flag in convolution descriptors, if available. + cudnnMathType_t math_type = cudnn_tensor_core_ ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH; +#if CUDNN_VERSION >= 7200 + if (GetEnvAllowTensorCore() && GetEnvAllowTensorCoreConversion() && + (DataType::kFlag != kFloat16)) + math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION; +#endif + CUDNN_CALL(cudnnSetConvolutionMathType(forward_conv_desc_, math_type)); + CUDNN_CALL(cudnnSetConvolutionMathType(back_conv_desc_, math_type)); + CUDNN_CALL(cudnnSetConvolutionMathType(back_conv_desc_w_, math_type)); + CUDNN_CALL(cudnnSetConvolutionGroupCount(forward_conv_desc_, param_.num_group)); + CUDNN_CALL(cudnnSetConvolutionGroupCount(back_conv_desc_, param_.num_group)); + CUDNN_CALL(cudnnSetConvolutionGroupCount(back_conv_desc_w_, param_.num_group)); + + std::vector dshape_buffer(dshape.ndim()); + nnvm::ShapeTypeCast(dshape.begin(), dshape.end(), dshape_buffer.data()); + std::vector dstride_buffer(dstride.ndim()); + nnvm::ShapeTypeCast(dstride.begin(), dstride.end(), dstride_buffer.data()); + + CUDNN_CALL(cudnnSetTensorNdDescriptor(in_desc_, + dtype_, + static_cast(dshape.ndim()), + dshape_buffer.data(), + dstride_buffer.data())); + + std::vector oshape_buffer(oshape.ndim()); + nnvm::ShapeTypeCast(oshape.begin(), oshape.end(), oshape_buffer.data()); + std::vector ostride_buffer(ostride.ndim()); + nnvm::ShapeTypeCast(ostride.begin(), ostride.end(), ostride_buffer.data()); + CUDNN_CALL(cudnnSetTensorNdDescriptor(out_desc_, + dtype_, + static_cast(oshape.ndim()), + oshape_buffer.data(), + ostride_buffer.data())); + + if (!param_.no_bias) { + mxnet::TShape bias = in_shape[conv::kBias]; + int bias_dim = static_cast(bias[0]); + std::vector bias_shape = {1, bias_dim, 1, 1}; + std::vector bias_stride = {bias_dim, 1, bias_dim, bias_dim}; + if (param_.kernel.ndim() == 3) { + bias_shape.push_back(1); + bias_stride.push_back(bias_dim); + } + CUDNN_CALL(cudnnSetTensorNdDescriptor(bias_desc_, + dtype_, + static_cast(bias_shape.size()), + &bias_shape[0], + &bias_stride[0])); + } + } + + void CuDNNAlgoSetter(const RunContext& rctx, + const mxnet::ShapeVector& in_shape, + const mxnet::ShapeVector& out_shape, + cudnnDataType_t cudnn_forward_compute_type, + cudnnDataType_t cudnn_backward_compute_type, + CuDNNAlgo* fwd, + CuDNNAlgo* bwd, + CuDNNAlgo* flt) { + // Not in algo registry, must determine via *Get*() or *Find*() + mshadow::Stream* s = rctx.get_stream(); + CHECK_EQ(s->dnn_handle_ownership_, 
mshadow::Stream::OwnHandle); + size_t workspace_byte = static_cast(param_.workspace * sizeof(DType)); + + // Since the function signature of *Get*_v7() matches that of *Find*(), + // we can unify the find-vs-get logic by using function pointers. + + // Forward Algorithm Find/Get() v7 + std::vector fwd_results(MaxForwardAlgos(s->dnn_handle_)); + int actual_fwd_algos = 0; + auto fwd_algo_discoverer = param_.cudnn_tune.value() == conv::kOff ? + cudnnGetConvolutionForwardAlgorithm_v7 : + cudnnFindConvolutionForwardAlgorithm; + CUDNN_CALL((*fwd_algo_discoverer)(s->dnn_handle_, + in_desc_, + filter_desc_, + forward_conv_desc_, + out_desc_, + fwd_results.size(), + &actual_fwd_algos, + fwd_results.data())); + fwd_results.resize(actual_fwd_algos); + AlgoFinalSelect( + fwd_results, "forward", workspace_byte, fwd); + + // Backprop-to-Filter Algorithm Find/Get() v7 + auto max_bwd_filt_algos = MaxBackwardFilterAlgos(s->dnn_handle_); + std::vector bwd_filt_results(max_bwd_filt_algos); + int actual_bwd_filter_algos = 0; + // In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we + // were summing into the output (i.e. beta != 0). Get() returned OK algos though. + auto bwd_filter_algo_discoverer = param_.cudnn_tune.value() == conv::kOff ? + cudnnGetConvolutionBackwardFilterAlgorithm_v7 : + cudnnFindConvolutionBackwardFilterAlgorithm; + CUDNN_CALL((*bwd_filter_algo_discoverer)(s->dnn_handle_, + in_desc_, + out_desc_, + back_conv_desc_w_, + filter_desc_, + bwd_filt_results.size(), + &actual_bwd_filter_algos, + bwd_filt_results.data())); + bwd_filt_results.resize(actual_bwd_filter_algos); + AlgoFinalSelect( + bwd_filt_results, "backprop-to-filter", workspace_byte, flt); + + // Backprop-to-Data Algorithm Find/Get() v7 + auto max_bwd_data_algos = MaxBackwardDataAlgos(s->dnn_handle_); + std::vector bwd_data_results(max_bwd_data_algos); + int actual_bwd_data_algos = 0; + auto bwd_data_algo_discoverer = param_.cudnn_tune.value() == conv::kOff ? + cudnnGetConvolutionBackwardDataAlgorithm_v7 : + cudnnFindConvolutionBackwardDataAlgorithm; + CUDNN_CALL((*bwd_data_algo_discoverer)(s->dnn_handle_, + filter_desc_, + out_desc_, + back_conv_desc_, + in_desc_, + bwd_data_results.size(), + &actual_bwd_data_algos, + bwd_data_results.data())); + bwd_data_results.resize(actual_bwd_data_algos); + AlgoFinalSelect( + bwd_data_results, "backprop-to-data", workspace_byte, bwd, exclude_dgrad_algo_); + + // Fix for issue #11241 + int cudnn_find_issue_max_features = 64 * 1024; + if (add_to_weight_ && Features(in_shape[conv::kData]) >= cudnn_find_issue_max_features) { + flt->Set(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true); + } + } + + void SelectAlgo(const RunContext& rctx, + const mxnet::ShapeVector& in_shape, + const mxnet::ShapeVector& out_shape, + cudnnDataType_t cudnn_forward_compute_type, + cudnnDataType_t cudnn_backward_compute_type) { + auto algo_setter = [&](CuDNNAlgo* fwd, + CuDNNAlgo* bwd, + CuDNNAlgo* flt) { + if (param_.cudnn_tune.value() == conv::kOff) { + // The routine will only be calling cudnnGet, so no need to grab the Storage lock. + this->CuDNNAlgoSetter(rctx, + in_shape, + out_shape, + cudnn_forward_compute_type, + cudnn_backward_compute_type, + fwd, + bwd, + flt); + } else { + // One potential problem is that cudnnFind() uses cudaMalloc() to directly allocate + // I/O and workspace areas, and these allocations may result in an out-of-memory + // error even though the StorageMangager free pool is not empty. 
Ideally, cudnnFind + // would use MXNet's storage allocator for its I/O and workspace areas, instead of using + // the area carved out by MXNET_GPU_MEM_POOL_RESERVE. + // To get somewhat the same effect as this, we can pre-allocate the areas needed for the + // I/Os (possibly triggering a desirable StorageManager::ReleaseAll()), followed by a + // DirectFree(), which makes these areas available for cudnn's subsequent cudaMalloc(). + + // Allocate for x (or dx), w (or dw) and y (or dy). + ReserveElements({in_shape[conv::kData].Size(), + in_shape[conv::kWeight].Size(), + out_shape[conv::kOut].Size()}); + + // We're about to call cudnnFind so we need to quiet the system by grabbing + // the Storage lock. Concurrent cudaMalloc's can disrupt the accurate timing + // measurements of the algos, and can prevent the cuda driver's proper freeing + // of cudnnFind's internal temporary allocations. Grabbing the lock might also + // impede other threads from launching work on the GPU. + std::lock_guard lock(Storage::Get()->GetMutex(Context::kGPU)); + this->CuDNNAlgoSetter(rctx, + in_shape, + out_shape, + cudnn_forward_compute_type, + cudnn_backward_compute_type, + fwd, + bwd, + flt); + } + }; + + CuDNNConvAlgoReg::Get()->FindOrElseRegister(param_, + in_shape, + out_shape, + dtype_, + cudnn_forward_compute_type, + cudnn_backward_compute_type, + SMArch(rctx.ctx.dev_id), + add_to_weight_, + &forward_algo_, + &back_algo_, + &back_algo_w_, + algo_setter); + + // If we're allowing Tensor Core variants of the algos to be considered in + // *Find*() or *Get*(), but a non-Tensor-Core algo variant is the fastest, + // we must change the descriptor to preclude Tensor Core. Simplest is to + // once again set the mathType in all cases. + CUDNN_CALL(cudnnSetConvolutionMathType(forward_conv_desc_, forward_algo_.MathType())); + CUDNN_CALL(cudnnSetConvolutionMathType(back_conv_desc_, back_algo_.MathType())); + CUDNN_CALL(cudnnSetConvolutionMathType(back_conv_desc_w_, back_algo_w_.MathType())); + } + + // Look over the results from *Find*() or *Get*() and pick the fastest algo given possible + // workspace constraints. + template + void AlgoFinalSelect(const std::vector& perf_results, + std::string kernel_name, + size_t workspace_byte, + CuDNNAlgo* algo, + int32_t algo_exclude = -1) { + // Determine the fastest acceptable algo that matches the algo_preference (-1 = any), + // regardless of mathType. + bool enforce_determinism = dmlc::GetEnv("MXNET_ENFORCE_DETERMINISM", false); + for (decltype(perf_results.size()) i = 0; i != perf_results.size(); ++i) { + const auto& result = perf_results[i]; + bool algo_exclusion = static_cast(result.algo) == algo_exclude; + bool algo_is_tensor_core = false; + algo_is_tensor_core = result.mathType == CUDNN_TENSOR_OP_MATH; + if (result.status == CUDNN_STATUS_SUCCESS && + (!enforce_determinism || result.determinism == cudnnDeterminism_t::CUDNN_DETERMINISTIC) && + (param_.cudnn_tune.value() == conv::kLimited || result.memory <= workspace_byte) && + !algo_exclusion) { + algo->Set(result.algo, algo_is_tensor_core); + return; + } + } + auto mode = param_.cudnn_tune.value() == conv::kOff ? " get " : " find "; + LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " convolution algorithm. 
" + << " with workspace size of " << workspace_byte << " bytes," + << " please consider reducing batch/model size or increasing the workspace size"; + } + + void GetTempSize(const RunContext& rctx) { + mshadow::Stream* s = rctx.get_stream(); + CUDNN_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_, + filter_desc_, + out_desc_, + back_conv_desc_, + in_desc_, + back_algo_.AlgoNumber(), + &back_workspace_byte_dgrad_)); + CUDNN_CALL(cudnnGetConvolutionBackwardFilterWorkspaceSize(s->dnn_handle_, + in_desc_, + out_desc_, + back_conv_desc_w_, + filter_desc_, + back_algo_w_.AlgoNumber(), + &back_workspace_byte_wgrad_)); + // cudaMalloc returns addresses that are aligned for large accesses (e.g. to 512 bytes). + // Since we only make one allocation and divide it into two parts when we parallelize + // the dgrad and wgrad kernels, we round the sizes up to this alignment size so the + // dptrs respect this alignment, even if the separate areas are stacked. + const size_t dptr_alignment = 512; + back_workspace_byte_dgrad_ = RoundToMultiple(back_workspace_byte_dgrad_, dptr_alignment); + back_workspace_byte_wgrad_ = RoundToMultiple(back_workspace_byte_wgrad_, dptr_alignment); + + CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(s->dnn_handle_, + in_desc_, + filter_desc_, + forward_conv_desc_, + out_desc_, + forward_algo_.AlgoNumber(), + &forward_workspace_byte_)); + } + + int* CastTShapeToIntPtr(const mxnet::TShape& s, std::vector* buffer) { + buffer->resize(s.ndim()); + nnvm::ShapeTypeCast(s.begin(), s.end(), buffer->data()); + return buffer->data(); + } + + // Converts a TBlob to a dptr, checking for the expected dim and that it's contiguous. + DType* GetNdPtr(const TBlob& tb, int dim, Stream* s) { + DType* data_ptr = nullptr; + if (dim == 3) { + Tensor data = tb.get(s); + CHECK_EQ(data.CheckContiguous(), true); + data_ptr = data.dptr_; + } else if (dim == 4) { + Tensor data = tb.get(s); + CHECK_EQ(data.CheckContiguous(), true); + data_ptr = data.dptr_; + } else if (dim == 5) { + Tensor data = tb.get(s); + CHECK_EQ(data.CheckContiguous(), true); + data_ptr = data.dptr_; + } else { + LOG(FATAL) << "Unexpected Tensor size " << dim << ", supporting only 3, 4 or 5."; + } + return data_ptr; + } + + // Converts a mxnet::TShape to a Shape<> of strides. + // e.g. {shape[0], shape[1], shape[2]} -> {shape[1]*shape[2], shape[2], 1} + template + inline Shape Strides(const mxnet::TShape& s) { + int ndim = s.ndim(); + mxnet::TShape strides(ndim, -1); + for (int i = 0; i != ndim; ++i) + strides[i] = s.ProdShape(i + 1, ndim); + return strides.get(); + } + + void InitBufferForParam() { + CastTShapeToIntPtr(param_.stride, ¶m_stride_); + CastTShapeToIntPtr(param_.dilate, ¶m_dilate_); + CastTShapeToIntPtr(param_.pad, ¶m_pad_); + } + + // Round a value 'x' up to the next multiple of 'multiple' + size_t RoundToMultiple(size_t x, size_t multiple) { + size_t retVal = ((x + multiple - 1) / multiple) * multiple; + return retVal; + } + + // Allocates a 1D Tensor of words with size in bytes >= `size_bytes`. + // Always allocates at least one word. + mshadow::Tensor AllocateTempWorkspace(const OpContext& ctx, size_t size_bytes) { + mshadow::Stream* s = ctx.get_stream(); + size_t size_words = + std::max(1, RoundToMultiple(size_bytes, sizeof(DType)) / sizeof(DType)); + return ctx.requested[conv::kTempSpace].get_space_typed( + mshadow::Shape1(size_words), s); + } + + // Returns the size in bytes of the 1D Tensor of words. 
+  size_t TensorSizeBytes(const mshadow::Tensor<gpu, 1, DType>& tensor) {
+    return tensor.MSize() * sizeof(DType);
+  }
+
+  // Given a tensor shape of this operation, return the number of features 'c'
+  int64_t Features(const mxnet::TShape& dshape) {
+    int c = 0;
+    switch (dshape.ndim()) {
+      case 3:
+        c = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW)[1];
+        break;
+      case 4:
+        c = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW)[1];
+        break;
+      case 5:
+        c = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW)[1];
+        break;
+      default:
+        LOG(FATAL) << "Unexpected convolution data dimension " << dshape.ndim();
+    }
+    return c;
+  }
+
+  // Make a number of allocations and directly free them, ensuring room for an equivalent set of
+  // cudaMalloc() calls by (say) cudnnFind().  `elements` specifies the alloc sizes in DTypes,
+  // not bytes.
+  void ReserveElements(const std::vector<size_t>& elements) {
+    std::vector<Storage::Handle> handles;
+    for (size_t alloc_element : elements) {
+      handles.push_back(Storage::Get()->Alloc(alloc_element * sizeof(DType), Context::GPU()));
+      handles.back().profiler_scope = ":";
+      handles.back().name           = "reserve_elements";
+    }
+    for (auto& handle : handles)
+      Storage::Get()->DirectFree(handle);
+  }
+
+  // Log that no suitable algo was found that met the workspace constraints, then exit.
+  void LogNoSuitableAlgoAndExit(int num_algos_tried,
+                                size_t min_memory_needs,
+                                size_t workspace_byte,
+                                std::string algo_kind) {
+    LOG(FATAL) << num_algos_tried << " " << algo_kind << " with minimum memory requirement "
+               << min_memory_needs << " bytes have been tried. Workspace size is set to "
+               << workspace_byte << " bytes, please consider reducing the batch/model size, "
+               << "or increasing workspace size.";
+  }
+
+  std::vector<int> param_stride_;
+  std::vector<int> param_dilate_;
+  std::vector<int> param_pad_;
+
+  // Temp workspace size in bytes needed for Forward() operation.
+  size_t forward_workspace_byte_;
+  // Temp workspace size in bytes needed for Backward() dgrad (data gradient) operation.
+  size_t back_workspace_byte_dgrad_;
+  // Temp workspace size in bytes needed for Backward() wgrad (weight gradient) operation.
+  size_t back_workspace_byte_wgrad_;
+  cudnnDataType_t dtype_;
+  cudnnTensorDescriptor_t in_desc_;
+  cudnnTensorDescriptor_t out_desc_;
+  cudnnTensorDescriptor_t bias_desc_;
+  cudnnFilterDescriptor_t filter_desc_;
+  // Convolution descriptor for forward inference operation
+  cudnnConvolutionDescriptor_t forward_conv_desc_;
+  // Convolution descriptor for back-prop operations to the data
+  cudnnConvolutionDescriptor_t back_conv_desc_;
+  // Convolution descriptor for back-prop operations to the weights
+  cudnnConvolutionDescriptor_t back_conv_desc_w_;
+  // Should dgrad and wgrad be launched into separate streams
+  bool parallelize_backward_kernels_;
+  // Algorithm for the forward inference operation
+  CuDNNAlgo<cudnnConvolutionFwdAlgo_t> forward_algo_;
+  // Algorithm for the back-prop operation to the data
+  CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> back_algo_;
+  // Algorithm for the back-prop operation to the weights
+  CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> back_algo_w_;
+  cudnnTensorFormat_t format_;
+  // Allow TensorCore algo policy
+  bool cudnn_tensor_core_;
+  // Is req[kWeight] == conv::kAddTo ?
+  bool add_to_weight_;
+  // Is there a dgrad algo that should be avoided (-1 == none)?
+ int32_t exclude_dgrad_algo_ = -1; + ConvolutionParam param_; +}; +#endif // __CUDACC__ && CUDNN +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NN_CUDNN_CUDNN_CONVOLUTION_INL_H_ diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h new file mode 100644 index 000000000000..b6dddf318d26 --- /dev/null +++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h @@ -0,0 +1,852 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file cudnn_deconvolution-inl.h + * \brief + * \author Wei Wu, Leonard Lausen + */ +#ifndef MXNET_OPERATOR_NN_CUDNN_CUDNN_DECONVOLUTION_INL_H_ +#define MXNET_OPERATOR_NN_CUDNN_CUDNN_DECONVOLUTION_INL_H_ + +#include +#include +#include +#include +#include +#include "../deconvolution-inl.h" +#include "./cudnn_algoreg-inl.h" +#include "../../../common/cuda/utils.h" + +namespace mxnet { +namespace op { +#if MXNET_USE_CUDNN == 1 + +template +class CuDNNDeconvolutionOp { + STATIC_ASSERT_CUDNN_VERSION_GE(7000); + + public: + CuDNNDeconvolutionOp() { + CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); + CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); + } + + void Init(DeconvolutionParam param, + int forward_compute_type, + int backward_compute_type, + const mxnet::ShapeVector& in_shape, + const mxnet::ShapeVector& out_shape, + const RunContext& rctx, + bool add_to_weight) { + using namespace mshadow; + this->param_ = param; + this->add_to_weight_ = add_to_weight; + InitBufferForParam(); + auto cudnn_forward_compute_type = convertToCuDNNDataType(forward_compute_type); + auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type); + // convert MB to words + param_.workspace = (param_.workspace << 20) / sizeof(DType); + dtype_ = mshadow::DataType::kCudnnFlag; + // TensorCore algos only allowed on fp16-I/O deconvolutions if permitted by the global policy. + cudnn_tensor_core_ = DataType::kFlag == kFloat16 && GetEnvAllowTensorCore(); + + auto effective_layout = param_.layout.value(); + switch (effective_layout) { + // 1D convolutions will be executed as 2D convolutions with a height of 1. 
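+      // e.g. an NCW input of logical shape (N, C, W) is handled as NCHW with
+      // shape (N, C, 1, W); the mapping applied below is
+      //   kNCW -> kNCHW,   kNWC -> kNHWC,   kCWN -> kCHWN.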
+ case mshadow::kNCW: + effective_layout = mshadow::kNCHW; + break; + case mshadow::kNWC: + effective_layout = mshadow::kNHWC; + break; + case mshadow::kCWN: + effective_layout = mshadow::kCHWN; + break; + default: + break; + } + + MSHADOW_LAYOUT_SWITCH(effective_layout, Layout, { format_ = LayoutType::kCudnnFlag; }); + // Double check to make sure this class supports the operation + if (!Supports(param, forward_compute_type, backward_compute_type, rctx.ctx.dev_id)) + LOG(FATAL) << "Deconvolution parameters not supported by cuDNN implementation."; + + InitDescriptors(in_shape, out_shape, cudnn_forward_compute_type, cudnn_backward_compute_type); + + if (!param_.cudnn_tune) { + param_.cudnn_tune = dmlc::GetEnv("MXNET_CUDNN_AUTOTUNE_DEFAULT", 1); + } + // In cuDNN_v6, dilated convolution descriptors are compatible with only a + // single convolution algorithm. Despite this, we go through the algorithm + // selection process, which will return the only algorithm supported. This + // approach keeps the treatment of convolution cases uniform and will + // naturally respond to more algorithms supporting dilated convolutions in + // future cuDNN releases. + SelectAlgo(rctx, in_shape, out_shape, cudnn_forward_compute_type, cudnn_backward_compute_type); + } + + ~CuDNNDeconvolutionOp() { + CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); + CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); + } + + void Forward(const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { + using namespace mshadow; + size_t expected = param_.no_bias ? 2 : 3; + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(out_data.size(), 1U); + Stream* s = ctx.get_stream(); + GetTempSize(ctx); + Tensor workspace = AllocateTempWorkspace(ctx, forward_workspace_byte_); + size_t workspace_size = TensorSizeBytes(workspace); + + // I/O's should have 2 more dims than the kernel dim + DType* data_ptr = GetNdPtr(in_data[deconv::kData], param_.kernel.ndim() + 2, s); + DType* wmat_ptr = GetNdPtr(in_data[deconv::kWeight], param_.kernel.ndim() + 2, s); + DType* out_ptr = GetNdPtr(out_data[deconv::kOut], param_.kernel.ndim() + 2, s); + + for (uint32_t g = 0; g < param_.num_group; ++g) { + typename DataType::ScaleType alpha = 1.0f; + typename DataType::ScaleType beta = 0.0f; + CUDNN_CALL(cudnnConvolutionBackwardData( + s->dnn_handle_, + &alpha, + filter_desc_, + wmat_ptr + weight_offset_ * g, + in_desc_, + data_ptr + data_offset_ * g, + forward_conv_desc_, // this backward algorithm used for inference + back_algo_.AlgoNumber(), + workspace.dptr_, + workspace_size, + &beta, + out_desc_, + out_ptr + out_offset_ * g)); + if (!param_.no_bias) { + beta = 1.0f; + Tensor bias = in_data[deconv::kBias].get(s); + CUDNN_CALL(cudnnAddTensor(s->dnn_handle_, + &alpha, + bias_desc_, + bias.dptr_ + bias_offset_ * g, + &beta, + out_desc_, + out_ptr + out_offset_ * g)); + } + } + } + + void Backward(const OpContext& ctx, + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& req, + const std::vector& in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + size_t expected = param_.no_bias == 0 ? 
3 : 2;
+    CHECK_EQ(out_grad.size(), 1U);
+    CHECK_EQ(in_data.size(), param_.no_bias ? 2U : 3U);
+    CHECK_EQ(in_grad.size(), expected);
+    Stream<gpu>* s = ctx.get_stream<gpu>();
+
+    // I/O's should have 2 more dims than the kernel dim
+    DType* grad_ptr  = GetNdPtr(out_grad[deconv::kOut], param_.kernel.ndim() + 2, s);
+    DType* wmat_ptr  = GetNdPtr(in_data[deconv::kWeight], param_.kernel.ndim() + 2, s);
+    DType* gwmat_ptr = GetNdPtr(in_grad[deconv::kWeight], param_.kernel.ndim() + 2, s);
+    DType* data_ptr  = GetNdPtr(in_data[deconv::kData], param_.kernel.ndim() + 2, s);
+    DType* gdata_ptr = GetNdPtr(in_grad[deconv::kData], param_.kernel.ndim() + 2, s);
+
+    CHECK_NE(req[deconv::kWeight], kWriteInplace);
+    if (!param_.no_bias) {
+      CHECK_NE(req[deconv::kBias], kWriteInplace);
+    }
+    CHECK_NE(req[deconv::kData], kWriteInplace);
+    GetTempSize(ctx);
+    Tensor<gpu, 1, DType> workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_);
+    size_t workspace_size           = TensorSizeBytes(workspace);
+    for (uint32_t g = 0; g < param_.num_group; ++g) {
+      typename DataType<DType>::ScaleType alpha     = 1.0f;
+      typename DataType<DType>::ScaleType bias_beta = 0.0f;
+      if (!param_.no_bias && req[deconv::kBias] == kAddTo) {
+        bias_beta = 1.0f;
+      }
+      typename DataType<DType>::ScaleType data_beta = req[deconv::kData] == kAddTo ? 1.0f : 0.0f;
+      typename DataType<DType>::ScaleType weight_beta =
+          req[deconv::kWeight] == kAddTo ? 1.0f : 0.0f;
+      if (req[deconv::kWeight] != kNullOp) {
+        CHECK_EQ(add_to_weight_, req[deconv::kWeight] == kAddTo);
+        CUDNN_CALL(cudnnConvolutionBackwardFilter(s->dnn_handle_,
+                                                  &alpha,
+                                                  out_desc_,
+                                                  grad_ptr + out_offset_ * g,
+                                                  in_desc_,
+                                                  data_ptr + data_offset_ * g,
+                                                  back_conv_desc_,
+                                                  back_algo_w_.AlgoNumber(),
+                                                  workspace.dptr_,
+                                                  workspace_size,
+                                                  &weight_beta,
+                                                  filter_desc_,
+                                                  gwmat_ptr + weight_offset_ * g));
+      }
+      if (!param_.no_bias && (req[deconv::kBias] != kNullOp)) {
+        Tensor<gpu, 1, DType> gbias = in_grad[deconv::kBias].get<gpu, 1, DType>(s);
+        CUDNN_CALL(cudnnConvolutionBackwardBias(s->dnn_handle_,
+                                                &alpha,
+                                                out_desc_,
+                                                grad_ptr + out_offset_ * g,
+                                                &bias_beta,
+                                                bias_desc_,
+                                                gbias.dptr_ + bias_offset_ * g));
+      }
+      if (req[deconv::kData] != kNullOp) {
+        CUDNN_CALL(cudnnConvolutionForward(s->dnn_handle_,
+                                           &alpha,
+                                           out_desc_,
+                                           grad_ptr + out_offset_ * g,
+                                           filter_desc_,
+                                           wmat_ptr + weight_offset_ * g,
+                                           back_conv_desc_,
+                                           forward_algo_.AlgoNumber(),
+                                           workspace.dptr_,
+                                           workspace_size,
+                                           &data_beta,
+                                           in_desc_,
+                                           gdata_ptr + data_offset_ * g));
+      }
+    }
+  }
+
+  /*!
+   * \brief Returns whether the cuDNN library version supports the deconvolution
+   * operation described by `param`: cuDNN v5 and earlier does not support
+   * dilated convolutions.
+   */
+  static bool Supports(DeconvolutionParam param,
+                       int forward_compute_type,
+                       int backward_compute_type,
+                       int dev_id) {
+    using namespace mshadow;
+
+    // NDHWC not supported, NHWC not supported in true fp16
+    auto layout_val = param.layout.value();
+    auto true_fp16  = DataType<DType>::kFlag == kFloat16 &&
+                     (forward_compute_type == kFloat16 || backward_compute_type == kFloat16);
+    if (layout_val == kNDHWC || layout_val == kNWC || (layout_val == kNHWC && true_fp16))
+      return false;
+
+    // Permits graceful fallback to pseudo-fp16 on heterogeneous systems
+    if (!SupportsFloat16Compute(dev_id) &&
+        (forward_compute_type == kFloat16 || backward_compute_type == kFloat16)) {
+      return false;
+    }
+
+    return true;
+  }
+
+ private:
+  /*!
+   * \brief Translate an mxnet datatype to the corresponding cudnnDataType_t.
+ */ + cudnnDataType_t convertToCuDNNDataType(int dtype) { + cudnnDataType_t converted = CUDNN_DATA_FLOAT; + // The following will always assign to `converted` or throw an exception. + MSHADOW_REAL_TYPE_SWITCH( + dtype, mxDType, { converted = mshadow::DataType::kCudnnFlag; }) + return converted; + } + + inline void InitDescriptors(const mxnet::ShapeVector& in_shape, + const mxnet::ShapeVector& out_shape, + cudnnDataType_t cudnn_forward_compute_type, + cudnnDataType_t cudnn_backward_compute_type) { + using namespace mshadow; + size_t expected = param_.no_bias ? 2 : 3; + CHECK_EQ(in_shape.size(), expected); + CHECK_EQ(out_shape.size(), 1U); + + mxnet::TShape dshape = in_shape[deconv::kData]; + mxnet::TShape wshape = in_shape[deconv::kWeight]; + mxnet::TShape oshape = out_shape[deconv::kOut]; + mxnet::TShape dstride, ostride; + wshape[0] /= param_.num_group; + if (param_.kernel.ndim() == 1 || param_.kernel.ndim() == 2) { + // 1d or 2d conv + index_t o_pad[2]; + index_t o_adj[2]; + if (param_.kernel.ndim() == 2) { + param_.InferPad(dshape, o_pad, o_adj); + } else { + index_t o_pad_1D[1]; + index_t o_adj_1D[1]; + param_.InferPad(dshape, o_pad_1D, o_adj_1D); + o_pad[0] = 0; + o_pad[1] = o_pad_1D[0]; + } + auto stride = + param_.kernel.ndim() == 2 ? param_.stride : mxnet::TShape({1, param_.stride[0]}); + auto dilate = + param_.kernel.ndim() == 2 ? param_.dilate : mxnet::TShape({1, param_.dilate[0]}); + + CUDNN_CALL(cudnnSetConvolution2dDescriptor(forward_conv_desc_, + o_pad[0], + o_pad[1], + stride[0], + stride[1], + dilate[0], + dilate[1], + CUDNN_CROSS_CORRELATION, + cudnn_forward_compute_type)); + CUDNN_CALL(cudnnSetConvolution2dDescriptor(back_conv_desc_, + o_pad[0], + o_pad[1], + stride[0], + stride[1], + dilate[0], + dilate[1], + CUDNN_CROSS_CORRELATION, + cudnn_backward_compute_type)); + CUDNN_CALL(cudnnSetConvolution2dDescriptor(back_conv_desc_w_, + o_pad[0], + o_pad[1], + stride[0], + stride[1], + dilate[0], + dilate[1], + CUDNN_CROSS_CORRELATION, + cudnn_backward_compute_type)); + if (param_.kernel.ndim() == 2) { + wshape = ConvertLayout(wshape.get<4>(), param_.layout.value(), kNCHW); + dstride = ConvertLayout(Strides<4>(dshape), param_.layout.value(), kNCHW); + dshape = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW); + ostride = ConvertLayout(Strides<4>(oshape), param_.layout.value(), kNCHW); + oshape = ConvertLayout(oshape.get<4>(), param_.layout.value(), kNCHW); + } else { + wshape = ConvertLayout(wshape.get<3>(), param_.layout.value(), kNCW); + wshape = mxnet::TShape({wshape[0], wshape[1], 1, wshape[2]}); + dstride = ConvertLayout(Strides<3>(dshape), param_.layout.value(), kNCW); + dstride = mxnet::TShape({dstride[0], dstride[1], dstride[1], dstride[2]}); + dshape = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW); + dshape = mxnet::TShape({dshape[0], dshape[1], 1, dshape[2]}); + ostride = ConvertLayout(Strides<3>(oshape), param_.layout.value(), kNCW); + ostride = mxnet::TShape({ostride[0], ostride[1], ostride[1], ostride[2]}); + oshape = ConvertLayout(oshape.get<3>(), param_.layout.value(), kNCW); + oshape = mxnet::TShape({oshape[0], oshape[1], 1, oshape[2]}); + } + CUDNN_CALL(cudnnSetFilter4dDescriptor( + filter_desc_, dtype_, format_, wshape[0], wshape[1], wshape[2], wshape[3])); +#if CUDNN_VERSION >= 7301 && CUDNN_VERSION < 7500 + auto kernel_h = wshape[2]; + auto kernel_w = wshape[3]; + auto stride_h = stride[0]; + auto stride_w = stride[1]; + auto pad_h = o_pad[0]; + auto pad_w = o_pad[1]; + if (param_.layout.value() == kNCHW && + (((stride_h == 2) && 
(kernel_h % 2 == 0) && (pad_h % 2 == 0)) || + ((stride_w == 2) && (kernel_w % 2 == 0) && (pad_w % 2 == 0)))) { + exclude_dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING; + } +#endif + } else if (param_.kernel.ndim() == 3) { + // 3d conv + index_t o_pad[3]; + index_t o_adj[3]; + param_.InferPad(dshape, o_pad, o_adj); + + CHECK_EQ(param_.layout.value(), kNCDHW) << "CuDNN only support 3D conv with NCDHW layout"; + std::vector wshape_buffer(wshape.ndim()); + CUDNN_CALL(cudnnSetFilterNdDescriptor(filter_desc_, + dtype_, + CUDNN_TENSOR_NCHW, + static_cast(wshape.ndim()), + CastTShapeToIntPtr(wshape, &wshape_buffer))); + CUDNN_CALL(cudnnSetConvolutionNdDescriptor(forward_conv_desc_, + 3, + reinterpret_cast(&o_pad[0]), + param_stride_.data(), + param_dilate_.data(), + CUDNN_CROSS_CORRELATION, + cudnn_forward_compute_type)); + + CUDNN_CALL(cudnnSetConvolutionNdDescriptor(back_conv_desc_, + 3, + reinterpret_cast(&o_pad[0]), + param_stride_.data(), + param_dilate_.data(), + CUDNN_CROSS_CORRELATION, + cudnn_backward_compute_type)); + + CUDNN_CALL(cudnnSetConvolutionNdDescriptor(back_conv_desc_w_, + 3, + reinterpret_cast(&o_pad[0]), + param_stride_.data(), + param_dilate_.data(), + CUDNN_CROSS_CORRELATION, + cudnn_backward_compute_type)); + + dstride = ConvertLayout(Strides<5>(dshape), param_.layout.value(), kNCDHW); + dshape = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW); + ostride = ConvertLayout(Strides<5>(oshape), param_.layout.value(), kNCDHW); + oshape = ConvertLayout(oshape.get<5>(), param_.layout.value(), kNCDHW); + } + // Set "allow tensor core" flag in convolution descriptors, if available. + cudnnMathType_t math_type = cudnn_tensor_core_ ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH; + CUDNN_CALL(cudnnSetConvolutionMathType(forward_conv_desc_, math_type)); + CUDNN_CALL(cudnnSetConvolutionMathType(back_conv_desc_, math_type)); + CUDNN_CALL(cudnnSetConvolutionMathType(back_conv_desc_w_, math_type)); + dshape[1] /= param_.num_group; + oshape[1] /= param_.num_group; + weight_offset_ = wshape.Size(); + data_offset_ = dstride[1] * dshape[1]; + out_offset_ = ostride[1] * oshape[1]; + + std::vector dshape_buffer(dshape.ndim()); + std::vector dstride_buffer(dstride.ndim()); + CUDNN_CALL(cudnnSetTensorNdDescriptor(in_desc_, + dtype_, + static_cast(dshape.ndim()), + CastTShapeToIntPtr(dshape, &dshape_buffer), + CastTShapeToIntPtr(dstride, &dstride_buffer))) + + std::vector oshape_buffer(oshape.ndim()); + std::vector ostride_buffer(ostride.ndim()); + CUDNN_CALL(cudnnSetTensorNdDescriptor(out_desc_, + dtype_, + static_cast(oshape.ndim()), + CastTShapeToIntPtr(oshape, &oshape_buffer), + CastTShapeToIntPtr(ostride, &ostride_buffer))); + + if (!param_.no_bias) { + mxnet::TShape bias = in_shape[deconv::kBias]; + bias_offset_ = bias[0] / param_.num_group; + int bias_dim = static_cast(bias_offset_); + std::vector bias_shape = {1, bias_dim, 1, 1}; + std::vector bias_stride = {bias_dim, 1, bias_dim, bias_dim}; + if (param_.kernel.ndim() == 3) { + bias_shape.push_back(1); + bias_stride.push_back(bias_dim); + } + CUDNN_CALL(cudnnSetTensorNdDescriptor(bias_desc_, + dtype_, + static_cast(bias_shape.size()), + &bias_shape[0], + &bias_stride[0])); + } + } + + void CuDNNAlgoSetter(const RunContext& rctx, + const mxnet::ShapeVector& in_shape, + const mxnet::ShapeVector& out_shape, + cudnnDataType_t cudnn_forward_compute_type, + cudnnDataType_t cudnn_backward_compute_type, + CuDNNAlgo* fwd, + CuDNNAlgo* bwd, + CuDNNAlgo* flt) { + // Not in algo registry, must determine via *Get*() or *Find*() + 
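+    // The find-vs-get unification used throughout this routine relies on the
+    // matching signatures of the two cuDNN entry points; schematically:
+    //   auto discover = tuning_off ? cudnnGetConvolutionForwardAlgorithm_v7
+    //                              : cudnnFindConvolutionForwardAlgorithm;
+    //   discover(handle, descs..., max_results, &num_returned, results_array);
+    // (Schematic only; the real calls below spell out each descriptor argument.)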
+  void CuDNNAlgoSetter(const RunContext& rctx,
+                       const mxnet::ShapeVector& in_shape,
+                       const mxnet::ShapeVector& out_shape,
+                       cudnnDataType_t cudnn_forward_compute_type,
+                       cudnnDataType_t cudnn_backward_compute_type,
+                       CuDNNAlgo<cudnnConvolutionFwdAlgoPerf_t>* fwd,
+                       CuDNNAlgo<cudnnConvolutionBwdDataAlgoPerf_t>* bwd,
+                       CuDNNAlgo<cudnnConvolutionBwdFilterAlgoPerf_t>* flt) {
+    // Not in algo registry, must determine via *Get*() or *Find*()
+    mshadow::Stream<gpu>* s = rctx.get_stream<gpu>();
+    CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
+    size_t workspace_byte = static_cast<size_t>(param_.workspace * sizeof(DType));
+
+    // Since the function signature of *Get*_v7() matches that of *Find*(),
+    // we can unify the find-vs-get logic by using function pointers.
+
+    // Forward Algorithm Find/Get() v7
+    std::vector<cudnnConvolutionFwdAlgoPerf_t> fwd_results(MaxForwardAlgos(s->dnn_handle_));
+    int actual_fwd_algos = 0;
+    auto fwd_algo_discoverer = param_.cudnn_tune.value() == deconv::kOff ?
+                                   cudnnGetConvolutionForwardAlgorithm_v7 :
+                                   cudnnFindConvolutionForwardAlgorithm;
+    CUDNN_CALL((*fwd_algo_discoverer)(s->dnn_handle_,
+                                      out_desc_,
+                                      filter_desc_,
+                                      back_conv_desc_,  // fwd algo used to backprop-to-data
+                                      in_desc_,
+                                      fwd_results.size(),
+                                      &actual_fwd_algos,
+                                      fwd_results.data()));
+    fwd_results.resize(actual_fwd_algos);
+    AlgoFinalSelect<cudnnConvolutionFwdAlgoPerf_t>(
+        fwd_results, "forward", workspace_byte, fwd);
+
+    // Backprop-to-Filter Algorithm Find/Get() v7
+    auto max_bwd_filt_algos = MaxBackwardFilterAlgos(s->dnn_handle_);
+    std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> bwd_filt_results(max_bwd_filt_algos);
+    int actual_bwd_filter_algos = 0;
+    // In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we
+    // were summing into the output (i.e. beta != 0). Get() returned OK algos though.
+    auto bwd_filter_algo_discoverer = param_.cudnn_tune.value() == deconv::kOff ?
+                                          cudnnGetConvolutionBackwardFilterAlgorithm_v7 :
+                                          cudnnFindConvolutionBackwardFilterAlgorithm;
+    CUDNN_CALL((*bwd_filter_algo_discoverer)(s->dnn_handle_,
+                                             out_desc_,
+                                             in_desc_,
+                                             back_conv_desc_,
+                                             filter_desc_,
+                                             bwd_filt_results.size(),
+                                             &actual_bwd_filter_algos,
+                                             bwd_filt_results.data()));
+    bwd_filt_results.resize(actual_bwd_filter_algos);
+    AlgoFinalSelect<cudnnConvolutionBwdFilterAlgoPerf_t>(
+        bwd_filt_results, "backprop-to-filter", workspace_byte, flt);
+    // Backprop-to-Data Algorithm Find/Get() v7
+    auto max_bwd_data_algos = MaxBackwardDataAlgos(s->dnn_handle_);
+    std::vector<cudnnConvolutionBwdDataAlgoPerf_t> bwd_data_results(max_bwd_data_algos);
+    int actual_bwd_data_algos = 0;
+    auto bwd_data_algo_discoverer = param_.cudnn_tune.value() == deconv::kOff ?
+                                        cudnnGetConvolutionBackwardDataAlgorithm_v7 :
+                                        cudnnFindConvolutionBackwardDataAlgorithm;
+    CUDNN_CALL((*bwd_data_algo_discoverer)(s->dnn_handle_,
+                                           filter_desc_,
+                                           in_desc_,
+                                           forward_conv_desc_,  // bwd algo used in inference
+                                           out_desc_,
+                                           bwd_data_results.size(),
+                                           &actual_bwd_data_algos,
+                                           bwd_data_results.data()));
+    bwd_data_results.resize(actual_bwd_data_algos);
+    AlgoFinalSelect<cudnnConvolutionBwdDataAlgoPerf_t>(
+        bwd_data_results, "backprop-to-data", workspace_byte, bwd, exclude_dgrad_algo_);
+
+    // Fix for issue #11241
+    int cudnn_find_issue_max_features = 64 * 1024;
+    // With deconvolution, the algo sensitivity is to a large number of output features
+    if (add_to_weight_ && Features(out_shape[deconv::kOut]) >= cudnn_find_issue_max_features) {
+      flt->Set(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true);
+    }
+  }
+
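+  // Editorial sketch of the Find-vs-Get unification used above: the v7 Get entry points
+  // and the Find entry points share a signature, so the tuning policy collapses into a
+  // single function pointer (names here are illustrative):
+  //   auto discoverer = tuning_off ? cudnnGetConvolutionForwardAlgorithm_v7
+  //                                : cudnnFindConvolutionForwardAlgorithm;
+  //   CUDNN_CALL((*discoverer)(handle, x_desc, w_desc, conv_desc, y_desc,
+  //                            results.size(), &returned_count, results.data()));
+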
+  void SelectAlgo(const RunContext& rctx,
+                  const mxnet::ShapeVector& in_shape,
+                  const mxnet::ShapeVector& out_shape,
+                  cudnnDataType_t cudnn_forward_compute_type,
+                  cudnnDataType_t cudnn_backward_compute_type) {
+    auto algo_setter = [&](CuDNNAlgo<cudnnConvolutionFwdAlgoPerf_t>* fwd,
+                           CuDNNAlgo<cudnnConvolutionBwdDataAlgoPerf_t>* bwd,
+                           CuDNNAlgo<cudnnConvolutionBwdFilterAlgoPerf_t>* flt) {
+      if (param_.cudnn_tune.value() == deconv::kOff) {
+        // The routine will only be calling cudnnGet, so no need to grab the Storage lock.
+        this->CuDNNAlgoSetter(rctx,
+                              in_shape,
+                              out_shape,
+                              cudnn_forward_compute_type,
+                              cudnn_backward_compute_type,
+                              fwd,
+                              bwd,
+                              flt);
+      } else {
+        // One potential problem is that cudnnFind() uses cudaMalloc() to directly allocate
+        // I/O and workspace areas, and these allocations may result in an out-of-memory
+        // error even though the StorageManager free pool is not empty. Ideally, cudnnFind
+        // would use MXNet's storage allocator for its I/O and workspace areas, instead of using
+        // the area carved out by MXNET_GPU_MEM_POOL_RESERVE.
+        // To get somewhat the same effect as this, we can pre-allocate the areas needed for the
+        // I/Os (possibly triggering a desirable StorageManager::ReleaseAll()), followed by a
+        // DirectFree(), which makes these areas available for cudnn's subsequent cudaMalloc().
+
+        // Allocate for x (or dx), w (or dw) and y (or dy).
+        ReserveElements({in_shape[deconv::kData].Size(),
+                         in_shape[deconv::kWeight].Size(),
+                         out_shape[deconv::kOut].Size()});
+
+        // We're about to call cudnnFind so we need to quiet the system by grabbing
+        // the Storage lock. Concurrent cudaMalloc's can disrupt the accurate timing
+        // measurements of the algos, and can prevent the cuda driver's proper freeing
+        // of cudnnFind's internal temporary allocations. Grabbing the lock might also
+        // impede other threads from launching work on the GPU.
+        std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
+        this->CuDNNAlgoSetter(rctx,
+                              in_shape,
+                              out_shape,
+                              cudnn_forward_compute_type,
+                              cudnn_backward_compute_type,
+                              fwd,
+                              bwd,
+                              flt);
+      }
+    };
+
+    // An algo specification by the user may be cached here, but another
+    // convolution will match only if identically specified.
+    // We're caching results of *Get* as well as *Find*, but these records
+    // will be held distinctly because param_.cudnn_tune is part of the key.
+    CuDNNDeconvAlgoReg::Get()->FindOrElseRegister(param_,
+                                                  in_shape,
+                                                  out_shape,
+                                                  dtype_,
+                                                  cudnn_forward_compute_type,
+                                                  cudnn_backward_compute_type,
+                                                  SMArch(rctx.ctx.dev_id),
+                                                  add_to_weight_,
+                                                  &forward_algo_,
+                                                  &back_algo_,
+                                                  &back_algo_w_,
+                                                  algo_setter);
+
+    // If we're allowing Tensor Core variants of the algos to be considered in
+    // *Find*() or *Get*(), but a non-Tensor-Core algo variant is the fastest,
+    // we must change the descriptor to preclude Tensor Core. Simplest is to
+    // once again set the mathType in all cases.
+
+    // The next two code lines will look like they have typos, but they don't!
+    // The forward_conv_desc_ is used during inference, which invokes the back_algo_.
+    // Thus, the mathType of the back_algo_ should be stored in the forward_conv_desc_.
+    // Conversely, the back_conv_desc_ is used during training backprop, which invokes
+    // the forward_algo_. Thus, the mathType of the forward_algo_ should be stored
+    // in the back_conv_desc_.
+    CUDNN_CALL(cudnnSetConvolutionMathType(forward_conv_desc_, back_algo_.MathType()));
+    CUDNN_CALL(cudnnSetConvolutionMathType(back_conv_desc_, forward_algo_.MathType()));
+    CUDNN_CALL(cudnnSetConvolutionMathType(back_conv_desc_w_, back_algo_w_.MathType()));
+  }
+
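+  // Editorial note: FindOrElseRegister() above keys the cached algo triple on the shapes,
+  // dtype, compute types, SM architecture, add_to_weight_, and the full param_ (which
+  // includes cudnn_tune), so Get-selected and Find-selected records never collide.
+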
+  // Look over the results from *Find*() or *Get*() and pick the fastest algo given possible
+  // workspace constraints and a possible user algo preference.
+  template <typename PerfType, typename AlgoType>
+  void AlgoFinalSelect(const std::vector<PerfType>& perf_results,
+                       std::string kernel_name,
+                       size_t workspace_byte,
+                       CuDNNAlgo<AlgoType>* algo,
+                       int32_t algo_exclude = -1) {
+    // Determine the fastest acceptable algo regardless of mathType.
+    bool enforce_determinism = dmlc::GetEnv("MXNET_ENFORCE_DETERMINISM", false);
+    for (decltype(perf_results.size()) i = 0; i != perf_results.size(); ++i) {
+      const auto& result = perf_results[i];
+      bool algo_exclusion = static_cast<int32_t>(result.algo) == algo_exclude;
+      bool algo_is_tensor_core = false;
+      algo_is_tensor_core = result.mathType == CUDNN_TENSOR_OP_MATH;
+      if (result.status == CUDNN_STATUS_SUCCESS &&
+          (!enforce_determinism || result.determinism == cudnnDeterminism_t::CUDNN_DETERMINISTIC) &&
+          (param_.cudnn_tune.value() != deconv::kLimited || result.memory <= workspace_byte) &&
+          !algo_exclusion) {
+        algo->Set(result.algo, algo_is_tensor_core);
+        return;
+      }
+    }
+    auto mode = param_.cudnn_tune.value() == deconv::kOff ? " get " : " find ";
+    LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " deconvolution algorithm"
+               << " with workspace size of " << workspace_byte << " bytes,"
+               << " please consider reducing batch/model size or increasing the workspace size";
+  }
+
+  void GetTempSize(const OpContext& ctx) {
+    mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
+    size_t back_data_algo_workspace_size = 0;
+    size_t back_filter_algo_workspace_size = 0;
+    size_t forward_algo_workspace_size = 0;
+    CUDNN_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_,
+                                                            filter_desc_,
+                                                            in_desc_,
+                                                            forward_conv_desc_,
+                                                            out_desc_,
+                                                            back_algo_.AlgoNumber(),
+                                                            &back_data_algo_workspace_size));
+    CUDNN_CALL(cudnnGetConvolutionBackwardFilterWorkspaceSize(s->dnn_handle_,
+                                                              out_desc_,
+                                                              in_desc_,
+                                                              back_conv_desc_,
+                                                              filter_desc_,
+                                                              back_algo_w_.AlgoNumber(),
+                                                              &back_filter_algo_workspace_size));
+    CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(s->dnn_handle_,
+                                                       out_desc_,
+                                                       filter_desc_,
+                                                       back_conv_desc_,
+                                                       in_desc_,
+                                                       forward_algo_.AlgoNumber(),
+                                                       &forward_algo_workspace_size));
+
+    forward_workspace_byte_ = back_data_algo_workspace_size;
+    backward_workspace_byte_ =
+        std::max(forward_algo_workspace_size, back_filter_algo_workspace_size);
+  }
+
+  int* CastTShapeToIntPtr(const mxnet::TShape& s, std::vector<int>* buffer) {
+    buffer->resize(s.ndim());
+    nnvm::ShapeTypeCast(s.begin(), s.end(), buffer->data());
+    return buffer->data();
+  }
+
+  // Converts a TBlob to a dptr, checking for the expected dim and that it's contiguous.
+  DType* GetNdPtr(const TBlob& tb, int dim, Stream<gpu>* s) {
+    DType* data_ptr = nullptr;
+    if (dim == 3) {
+      Tensor<gpu, 3, DType> data = tb.get<gpu, 3, DType>(s);
+      CHECK_EQ(data.CheckContiguous(), true);
+      data_ptr = data.dptr_;
+    } else if (dim == 4) {
+      Tensor<gpu, 4, DType> data = tb.get<gpu, 4, DType>(s);
+      CHECK_EQ(data.CheckContiguous(), true);
+      data_ptr = data.dptr_;
+    } else if (dim == 5) {
+      Tensor<gpu, 5, DType> data = tb.get<gpu, 5, DType>(s);
+      CHECK_EQ(data.CheckContiguous(), true);
+      data_ptr = data.dptr_;
+    } else {
+      LOG(FATAL) << "Unexpected Tensor size " << dim << ", supporting only 3, 4 or 5.";
+    }
+    return data_ptr;
+  }
+
+  // Converts a mxnet::TShape to a Shape<> of strides.
+  // e.g. {shape[0], shape[1], shape[2]} -> {shape[1]*shape[2], shape[2], 1}
+  template <int dim>
+  inline Shape<dim> Strides(const mxnet::TShape& s) {
+    int ndim = s.ndim();
+    mxnet::TShape strides(ndim, -1);
+    for (int i = 0; i != ndim; ++i)
+      strides[i] = s.ProdShape(i + 1, ndim);
+    return strides.get<dim>();
+  }
+
+  void InitBufferForParam() {
+    CastTShapeToIntPtr(param_.stride, &param_stride_);
+    CastTShapeToIntPtr(param_.dilate, &param_dilate_);
+  }
+
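+  // Editorial worked example for Strides<dim>() above: an NCHW shape {2, 3, 4, 5}
+  // yields strides {60, 20, 5, 1}, since strides[i] = prod(shape[i+1], ..., shape[ndim-1]).
+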
+  // Allocates a 1D Tensor of words with size in bytes >= `size_bytes`.
+  // Always allocates at least one word.
+  mshadow::Tensor<gpu, 1, DType> AllocateTempWorkspace(const OpContext& ctx, size_t size_bytes) {
+    mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
+    size_t size_words = size_bytes / sizeof(DType) + 1;
+    return ctx.requested[deconv::kTempSpace].get_space_typed<gpu, 1, DType>(
+        mshadow::Shape1(size_words), s);
+  }
+
+  // Returns the size in bytes of the 1D Tensor of words.
+  size_t TensorSizeBytes(const mshadow::Tensor<gpu, 1, DType>& tensor) {
+    return tensor.MSize() * sizeof(DType);
+  }
+
+  // Given a tensor shape of this operation, return the number of features 'c'
+  int64_t Features(const mxnet::TShape& dshape) {
+    int c = 0;
+    switch (dshape.ndim()) {
+      case 3:
+        c = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW)[1];
+        break;
+      case 4:
+        c = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW)[1];
+        break;
+      case 5:
+        c = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW)[1];
+        break;
+      default:
+        LOG(FATAL) << "Unexpected deconvolution data dimension " << dshape.ndim();
+    }
+    return c;
+  }
+
+  // Make a number of allocations and directly free them, ensuring room for an equivalent set of
+  // cudaMalloc() calls by (say) cudnnFind(). `elements` spec the alloc size in DTypes, not bytes.
+  void ReserveElements(const std::vector<size_t>& elements) {
+    std::vector<Storage::Handle> handles;
+    for (size_t alloc_element : elements) {
+      handles.push_back(Storage::Get()->Alloc(alloc_element * sizeof(DType), Context::GPU()));
+      handles.back().profiler_scope = ":";
+      handles.back().name = "reserve_elements";
+    }
+    for (auto& handle : handles)
+      Storage::Get()->DirectFree(handle);
+  }
+
+  // Log that no suitable algo was found that met the workspace constraints, then exit.
+  void LogNoSuitableAlgoAndExit(int num_algos_tried,
+                                size_t min_memory_needs,
+                                size_t workspace_byte,
+                                std::string algo_kind) {
+    LOG(FATAL) << num_algos_tried << " " << algo_kind << " with minimum memory requirement "
+               << min_memory_needs << " bytes have been tried. Workspace size is set to "
+               << workspace_byte << " bytes, please consider reducing the batch/model size, "
+               << "or increasing workspace size.";
+  }
+
+  std::vector<int> param_stride_;
+  std::vector<int> param_dilate_;
+
+  int forward_compute_type_;
+  int backward_compute_type_;
+  const mxnet::ShapeVector in_shapes_;
+  const mxnet::ShapeVector out_shapes_;
+
+  // Temp workspace size in bytes needed for Forward() operation. Note that
+  // in deconvolution, this is handled by the cuDNN backprop-to-data kernel.
+  size_t forward_workspace_byte_;
+  // Temp workspace size in bytes needed for Backward() operation. Note that
+  // in deconvolution, this is handled by the cuDNN forward kernel and
+  // the cuDNN backprop-to-filter kernel.
+  size_t backward_workspace_byte_;
+  size_t data_offset_;
+  size_t out_offset_;
+  size_t weight_offset_;
+  size_t bias_offset_;
+  cudnnDataType_t dtype_;
+  cudnnTensorDescriptor_t in_desc_;
+  cudnnTensorDescriptor_t out_desc_;
+  cudnnTensorDescriptor_t bias_desc_;
+  cudnnFilterDescriptor_t filter_desc_;
+  // Convolution descriptor for "forward" inference operation.
+  // Note that in deconvolution, the forward operation is handled
+  // by the cuDNN backprop-to-data kernel.
+  cudnnConvolutionDescriptor_t forward_conv_desc_;
+  // Convolution descriptor for "back-prop" operations to data.
+  // Note that in deconvolution, the backprop-to-data operation is handled
+  // by the cuDNN forward kernel.
+  cudnnConvolutionDescriptor_t back_conv_desc_;
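+  // Editorial summary of the kernel/descriptor pairing in deconvolution:
+  //   deconv Forward()            -> cuDNN backprop-to-data kernel   (forward_conv_desc_, back_algo_)
+  //   deconv Backward() to data   -> cuDNN forward kernel            (back_conv_desc_, forward_algo_)
+  //   deconv Backward() to filter -> cuDNN backprop-to-filter kernel (back_conv_desc_w_, back_algo_w_)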
+  // Convolution descriptor for "back-prop" operations to filter.
+  // Note that in deconvolution, the backprop-to-filter operation is handled
+  // by the backprop-to-filter kernel (so consistent with the treatment
+  // in convolution).
+  cudnnConvolutionDescriptor_t back_conv_desc_w_;
+  // Algorithm for the cuDNN forward kernel (used in gradient backprop to input)
+  CuDNNAlgo<cudnnConvolutionFwdAlgoPerf_t> forward_algo_;
+  // Algorithm for the cuDNN backprop-to-data kernel (used in inference)
+  CuDNNAlgo<cudnnConvolutionBwdDataAlgoPerf_t> back_algo_;
+  // Algorithm for the cuDNN backprop-to-filter kernel
+  CuDNNAlgo<cudnnConvolutionBwdFilterAlgoPerf_t> back_algo_w_;
+  cudnnTensorFormat_t format_;
+  // Allow TensorCore algo policy
+  bool cudnn_tensor_core_;
+  // Is req[kWeight] == deconv::kAddTo ?
+  bool add_to_weight_;
+  // Is there a dgrad algo that should be avoided (-1 == none)?
+  int32_t exclude_dgrad_algo_ = -1;
+  DeconvolutionParam param_;
+};
+#endif  // CUDNN
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_NN_CUDNN_CUDNN_DECONVOLUTION_INL_H_
diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h
index b807234e1d7b..ad7872025ee9 100644
--- a/src/operator/nn/cudnn/cudnn_pooling-inl.h
+++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h
@@ -49,8 +49,8 @@ class CuDNNPoolingOp {
     param_ = p;
     switch (param_.pool_type) {
       case pool_enum::kMaxPooling:
-        mode_ = dmlc::GetEnv("MXNET_ENFORCE_DETERMINISM", false) ? CUDNN_POOLING_MAX_DETERMINISTIC
-                                                                 : CUDNN_POOLING_MAX;
+        mode_ = dmlc::GetEnv("MXNET_ENFORCE_DETERMINISM", false) ? CUDNN_POOLING_MAX_DETERMINISTIC :
+                CUDNN_POOLING_MAX;
         break;
       case pool_enum::kAvgPooling:
         if (param_.count_include_pad.has_value() && !param_.count_include_pad.value()) {
@@ -210,8 +210,8 @@ class CuDNNPoolingOp {
     // Perform shape calculations in a standard (NCHW) layout space
     mshadow::Shape<4> input_shape = input.shape_.get<4>();
     mshadow::Shape<4> dshape_nchw =
-        (layout == mshadow::kNHWC) ? ConvertLayout(input_shape, mshadow::kNHWC, mshadow::kNCHW)
-                                   : input_shape;
+        (layout == mshadow::kNHWC) ? ConvertLayout(input_shape, mshadow::kNHWC, mshadow::kNCHW) :
+        input_shape;
     int kernel_height = param.global_pool ? dshape_nchw[2] : param.kernel[0];
     int kernel_width = param.global_pool ? dshape_nchw[3] : param.kernel[1];
     if (kernel_height > 8 || kernel_width > 8)
@@ -258,11 +258,11 @@ class CuDNNPoolingOp {
     Tensor<gpu, 4, DType> out = out_data.get<gpu, 4, DType>(s);
     // Perform shape calculations in a standard (NCHW) layout space
     mshadow::Shape<4> dshape_nchw =
-        (layout == mshadow::kNHWC) ? ConvertLayout(data.shape_, mshadow::kNHWC, mshadow::kNCHW)
-                                   : data.shape_;
+        (layout == mshadow::kNHWC) ? ConvertLayout(data.shape_, mshadow::kNHWC, mshadow::kNCHW) :
+        data.shape_;
     mshadow::Shape<4> oshape_nchw =
-        (layout == mshadow::kNHWC) ? ConvertLayout(out.shape_, mshadow::kNHWC, mshadow::kNCHW)
-                                   : out.shape_;
+        (layout == mshadow::kNHWC) ? ConvertLayout(out.shape_, mshadow::kNHWC, mshadow::kNCHW) :
+        out.shape_;
     CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_,
                                           cudnn_layout,
                                           dtype_,
@@ -314,18 +314,18 @@ class CuDNNPoolingOp {
                                           oshape.ProdShape(5, 5));
 
     // Convert to a standard (NCDHW) layout space to create args for cuDNN
-    mshadow::Shape<5> dshape_ncdhw = (layout == mshadow::kNDHWC)
-                                         ? ConvertLayout(dshape, mshadow::kNDHWC, mshadow::kNCDHW)
-                                         : dshape;
+    mshadow::Shape<5> dshape_ncdhw = (layout == mshadow::kNDHWC) ?
+                                         ConvertLayout(dshape, mshadow::kNDHWC, mshadow::kNCDHW) :
+                                         dshape;
     mshadow::Shape<5> dstride_ncdhw =
-        (layout == mshadow::kNDHWC) ? ConvertLayout(dstride, mshadow::kNDHWC, mshadow::kNCDHW)
-                                    : dstride;
-    mshadow::Shape<5> oshape_ncdhw = (layout == mshadow::kNDHWC)
-                                         ?
ConvertLayout(oshape, mshadow::kNDHWC, mshadow::kNCDHW) - : oshape; + (layout == mshadow::kNDHWC) ? ConvertLayout(dstride, mshadow::kNDHWC, mshadow::kNCDHW) : + dstride; + mshadow::Shape<5> oshape_ncdhw = (layout == mshadow::kNDHWC) ? + ConvertLayout(oshape, mshadow::kNDHWC, mshadow::kNCDHW) : + oshape; mshadow::Shape<5> ostride_ncdhw = - (layout == mshadow::kNDHWC) ? ConvertLayout(ostride, mshadow::kNDHWC, mshadow::kNCDHW) - : ostride; + (layout == mshadow::kNDHWC) ? ConvertLayout(ostride, mshadow::kNDHWC, mshadow::kNCDHW) : + ostride; // Create int arrays for passing into cuDNN std::array dshape_ncdhw_int, dstride_ncdhw_int, oshape_ncdhw_int, ostride_ncdhw_int; for (int i = 0; i < 5; ++i) { @@ -335,12 +335,12 @@ class CuDNNPoolingOp { ostride_ncdhw_int[i] = static_cast(ostride_ncdhw[i]); } - std::array kernel_vec = {param_.global_pool ? static_cast(dshape_ncdhw[2]) - : static_cast(param_.kernel[0]), - param_.global_pool ? static_cast(dshape_ncdhw[3]) - : static_cast(param_.kernel[1]), - param_.global_pool ? static_cast(dshape_ncdhw[4]) - : static_cast(param_.kernel[2])}; + std::array kernel_vec = {param_.global_pool ? static_cast(dshape_ncdhw[2]) : + static_cast(param_.kernel[0]), + param_.global_pool ? static_cast(dshape_ncdhw[3]) : + static_cast(param_.kernel[1]), + param_.global_pool ? static_cast(dshape_ncdhw[4]) : + static_cast(param_.kernel[2])}; std::array pad_vec = {param_.global_pool ? 0 : static_cast(param_.pad[0]), param_.global_pool ? 0 : static_cast(param_.pad[1]), diff --git a/src/operator/nn/dnnl/dnnl_base-inl.h b/src/operator/nn/dnnl/dnnl_base-inl.h index 4bf8b372b4ef..3ec2e32750b8 100644 --- a/src/operator/nn/dnnl/dnnl_base-inl.h +++ b/src/operator/nn/dnnl/dnnl_base-inl.h @@ -607,9 +607,9 @@ class DNNLMemory { dnnl::memory::data_type data_type = dnnl::memory::data_type::undef) const { dnnl::memory::dims dims(desc.data.dims, desc.data.dims + desc.data.ndims); dnnl::memory::data_type cpp_type = - (data_type == dnnl::memory::data_type::undef) - ? static_cast(desc.data.data_type) - : data_type; + (data_type == dnnl::memory::data_type::undef) ? + static_cast(desc.data.data_type) : + data_type; dnnl::memory::desc data_md(dims, cpp_type, static_cast(format)); return data_md; } diff --git a/src/operator/nn/dnnl/dnnl_base.cc b/src/operator/nn/dnnl/dnnl_base.cc index d1e8918c3bde..54af44c80fe4 100644 --- a/src/operator/nn/dnnl/dnnl_base.cc +++ b/src/operator/nn/dnnl/dnnl_base.cc @@ -242,31 +242,30 @@ const dnnl::memory* GetWeights(const NDArray& arr, int num_groups) { tz = dnnl::memory::dims{arr.shape()[O], arr.shape()[I]}; format_tag = dnnl::memory::format_tag::oi; } else if (ndim == 3) { - tz = num_groups > 1 ? dnnl::memory::dims{num_groups, - arr.shape()[O] / num_groups, - arr.shape()[I], - arr.shape()[H]} - : dnnl::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H]}; + tz = num_groups > 1 ? + dnnl::memory::dims{ + num_groups, arr.shape()[O] / num_groups, arr.shape()[I], arr.shape()[H]} : + dnnl::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H]}; format_tag = num_groups > 1 ? dnnl::memory::format_tag::goiw : dnnl::memory::format_tag::oiw; } else if (ndim == 4) { - tz = num_groups > 1 - ? dnnl::memory::dims{num_groups, - arr.shape()[O] / num_groups, - arr.shape()[I], - arr.shape()[H], - arr.shape()[W]} - : dnnl::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H], arr.shape()[W]}; + tz = num_groups > 1 ? 
+ dnnl::memory::dims{num_groups, + arr.shape()[O] / num_groups, + arr.shape()[I], + arr.shape()[H], + arr.shape()[W]} : + dnnl::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H], arr.shape()[W]}; format_tag = num_groups > 1 ? dnnl::memory::format_tag::goihw : dnnl::memory::format_tag::oihw; } else if (ndim == 5) { - tz = num_groups > 1 - ? dnnl::memory::dims{num_groups, - arr.shape()[O] / num_groups, - arr.shape()[I], - arr.shape()[D], - arr.shape()[H], - arr.shape()[W]} - : dnnl::memory::dims{ - arr.shape()[O], arr.shape()[I], arr.shape()[D], arr.shape()[H], arr.shape()[W]}; + tz = num_groups > 1 ? + dnnl::memory::dims{num_groups, + arr.shape()[O] / num_groups, + arr.shape()[I], + arr.shape()[D], + arr.shape()[H], + arr.shape()[W]} : + dnnl::memory::dims{ + arr.shape()[O], arr.shape()[I], arr.shape()[D], arr.shape()[H], arr.shape()[W]}; format_tag = num_groups > 1 ? dnnl::memory::format_tag::goidhw : dnnl::memory::format_tag::oidhw; } else { diff --git a/src/operator/nn/dnnl/dnnl_convolution.cc b/src/operator/nn/dnnl/dnnl_convolution.cc index 9754f7fa4505..7910f65d21eb 100644 --- a/src/operator/nn/dnnl/dnnl_convolution.cc +++ b/src/operator/nn/dnnl/dnnl_convolution.cc @@ -53,8 +53,8 @@ std::shared_ptr GetConvFwdImpl( auto weight_md = GetWeightDesc(weights, param.conv_param.num_group, param.dnnl_param.quantized); auto out_md = GetMemDesc(output); auto bias_md = - bias ? (param.dnnl_param.quantized ? GetMemDesc(*bias, mshadow::kInt32) : GetMemDesc(*bias)) - : dnnl::memory::desc{{}, dnnl::memory::data_type::undef, dnnl::memory::format_tag::any}; + bias ? (param.dnnl_param.quantized ? GetMemDesc(*bias, mshadow::kInt32) : GetMemDesc(*bias)) : + dnnl::memory::desc{{}, dnnl::memory::data_type::undef, dnnl::memory::format_tag::any}; auto bias_md_ptr = bias ? &bias_md : nullptr; dnnl::memory::dims strides(param.conv_param.kernel.ndim()); diff --git a/src/operator/nn/dnnl/dnnl_deconvolution-inl.h b/src/operator/nn/dnnl/dnnl_deconvolution-inl.h index 301537967df3..92c1d6bed1f2 100644 --- a/src/operator/nn/dnnl/dnnl_deconvolution-inl.h +++ b/src/operator/nn/dnnl/dnnl_deconvolution-inl.h @@ -289,9 +289,9 @@ inline const dnnl::memory* DNNLDeconvBwd::OutGradMem(const NDArray& out_grad) co inline const dnnl::memory* DNNLDeconvBwd::OutGradMem(const NDArray& out_grad, const dnnl::memory* const out_grad_mem) const { - return (out_grad_mem && out_grad_mem->get_desc() == bwd_weights_pd->diff_dst_desc()) - ? out_grad_mem - : out_grad.GetDNNLDataReorder(bwd_weights_pd->diff_dst_desc()); + return (out_grad_mem && out_grad_mem->get_desc() == bwd_weights_pd->diff_dst_desc()) ? + out_grad_mem : + out_grad.GetDNNLDataReorder(bwd_weights_pd->diff_dst_desc()); } inline dnnl_output_t DNNLDeconvBwd::DataGradMem(const OpReqType req, @@ -315,8 +315,8 @@ inline dnnl_output_t DNNLDeconvBwd::WeightsGradMem(const uint32_t num_group, inline dnnl_output_t DNNLDeconvBwd::BiasGradMem(const OpReqType req, const NDArray* const bias) const { - return bias ? CreateDNNLMem(*bias, bwd_weights_pd->diff_bias_desc(), req) - : dnnl_output_t(OutDataOp::Noop, nullptr); + return bias ? 
CreateDNNLMem(*bias, bwd_weights_pd->diff_bias_desc(), req) : + dnnl_output_t(OutDataOp::Noop, nullptr); } // Utility class for creating operation descriptors of deconvolution primitives diff --git a/src/operator/nn/dnnl/dnnl_fully_connected.cc b/src/operator/nn/dnnl/dnnl_fully_connected.cc index 5bb3c9d79ec0..7879497954ae 100644 --- a/src/operator/nn/dnnl/dnnl_fully_connected.cc +++ b/src/operator/nn/dnnl/dnnl_fully_connected.cc @@ -39,9 +39,9 @@ dnnl::inner_product_forward::primitive_desc GetFCFwdImpl(const DNNLFCFullParam& const dnnl::memory::desc& out_md) { auto engine = CpuEngine::Get()->get_engine(); auto data_md = GetMemDesc(data); - auto weight_md = full_param.dnnl_param.quantized - ? GetFCWeightDesc(weight, data.shape()[0], mshadow::kInt8) - : GetFCWeightDesc(weight, data.shape()[0]); + auto weight_md = full_param.dnnl_param.quantized ? + GetFCWeightDesc(weight, data.shape()[0], mshadow::kInt8) : + GetFCWeightDesc(weight, data.shape()[0]); auto propagation = is_train ? dnnl::prop_kind::forward_training : dnnl::prop_kind::forward_scoring; diff --git a/src/operator/nn/dnnl/dnnl_rnn.cc b/src/operator/nn/dnnl/dnnl_rnn.cc index 844bad99c845..5ebad89089c3 100644 --- a/src/operator/nn/dnnl/dnnl_rnn.cc +++ b/src/operator/nn/dnnl/dnnl_rnn.cc @@ -184,9 +184,9 @@ RnnPrimitive GetRnnFwdPrim(const DNNLRnnLayerParam& layer_param, memory::data_type data_type = get_dnnl_type(data.dtype()); memory::data_type weight_type = get_dnnl_type(params.dtype()); const prop_kind prop = is_train ? prop_kind::forward_training : prop_kind::forward_inference; - const rnn_direction dnnl_rnn_direction = layer_param.bidirectional - ? rnn_direction::bidirectional_concat - : rnn_direction::unidirectional; + const rnn_direction dnnl_rnn_direction = layer_param.bidirectional ? + rnn_direction::bidirectional_concat : + rnn_direction::unidirectional; auto src_layer_desc = memory::desc(layer_param.src_dims, data_type, tag::tnc); auto weight_layer_desc = memory::desc(layer_param.weight_layer_dims, weight_type, tag::any); @@ -196,15 +196,15 @@ RnnPrimitive GetRnnFwdPrim(const DNNLRnnLayerParam& layer_param, auto src_state_desc = memory::desc(layer_param.state_dims, data_type, tag::ldnc); auto src_cell_desc = memory::desc(layer_param.cell_dims, data_type, tag::ldnc); auto weight_peep_desc = memory::desc(); - auto weight_proj_desc = layer_param.proj_size > 0 - ? memory::desc(layer_param.weight_proj_dims, weight_type, tag::any) - : memory::desc(); - auto dst_state_desc = layer_param.state_outputs - ? memory::desc(layer_param.state_dims, data_type, tag::ldnc) - : memory::desc(); - auto dst_cell_desc = layer_param.state_outputs - ? memory::desc(layer_param.cell_dims, data_type, tag::ldnc) - : memory::desc(); + auto weight_proj_desc = layer_param.proj_size > 0 ? + memory::desc(layer_param.weight_proj_dims, weight_type, tag::any) : + memory::desc(); + auto dst_state_desc = layer_param.state_outputs ? + memory::desc(layer_param.state_dims, data_type, tag::ldnc) : + memory::desc(); + auto dst_cell_desc = layer_param.state_outputs ? + memory::desc(layer_param.cell_dims, data_type, tag::ldnc) : + memory::desc(); auto fwd = RnnPrimitive(); switch (mode) { @@ -265,8 +265,9 @@ RnnBwdPrimitive GetRnnBwdPrim(const DNNLRnnForwardTraining& fwd, memory::data_type data_type = get_dnnl_type(data.dtype()); memory::data_type weight_type = get_dnnl_type(params.dtype()); const prop_kind prop = prop_kind::backward; - rnn_direction dnnl_rnn_direction = layer_param.bidirectional ? 
rnn_direction::bidirectional_concat - : rnn_direction::unidirectional; + rnn_direction dnnl_rnn_direction = layer_param.bidirectional ? + rnn_direction::bidirectional_concat : + rnn_direction::unidirectional; auto src_layer_desc = memory::desc(layer_param.src_dims, data_type, tag::tnc); auto weight_layer_desc = memory::desc(layer_param.weight_layer_dims, weight_type, tag::any); @@ -274,9 +275,9 @@ RnnBwdPrimitive GetRnnBwdPrim(const DNNLRnnForwardTraining& fwd, auto bias_desc = memory::desc(layer_param.bias_dims, data_type, tag::ldgo); auto dst_layer_desc = memory::desc(layer_param.dst_dims, data_type, tag::tnc); auto src_state_desc = memory::desc(layer_param.state_dims, data_type, tag::ldnc); - auto dst_state_desc = layer_param.state_outputs - ? memory::desc(layer_param.state_dims, data_type, tag::ldnc) - : memory::desc(); + auto dst_state_desc = layer_param.state_outputs ? + memory::desc(layer_param.state_dims, data_type, tag::ldnc) : + memory::desc(); const void* fwd_pd = fwd.GetPrimDesc(); auto bwd = RnnBwdPrimitive(); @@ -1125,9 +1126,9 @@ void DNNLRnnOp::Forward(const OpContext& ctx, const int seq_length = default_param.seq_length_; const int batch_size = default_param.batch_size_; const int state_size = default_param.state_size; - const int iter_size = default_param.projection_size.has_value() - ? default_param.projection_size.value() - : default_param.state_size; + const int iter_size = default_param.projection_size.has_value() ? + default_param.projection_size.value() : + default_param.state_size; const int directions = default_param.bidirectional ? 2 : 1; dnnl::memory::desc dst_desc({seq_length, batch_size, directions * iter_size}, get_dnnl_type(data_dtype), diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index 898309579054..30ad7aa01b54 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -296,9 +296,9 @@ class PoolingOp { } stride = mxnet::TShape(ishape.ndim() - 2, 1); } - const int p_value = (param_.pool_type == pool_enum::kLpPooling && param_.p_value.has_value()) - ? param_.p_value.value() - : 1; + const int p_value = (param_.pool_type == pool_enum::kLpPooling && param_.p_value.has_value()) ? + param_.p_value.value() : + 1; const bool count_include_pad = (param_.count_include_pad.has_value()) ? param_.count_include_pad.value() : true; switch (p_value) { @@ -377,9 +377,9 @@ class PoolingOp { stride = mxnet::TShape(ishape.ndim() - 2, 1); } - const int p_value = (param_.pool_type == pool_enum::kLpPooling && param_.p_value.has_value()) - ? param_.p_value.value() - : 1; + const int p_value = (param_.pool_type == pool_enum::kLpPooling && param_.p_value.has_value()) ? + param_.p_value.value() : + 1; const bool count_include_pad = (param_.count_include_pad.has_value()) ? param_.count_include_pad.value() : true; switch (p_value) { diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 47114f8cc897..8fe054b54f89 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -157,8 +157,8 @@ static bool PoolingShape(const nnvm::NodeAttrs& attrs, CHECK(layout == mshadow::kNCW || layout == mshadow::kNWC) << "Need 1D layout"; // Perform shape calculations in a standard (NCW) layout space mshadow::Shape<3> dshape_ncw = - (layout == mshadow::kNWC) ? ConvertLayout(dshape.get<3>(), mshadow::kNWC, mshadow::kNCW) - : dshape.get<3>(); + (layout == mshadow::kNWC) ? 
ConvertLayout(dshape.get<3>(), mshadow::kNWC, mshadow::kNCW) : + dshape.get<3>(); mshadow::Shape<3> oshape_ncw = dshape_ncw; CHECK(param.kernel[0] <= dshape_ncw[2] + 2 * param.pad[0]) << "kernel size (" << param.kernel[0] << ") exceeds input (" << dshape[2] << " padded to " @@ -175,9 +175,9 @@ static bool PoolingShape(const nnvm::NodeAttrs& attrs, std::ceil(static_cast(dshape_ncw[2] + 2 * param.pad[0]) / param.stride[0])); } // Convert back from standard (NCW) layout space to the actual layout type - mxnet::TShape oshape = (layout == mshadow::kNWC) - ? ConvertLayout(oshape_ncw, mshadow::kNCW, mshadow::kNWC) - : oshape_ncw; + mxnet::TShape oshape = (layout == mshadow::kNWC) ? + ConvertLayout(oshape_ncw, mshadow::kNCW, mshadow::kNWC) : + oshape_ncw; out_shape->clear(); out_shape->push_back(oshape); // save output shape #if MXNET_USE_ONEDNN == 1 @@ -189,8 +189,9 @@ static bool PoolingShape(const nnvm::NodeAttrs& attrs, CHECK(layout == mshadow::kNCHW || layout == mshadow::kNHWC) << "Need 2D layout"; // Perform shape calculations in a standard (NCHW) layout space mshadow::Shape<4> dshape_nchw = - (layout == mshadow::kNHWC) ? ConvertLayout(dshape.get<4>(), mshadow::kNHWC, mshadow::kNCHW) - : dshape.get<4>(); + (layout == mshadow::kNHWC) ? + ConvertLayout(dshape.get<4>(), mshadow::kNHWC, mshadow::kNCHW) : + dshape.get<4>(); mshadow::Shape<4> oshape_nchw = dshape_nchw; CHECK(param.kernel[0] <= dshape_nchw[2] + 2 * param.pad[0]) << "kernel size (" << param.kernel[0] << ") exceeds input (" << dshape_nchw[2] @@ -212,9 +213,9 @@ static bool PoolingShape(const nnvm::NodeAttrs& attrs, param.stride[1])); } // Convert back from standard (NCHW) layout space to the actual layout type - mxnet::TShape oshape = (layout == mshadow::kNHWC) - ? ConvertLayout(oshape_nchw, mshadow::kNCHW, mshadow::kNHWC) - : oshape_nchw; + mxnet::TShape oshape = (layout == mshadow::kNHWC) ? + ConvertLayout(oshape_nchw, mshadow::kNCHW, mshadow::kNHWC) : + oshape_nchw; out_shape->clear(); out_shape->push_back(oshape); // save output shape #if MXNET_USE_ONEDNN == 1 @@ -226,9 +227,9 @@ static bool PoolingShape(const nnvm::NodeAttrs& attrs, CHECK(layout == mshadow::kNCDHW || layout == mshadow::kNDHWC) << "Need 3D layout"; // Perform shape calculations in a standard (NCDHW) layout space mshadow::Shape<5> dshape_ncdhw = - (layout == mshadow::kNDHWC) - ? ConvertLayout(dshape.get<5>(), mshadow::kNDHWC, mshadow::kNCDHW) - : dshape.get<5>(); + (layout == mshadow::kNDHWC) ? + ConvertLayout(dshape.get<5>(), mshadow::kNDHWC, mshadow::kNCDHW) : + dshape.get<5>(); mshadow::Shape<5> oshape_ncdhw = dshape_ncdhw; CHECK_LE(param.kernel[0], dshape_ncdhw[2] + 2 * param.pad[0]) << "kernel size exceeds input"; CHECK_LE(param.kernel[1], dshape_ncdhw[3] + 2 * param.pad[1]) << "kernel size exceeds input"; @@ -255,9 +256,9 @@ static bool PoolingShape(const nnvm::NodeAttrs& attrs, param.stride[2])); } // Convert back from standard (NCDHW) layout space to the actual layout type - mxnet::TShape oshape = (layout == mshadow::kNDHWC) - ? ConvertLayout(oshape_ncdhw, mshadow::kNCDHW, mshadow::kNDHWC) - : oshape_ncdhw; + mxnet::TShape oshape = (layout == mshadow::kNDHWC) ? 
+ ConvertLayout(oshape_ncdhw, mshadow::kNCDHW, mshadow::kNDHWC) : + oshape_ncdhw; out_shape->clear(); out_shape->push_back(oshape); // save output shape #if MXNET_USE_ONEDNN == 1 diff --git a/src/operator/nn/softmax-inl.h b/src/operator/nn/softmax-inl.h index 2787e419a156..9ee41cb8f9a6 100644 --- a/src/operator/nn/softmax-inl.h +++ b/src/operator/nn/softmax-inl.h @@ -284,16 +284,16 @@ inline void SoftmaxGrad(Stream* s, DType final_result; if (temperature == 1.0) { for (index_t j = 0; j < M; ++j) { - final_result = negate ? -OP2::Map(ograd[base + j * sa], out[base + j * sa], sum) - : OP2::Map(ograd[base + j * sa], out[base + j * sa], sum); + final_result = negate ? -OP2::Map(ograd[base + j * sa], out[base + j * sa], sum) : + OP2::Map(ograd[base + j * sa], out[base + j * sa], sum); final_result = (j < len) ? final_result : DType(0.0f); KERNEL_ASSIGN(igrad[base + j * sa], Req, final_result); } } else { for (index_t j = 0; j < M; ++j) { final_result = - negate ? -OP2::Map(ograd[base + j * sa], out[base + j * sa], sum) / temperature - : OP2::Map(ograd[base + j * sa], out[base + j * sa], sum) / temperature; + negate ? -OP2::Map(ograd[base + j * sa], out[base + j * sa], sum) / temperature : + OP2::Map(ograd[base + j * sa], out[base + j * sa], sum) / temperature; final_result = (j < len) ? final_result : DType(0.0f); KERNEL_ASSIGN(igrad[base + j * sa], Req, final_result); } @@ -314,15 +314,15 @@ inline void SoftmaxGrad(Stream* s, DType final_result; if (temperature == 1.0) { for (index_t j = 0; j < M; ++j) { - final_result = negate ? -OP2::Map(ograd[base + j * sa], out[base + j * sa], sum) - : OP2::Map(ograd[base + j * sa], out[base + j * sa], sum); + final_result = negate ? -OP2::Map(ograd[base + j * sa], out[base + j * sa], sum) : + OP2::Map(ograd[base + j * sa], out[base + j * sa], sum); KERNEL_ASSIGN(igrad[base + j * sa], Req, final_result); } } else { for (index_t j = 0; j < M; ++j) { final_result = - negate ? -OP2::Map(ograd[base + j * sa], out[base + j * sa], sum) / temperature - : OP2::Map(ograd[base + j * sa], out[base + j * sa], sum) / temperature; + negate ? -OP2::Map(ograd[base + j * sa], out[base + j * sa], sum) / temperature : + OP2::Map(ograd[base + j * sa], out[base + j * sa], sum) / temperature; KERNEL_ASSIGN(igrad[base + j * sa], Req, final_result); } } @@ -449,9 +449,9 @@ __global__ void masked_softmax_kernel(DType* in, for (index_t i = x; i < M; i += x_size) { val = (negate ? -in[base + i * sa] : in[base + i * sa]); bool mask_value = bcst_mask_axis ? in_mask[base_mask] : in_mask[base_mask + i * sa_mask]; - out[base + i * sa] = mask_value - ? DType(OP::Map((val - smax) / static_cast(temperature), ssum)) - : DType(masked_value); + out[base + i * sa] = mask_value ? + DType(OP::Map((val - smax) / static_cast(temperature), ssum)) : + DType(masked_value); } } @@ -578,8 +578,8 @@ __global__ void masked_softmax_stride1_kernel(const DType* in, masked_value = -INFINITY; for (index_t i = my_id; i < M; i += threads_per_row) { const DType val = (negate ? -row[i] : row[i]); - row[i] = row_mask[i] ? DType(OP::Map((val - smax) / static_cast(temperature), ssum)) - : DType(masked_value); + row[i] = row_mask[i] ? DType(OP::Map((val - smax) / static_cast(temperature), ssum)) : + DType(masked_value); } __syncthreads(); @@ -852,8 +852,8 @@ __global__ void masked_softmax_grad_kernel(OType* out, DType final_result; for (index_t i = x; i < M; i += x_size) { bool mask_value = bcst_mask_axis ? in_mask[base_mask] : in_mask[base_mask + i * sa_mask]; - final_result = negate ? 
-OP2::Map(ograd[base + i * sa], out[base + i * sa], ssum) - : OP2::Map(ograd[base + i * sa], out[base + i * sa], ssum); + final_result = negate ? -OP2::Map(ograd[base + i * sa], out[base + i * sa], ssum) : + OP2::Map(ograd[base + i * sa], out[base + i * sa], ssum); final_result = mask_value ? final_result / static_cast(temperature) : DType(0.0f); KERNEL_ASSIGN(igrad[base + i * sa], Req, final_result); } diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc index 8c88d53de939..5b9c4ae41a46 100644 --- a/src/operator/nn/softmax.cc +++ b/src/operator/nn/softmax.cc @@ -140,9 +140,9 @@ Example:: [](const NodeAttrs& attrs) { const SoftmaxParam& param = nnvm::get(attrs.parsed); - return (param.use_length.value()) - ? std::vector{"data", "length"} - : std::vector{"data"}; + return (param.use_length.value()) ? + std::vector{"data", "length"} : + std::vector{"data"}; }) .set_attr("FListOutputNames", [](const NodeAttrs& attrs) { diff --git a/src/operator/npx_control_flow.cc b/src/operator/npx_control_flow.cc index a1dd419513e9..0e154d3f1354 100644 --- a/src/operator/npx_control_flow.cc +++ b/src/operator/npx_control_flow.cc @@ -720,9 +720,9 @@ static void WhileLoopGradComputeExCPU(const OpStatePtr& state_ptr, } if (i < (size_t)params.num_args) { // a var - igrads[i] = (step == 0) - ? outputs[i] - : NDArray(outputs[i].shape(), outputs[i].ctx(), true, outputs[i].dtype()); + igrads[i] = (step == 0) ? + outputs[i] : + NDArray(outputs[i].shape(), outputs[i].ctx(), true, outputs[i].dtype()); iter_req[i] = (step == 0 || req[i] == kNullOp) ? req[i] : kWriteTo; ++i; diff --git a/src/operator/numpy/linalg/np_lstsq.cc b/src/operator/numpy/linalg/np_lstsq.cc index cfcfa4b3c4d3..2b867bc8134c 100644 --- a/src/operator/numpy/linalg/np_lstsq.cc +++ b/src/operator/numpy/linalg/np_lstsq.cc @@ -54,9 +54,9 @@ inline bool LstsqOpType(const nnvm::NodeAttrs& attrs, CHECK(b_type == mshadow::kFloat32 || b_type == mshadow::kFloat64) << "lstsq operation only supports 32-bit and 64-bit floating point"; - const mshadow::TypeFlag floatFlag = (mshadow::kFloat32 == a_type && mshadow::kFloat32 == b_type) - ? mshadow::kFloat32 - : mshadow::kFloat64; + const mshadow::TypeFlag floatFlag = (mshadow::kFloat32 == a_type && mshadow::kFloat32 == b_type) ? + mshadow::kFloat32 : + mshadow::kFloat64; TYPE_ASSIGN_CHECK(*out_attrs, 0, floatFlag); TYPE_ASSIGN_CHECK(*out_attrs, 1, floatFlag); TYPE_ASSIGN_CHECK(*out_attrs, 2, index_type_flag); diff --git a/src/operator/numpy/linalg/np_norm.cc b/src/operator/numpy/linalg/np_norm.cc index 735a6655b0b5..9838c9f59e39 100644 --- a/src/operator/numpy/linalg/np_norm.cc +++ b/src/operator/numpy/linalg/np_norm.cc @@ -165,8 +165,8 @@ bool NumpyNormShape(const nnvm::NodeAttrs& attrs, } else { TShape axis(param.axis.value().ndim(), 0); for (int i = 0; i < param.axis.value().ndim(); ++i) { - axis[i] = param.axis.value()[i] < 0 ? (*in_attrs)[0].ndim() + param.axis.value()[i] - : param.axis.value()[i]; + axis[i] = param.axis.value()[i] < 0 ? (*in_attrs)[0].ndim() + param.axis.value()[i] : + param.axis.value()[i]; } const_cast(param).axis = axis; if (param.axis.value().ndim() == 2) { diff --git a/src/operator/numpy/np_bincount_op.cc b/src/operator/numpy/np_bincount_op.cc index 6ede3a69f721..13d1c880fcf3 100644 --- a/src/operator/numpy/np_bincount_op.cc +++ b/src/operator/numpy/np_bincount_op.cc @@ -114,9 +114,9 @@ NNVM_REGISTER_OP(_npi_bincount) [](const NodeAttrs& attrs) { const NumpyBincountParam& params = nnvm::get(attrs.parsed); - return params.has_weights - ? 
std::vector{"data", "weights"} - : std::vector{"data"}; + return params.has_weights ? + std::vector{"data", "weights"} : + std::vector{"data"}; }) .set_attr("FResourceRequest", [](const NodeAttrs& attrs) { diff --git a/src/operator/numpy/np_boolean_mask_assign.cc b/src/operator/numpy/np_boolean_mask_assign.cc index 3687a10ed749..4283821ccfb3 100644 --- a/src/operator/numpy/np_boolean_mask_assign.cc +++ b/src/operator/numpy/np_boolean_mask_assign.cc @@ -262,9 +262,9 @@ void NumpyBooleanAssignForwardCPU(const nnvm::NodeAttrs& attrs, trailing, inputs[2].dptr()); } else { - bool need_broadcast = (vshape.ndim() == (dshape.ndim() - mshape.ndim() + 1)) - ? (vshape[start_axis] == 1) - : true; + bool need_broadcast = (vshape.ndim() == (dshape.ndim() - mshape.ndim() + 1)) ? + (vshape[start_axis] == 1) : + true; Kernel, cpu>::Launch(s, valid_num, data.dptr(), diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.h b/src/operator/numpy/np_broadcast_reduce_op_value.h index 68b475bf87e0..bf171133509f 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_value.h +++ b/src/operator/numpy/np_broadcast_reduce_op_value.h @@ -79,9 +79,9 @@ inline void TVMOpReduce(const OpContext& ctx, << "TVMOpReduce only supports ndim <= " << max_reduce_ndim; const TBlob expanded_output = - (input.ndim() == output.ndim() - ? output - : output.reshape(NumpyReduceAxesShapeImpl(input.shape_, axis, true))); + (input.ndim() == output.ndim() ? + output : + output.reshape(NumpyReduceAxesShapeImpl(input.shape_, axis, true))); CHECK_EQ(input.ndim(), expanded_output.ndim()); int reduce1st_dim = 0; if (input.ndim() > 0 && input.size(0) != expanded_output.size(0)) { diff --git a/src/operator/numpy/np_delete_op-inl.h b/src/operator/numpy/np_delete_op-inl.h index 5bb737fa75a7..901b15f204e3 100644 --- a/src/operator/numpy/np_delete_op-inl.h +++ b/src/operator/numpy/np_delete_op-inl.h @@ -281,8 +281,8 @@ void NumpyDeleteCompute(const nnvm::NodeAttrs& attrs, char* is_delete_ptr = nullptr; MSHADOW_TYPE_SWITCH( ((inputs.size() == 2U) ? // obj is tensor - inputs[delete_::kObj].dtype() - : mshadow::DataType::kFlag), + inputs[delete_::kObj].dtype() : + mshadow::DataType::kFlag), IType, { size_t temp_mem_size = sizeof(int64_t) * arr.shape()[axis] + sizeof(IType) * numtodel + @@ -342,8 +342,8 @@ void NumpyDeleteCompute(const nnvm::NodeAttrs& attrs, } MSHADOW_TYPE_SWITCH(((inputs.size() == 2U) ? // obj is tensor - inputs[delete_::kObj].dtype() - : mshadow::DataType::kFlag), + inputs[delete_::kObj].dtype() : + mshadow::DataType::kFlag), IType, { MXNET_NDIM_SWITCH(outshape.ndim(), ndim, { diff --git a/src/operator/numpy/np_delete_op.cc b/src/operator/numpy/np_delete_op.cc index 47026883beb2..36a4c9f6bb57 100644 --- a/src/operator/numpy/np_delete_op.cc +++ b/src/operator/numpy/np_delete_op.cc @@ -81,9 +81,9 @@ NNVM_REGISTER_OP(_npi_delete) const NumpyDeleteParam& params = nnvm::get(attrs.parsed); return (params.step.has_value() || - params.int_ind.has_value()) - ? std::vector{"arr"} - : std::vector{"arr", "obj"}; + params.int_ind.has_value()) ? + std::vector{"arr"} : + std::vector{"arr", "obj"}; }) .set_attr("FInferType", NumpyDeleteType) .set_attr("FComputeEx", NumpyDeleteCompute) diff --git a/src/operator/numpy/np_einsum_op-inl.h b/src/operator/numpy/np_einsum_op-inl.h index 5525b9209fc1..56e6f90b77c6 100644 --- a/src/operator/numpy/np_einsum_op-inl.h +++ b/src/operator/numpy/np_einsum_op-inl.h @@ -436,8 +436,8 @@ struct numpy_einsum { AType sum = 0; do { AType tmp = - back ? 
static_cast(out_grad[dot(oidx, ostride[nop]) + dot(ridx, rstride[nop])]) - : (AType)1; + back ? static_cast(out_grad[dot(oidx, ostride[nop]) + dot(ridx, rstride[nop])]) : + (AType)1; for (int iop = 0; iop < nop; ++iop) { if (iop != iop0) { index_t k = dot(oidx, ostride[iop]) + dot(ridx, rstride[iop]); diff --git a/src/operator/numpy/np_elemwise_broadcast_logic_op.h b/src/operator/numpy/np_elemwise_broadcast_logic_op.h index 9d25615757a6..fafee3faedfa 100644 --- a/src/operator/numpy/np_elemwise_broadcast_logic_op.h +++ b/src/operator/numpy/np_elemwise_broadcast_logic_op.h @@ -64,8 +64,8 @@ static constexpr char func_logical_xor_gpu[] = "logical_xor_gpu"; #pragma clang diagnostic pop inline bool NumpyBinaryLogicOpType(const nnvm::NodeAttrs& attrs, - std::vector* in_attrs, - std::vector* out_attrs) { + std::vector* in_attrs, + std::vector* out_attrs) { CHECK_EQ(in_attrs->size(), 2U); CHECK_EQ(out_attrs->size(), 1U); if (in_attrs->at(0) == -1 && in_attrs->at(1) == -1) @@ -260,17 +260,17 @@ struct GetBinaryBroadcastCompute { #if MXNET_USE_CUDA -#define MXNET_OPERATOR_REGISTER_NP_BINARY_LOGIC_GPU(name) \ - NNVM_REGISTER_OP(_npi_##name) \ - .set_attr("FCompute", BinaryBroadcastRTCCompute{"np_" #name}) +#define MXNET_OPERATOR_REGISTER_NP_BINARY_LOGIC_GPU(name) \ + NNVM_REGISTER_OP(_npi_##name) \ + .set_attr("FCompute", BinaryBroadcastRTCCompute{"np_" #name}) #endif // MXNET_USE_CUDA #endif // MXNET_USE_TVM_OP inline bool NumpyBinaryScalarLogicOpType(const nnvm::NodeAttrs& attrs, - std::vector* in_attrs, - std::vector* out_attrs) { + std::vector* in_attrs, + std::vector* out_attrs) { CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); if (in_attrs->at(0) == -1) @@ -342,7 +342,6 @@ struct TVMBinaryBroadcastScalarCompute { .add_argument("data", "NDArray-or-Symbol", "First input to the function") \ .add_arguments(NumpyBinaryScalarParam::__FIELDS__()) - #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wunused-const-variable" static constexpr char func_equal_scalar_cpu[] = "equal_scalar_cpu"; @@ -393,9 +392,9 @@ static constexpr char func_logical_xor_scalar_gpu[] = "logical_xor_scalar_gpu" #if MXNET_USE_CUDA -#define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR_LOGIC_GPU(name) \ - NNVM_REGISTER_OP(_npi_##name##_scalar) \ - .set_attr("FCompute", BinaryScalarRTCCompute{"np_" #name}) +#define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR_LOGIC_GPU(name) \ + NNVM_REGISTER_OP(_npi_##name##_scalar) \ + .set_attr("FCompute", BinaryScalarRTCCompute{"np_" #name}) #endif // MXNET_USE_CUDA diff --git a/src/operator/numpy/np_elemwise_broadcast_op.h b/src/operator/numpy/np_elemwise_broadcast_op.h index da40fe4044e7..97373d724324 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.h +++ b/src/operator/numpy/np_elemwise_broadcast_op.h @@ -549,8 +549,8 @@ void NumpyBinaryBackwardUseIn(const nnvm::NodeAttrs& attrs, .add_arguments(NumpyBinaryScalarParam::__FIELDS__()) inline bool NumpyBinaryMixedPrecisionType(const nnvm::NodeAttrs& attrs, - std::vector* in_attrs, - std::vector* out_attrs) { + std::vector* in_attrs, + std::vector* out_attrs) { CHECK_EQ(in_attrs->size(), 2U); CHECK_EQ(out_attrs->size(), 1U); const int ltype = in_attrs->at(0); diff --git a/src/operator/numpy/np_elemwise_broadcast_op_add.cc b/src/operator/numpy/np_elemwise_broadcast_op_add.cc index fd7fa3a62e73..50a79ab5dc2f 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_add.cc +++ b/src/operator/numpy/np_elemwise_broadcast_op_add.cc @@ -28,26 +28,27 @@ namespace mxnet { namespace op { 
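// Context for the registration below (editorial sketch, not part of the original patch):
// an NNVM binary broadcast operator pairs an FCompute kernel with an FGradient entry that
// defers to a separately registered backward op. A minimal version of the pattern, with
// "_npi_example_add" as a hypothetical operator name:
//
//   NNVM_REGISTER_OP(_npi_example_add)
//       .set_attr<FCompute>("FCompute",
//                           NumpyBinaryBroadcastComputeWithBool<cpu, op::mshadow_op::plus>)
//       .set_attr<nnvm::FGradient>("FGradient",
//                                  ElemwiseGradUseIn{"_backward_npi_broadcast_add"});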
MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_add) -.set_attr( - "FCompute", - NumpyBinaryBroadcastComputeWithBool) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_add"}); + .set_attr("FCompute", + NumpyBinaryBroadcastComputeWithBool) + .set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_add"}); NNVM_REGISTER_OP(_backward_npi_broadcast_add) -.set_num_inputs(3) -.set_num_outputs(2) -.set_attr("TIsBackward", true) -.set_attr("FInplaceOption", - [](const NodeAttrs& attrs){ - return std::vector >{{0, 0}, {0, 1}}; - }) -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -.set_attr("FCompute", NumpyBinaryBackwardUseIn); + .set_num_inputs(3) + .set_num_outputs(2) + .set_attr("TIsBackward", true) + .set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}, {0, 1}}; + }) + .set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) + .set_attr("FCompute", + NumpyBinaryBackwardUseIn); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_broadcast_op_add.cu b/src/operator/numpy/np_elemwise_broadcast_op_add.cu index ad8cc6053c40..43802971ed36 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_add.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op_add.cu @@ -27,11 +27,10 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_add) -.set_attr("FCompute", BinaryBroadcastRTCCompute{"add"}); +NNVM_REGISTER_OP(_npi_add).set_attr("FCompute", BinaryBroadcastRTCCompute{"add"}); NNVM_REGISTER_OP(_backward_npi_broadcast_add) -.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"one", "one"}); + .set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"one", "one"}); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_broadcast_op_mod.cc b/src/operator/numpy/np_elemwise_broadcast_op_mod.cc index 0dfe0999a3ed..e47a2f2bc96f 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_mod.cc +++ b/src/operator/numpy/np_elemwise_broadcast_op_mod.cc @@ -28,26 +28,27 @@ namespace mxnet { namespace op { MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_mod) -.set_attr( - "FCompute", - NumpyBinaryBroadcastCompute) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_mod"}); + .set_attr("FCompute", + NumpyBinaryBroadcastCompute) + .set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_mod"}); NNVM_REGISTER_OP(_backward_npi_broadcast_mod) -.set_num_inputs(3) -.set_num_outputs(2) -.set_attr("TIsBackward", true) -.set_attr("FInplaceOption", - [](const NodeAttrs& attrs){ - return std::vector >{{0, 1}}; - }) -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -.set_attr("FCompute", NumpyBinaryBackwardUseIn); + .set_num_inputs(3) + .set_num_outputs(2) + .set_attr("TIsBackward", true) + .set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 1}}; + }) + .set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) + .set_attr("FCompute", + NumpyBinaryBackwardUseIn); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_broadcast_op_mod.cu b/src/operator/numpy/np_elemwise_broadcast_op_mod.cu index 642b2f5ccc7c..20ca4e311ba7 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_mod.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op_mod.cu @@ -27,11 +27,10 @@ 
namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_mod) -.set_attr("FCompute", BinaryBroadcastRTCCompute{"mod"}); +NNVM_REGISTER_OP(_npi_mod).set_attr("FCompute", BinaryBroadcastRTCCompute{"mod"}); NNVM_REGISTER_OP(_backward_npi_broadcast_mod) -.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"mod_grad", "mod_rgrad"}); + .set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"mod_grad", "mod_rgrad"}); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_broadcast_op_mul.cc b/src/operator/numpy/np_elemwise_broadcast_op_mul.cc index c5180e41faee..3e627c8c7e10 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_mul.cc +++ b/src/operator/numpy/np_elemwise_broadcast_op_mul.cc @@ -28,26 +28,27 @@ namespace mxnet { namespace op { MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_multiply) -.set_attr( - "FCompute", - NumpyBinaryBroadcastComputeWithBool) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_mul"}); + .set_attr("FCompute", + NumpyBinaryBroadcastComputeWithBool) + .set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_mul"}); NNVM_REGISTER_OP(_backward_npi_broadcast_mul) -.set_num_inputs(3) -.set_num_outputs(2) -.set_attr("TIsBackward", true) -.set_attr("FInplaceOption", - [](const NodeAttrs& attrs){ - return std::vector >{{0, 1}}; - }) -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -.set_attr("FCompute", NumpyBinaryBackwardUseIn); + .set_num_inputs(3) + .set_num_outputs(2) + .set_attr("TIsBackward", true) + .set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 1}}; + }) + .set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) + .set_attr("FCompute", + NumpyBinaryBackwardUseIn); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_broadcast_op_mul.cu b/src/operator/numpy/np_elemwise_broadcast_op_mul.cu index c720b79f4c0d..882855ddc264 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_mul.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op_mul.cu @@ -28,10 +28,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_multiply) -.set_attr("FCompute", BinaryBroadcastRTCCompute{"mul"}); + .set_attr("FCompute", BinaryBroadcastRTCCompute{"mul"}); NNVM_REGISTER_OP(_backward_npi_broadcast_mul) -.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"right", "left"}); + .set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"right", "left"}); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_broadcast_op_pow.cc b/src/operator/numpy/np_elemwise_broadcast_op_pow.cc index c281d125a45c..aa5f4c4dbb5d 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_pow.cc +++ b/src/operator/numpy/np_elemwise_broadcast_op_pow.cc @@ -28,26 +28,28 @@ namespace mxnet { namespace op { MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_power) -.set_attr( - "FCompute", - NumpyBinaryBroadcastComputeWithBool) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_power"}); + .set_attr("FCompute", + NumpyBinaryBroadcastComputeWithBool) + .set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_power"}); NNVM_REGISTER_OP(_backward_npi_broadcast_power) -.set_num_inputs(3) -.set_num_outputs(2) -.set_attr("TIsBackward", true) -.set_attr("FInplaceOption", - [](const NodeAttrs& attrs){ - return std::vector >{{0, 1}}; - }) -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return 
std::vector{ResourceRequest::kTempSpace}; - }) -.set_attr("FCompute", NumpyBinaryBackwardUseIn); + .set_num_inputs(3) + .set_num_outputs(2) + .set_attr("TIsBackward", true) + .set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 1}}; + }) + .set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) + .set_attr( + "FCompute", + NumpyBinaryBackwardUseIn); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_broadcast_op_pow.cu b/src/operator/numpy/np_elemwise_broadcast_op_pow.cu index 3a78ba6fd8d7..9e79578a9413 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_pow.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op_pow.cu @@ -28,10 +28,11 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_power) -.set_attr("FCompute", BinaryBroadcastRTCCompute{"power"}); + .set_attr("FCompute", BinaryBroadcastRTCCompute{"power"}); NNVM_REGISTER_OP(_backward_npi_broadcast_power) -.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"power_grad", "power_rgrad"}); + .set_attr("FCompute", + BinaryBroadcastRTCBackwardUseIn{"power_grad", "power_rgrad"}); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_broadcast_op_scalar.cc b/src/operator/numpy/np_elemwise_broadcast_op_scalar.cc index 4fd1f2c84070..e4e61d12262a 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_scalar.cc +++ b/src/operator/numpy/np_elemwise_broadcast_op_scalar.cc @@ -30,36 +30,36 @@ namespace op { DMLC_REGISTER_PARAMETER(NumpyBinaryScalarParam); MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_add_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FGradient", ElemwiseGradUseNone{"_copy"}); + .set_attr("FCompute", BinaryScalarOp::Compute) + .set_attr("FGradient", ElemwiseGradUseNone{"_copy"}); MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_subtract_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FGradient", ElemwiseGradUseNone{"_copy"}); + .set_attr("FCompute", BinaryScalarOp::Compute) + .set_attr("FGradient", ElemwiseGradUseNone{"_copy"}); MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rsubtract_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FGradient", ElemwiseGradUseNone{"negative"}); + .set_attr("FCompute", BinaryScalarOp::Compute) + .set_attr("FGradient", ElemwiseGradUseNone{"negative"}); MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_multiply_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FGradient", ElemwiseGradUseNone{"_backward_mul_scalar"}); + .set_attr("FCompute", BinaryScalarOp::Compute) + .set_attr("FGradient", ElemwiseGradUseNone{"_backward_mul_scalar"}); MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_mod_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_mod_scalar"}); + .set_attr("FCompute", BinaryScalarOp::Compute) + .set_attr("FGradient", ElemwiseGradUseIn{"_backward_mod_scalar"}); MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rmod_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_rmod_scalar"}); + .set_attr("FCompute", BinaryScalarOp::Compute) + .set_attr("FGradient", ElemwiseGradUseIn{"_backward_rmod_scalar"}); MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_power_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_power_scalar"}); + .set_attr("FCompute", BinaryScalarOp::Compute) + .set_attr("FGradient", 
ElemwiseGradUseIn{"_backward_power_scalar"}); MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rpower_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FGradient", ElemwiseGradUseOut{"_backward_rpower_scalar"}); + .set_attr("FCompute", BinaryScalarOp::Compute) + .set_attr("FGradient", ElemwiseGradUseOut{"_backward_rpower_scalar"}); MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_floor_divide_scalar) .set_attr("FCompute", BinaryScalarOp::Compute) diff --git a/src/operator/numpy/np_elemwise_broadcast_op_scalar.cu b/src/operator/numpy/np_elemwise_broadcast_op_scalar.cu index c7bbeefb4445..21a8aeddf41d 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_scalar.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op_scalar.cu @@ -28,28 +28,28 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_add_scalar) -.set_attr("FCompute", BinaryScalarRTCCompute{"add"}); + .set_attr("FCompute", BinaryScalarRTCCompute{"add"}); NNVM_REGISTER_OP(_npi_subtract_scalar) -.set_attr("FCompute", BinaryScalarRTCCompute{"sub"}); + .set_attr("FCompute", BinaryScalarRTCCompute{"sub"}); NNVM_REGISTER_OP(_npi_rsubtract_scalar) -.set_attr("FCompute", BinaryScalarRTCCompute{"rsub"}); + .set_attr("FCompute", BinaryScalarRTCCompute{"rsub"}); NNVM_REGISTER_OP(_npi_multiply_scalar) -.set_attr("FCompute", BinaryScalarRTCCompute{"mul"}); + .set_attr("FCompute", BinaryScalarRTCCompute{"mul"}); NNVM_REGISTER_OP(_npi_mod_scalar) -.set_attr("FCompute", BinaryScalarRTCCompute{"mod"}); + .set_attr("FCompute", BinaryScalarRTCCompute{"mod"}); NNVM_REGISTER_OP(_npi_rmod_scalar) -.set_attr("FCompute", BinaryScalarRTCCompute{"rmod"}); + .set_attr("FCompute", BinaryScalarRTCCompute{"rmod"}); NNVM_REGISTER_OP(_npi_power_scalar) -.set_attr("FCompute", BinaryScalarRTCCompute{"power"}); + .set_attr("FCompute", BinaryScalarRTCCompute{"power"}); NNVM_REGISTER_OP(_npi_rpower_scalar) -.set_attr("FCompute", BinaryScalarRTCCompute{"rpow"}); + .set_attr("FCompute", BinaryScalarRTCCompute{"rpow"}); NNVM_REGISTER_OP(_npi_floor_divide_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"floor_divide"}); diff --git a/src/operator/numpy/np_elemwise_broadcast_op_sub.cc b/src/operator/numpy/np_elemwise_broadcast_op_sub.cc index ff6501d3d413..5f3ba7653549 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_sub.cc +++ b/src/operator/numpy/np_elemwise_broadcast_op_sub.cc @@ -28,26 +28,27 @@ namespace mxnet { namespace op { MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_subtract) -.set_attr( - "FCompute", - NumpyBinaryBroadcastCompute) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_sub"}); + .set_attr("FCompute", + NumpyBinaryBroadcastCompute) + .set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_sub"}); NNVM_REGISTER_OP(_backward_npi_broadcast_sub) -.set_num_inputs(3) -.set_num_outputs(2) -.set_attr("TIsBackward", true) -.set_attr("FInplaceOption", - [](const NodeAttrs& attrs){ - return std::vector >{{0, 0}, {0, 1}}; - }) -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -.set_attr("FCompute", NumpyBinaryBackwardUseIn); + .set_num_inputs(3) + .set_num_outputs(2) + .set_attr("TIsBackward", true) + .set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}, {0, 1}}; + }) + .set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) + .set_attr("FCompute", + NumpyBinaryBackwardUseIn); } // namespace op } // namespace mxnet diff --git 
a/src/operator/numpy/np_elemwise_broadcast_op_sub.cu b/src/operator/numpy/np_elemwise_broadcast_op_sub.cu index 2709dc3eec09..943e8fd96683 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_sub.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op_sub.cu @@ -28,10 +28,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_subtract) -.set_attr("FCompute", BinaryBroadcastRTCCompute{"sub"}); + .set_attr("FCompute", BinaryBroadcastRTCCompute{"sub"}); NNVM_REGISTER_OP(_backward_npi_broadcast_sub) -.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"one", "negone"}); + .set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"one", "negone"}); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_insert_op_scalar-inl.h b/src/operator/numpy/np_insert_op_scalar-inl.h index 7a9b8952682a..21ae59bf362d 100644 --- a/src/operator/numpy/np_insert_op_scalar-inl.h +++ b/src/operator/numpy/np_insert_op_scalar-inl.h @@ -56,9 +56,9 @@ void NumpyInsertScalarCompute(const nnvm::NodeAttrs& attrs, int axis = param.axis.has_value() ? param.axis.value() : 0; TBlob arr; TBlob values = - param.val.has_value() - ? TBlob(nullptr, mxnet::TShape(0, 1), xpu::kDevMask, outputs[out_pos].type_flag_) - : inputs[val_pos]; + param.val.has_value() ? + TBlob(nullptr, mxnet::TShape(0, 1), xpu::kDevMask, outputs[out_pos].type_flag_) : + inputs[val_pos]; if (!param.axis.has_value()) { arr = inputs[arr_pos].reshape(Shape1(inputs[arr_pos].shape_.Size())); ndim = 1; diff --git a/src/operator/numpy/np_insert_op_slice-inl.h b/src/operator/numpy/np_insert_op_slice-inl.h index 92768c3754d8..7c1ade35e6b3 100644 --- a/src/operator/numpy/np_insert_op_slice-inl.h +++ b/src/operator/numpy/np_insert_op_slice-inl.h @@ -55,9 +55,9 @@ void NumpyInsertSliceCompute(const nnvm::NodeAttrs& attrs, int axis = param.axis.has_value() ? param.axis.value() : 0; TBlob arr; TBlob values = - param.val.has_value() - ? TBlob(nullptr, mxnet::TShape(0, 1), xpu::kDevMask, outputs[out_pos].type_flag_) - : inputs[val_pos]; + param.val.has_value() ? + TBlob(nullptr, mxnet::TShape(0, 1), xpu::kDevMask, outputs[out_pos].type_flag_) : + inputs[val_pos]; if (!param.axis.has_value()) { arr = inputs[arr_pos].reshape(Shape1(inputs[arr_pos].shape_.Size())); ndim = 1; diff --git a/src/operator/numpy/np_insert_op_tensor-inl.h b/src/operator/numpy/np_insert_op_tensor-inl.h index cb5fdce88134..594e135dd336 100644 --- a/src/operator/numpy/np_insert_op_tensor-inl.h +++ b/src/operator/numpy/np_insert_op_tensor-inl.h @@ -65,9 +65,9 @@ void NumpyInsertTensorCompute(const nnvm::NodeAttrs& attrs, int axis = param.axis.has_value() ? param.axis.value() : 0; TBlob arr; TBlob values = - param.val.has_value() - ? TBlob(nullptr, mxnet::TShape(0, 1), xpu::kDevMask, outputs[out_pos].type_flag_) - : inputs[val_pos]; + param.val.has_value() ? + TBlob(nullptr, mxnet::TShape(0, 1), xpu::kDevMask, outputs[out_pos].type_flag_) : + inputs[val_pos]; if (!param.axis.has_value()) { arr = inputs[arr_pos].reshape(Shape1(inputs[arr_pos].shape_.Size())); ndim = 1; diff --git a/src/operator/numpy/np_interp_op.cc b/src/operator/numpy/np_interp_op.cc index a5d60b76194d..525460276419 100644 --- a/src/operator/numpy/np_interp_op.cc +++ b/src/operator/numpy/np_interp_op.cc @@ -68,9 +68,9 @@ NNVM_REGISTER_OP(_npi_interp) [](const NodeAttrs& attrs) { const NumpyInterpParam& param = nnvm::get(attrs.parsed); - return param.x_is_scalar - ? std::vector{"xp", "fp"} - : std::vector{"xp", "fp", "x"}; + return param.x_is_scalar ? 
+ std::vector{"xp", "fp"} : + std::vector{"xp", "fp", "x"}; }) .set_attr("FCompute", NumpyInterpForward) .set_attr("FResourceRequest", diff --git a/src/operator/numpy/np_moments_op.cc b/src/operator/numpy/np_moments_op.cc index a6b5cce67fd2..773f2e166465 100644 --- a/src/operator/numpy/np_moments_op.cc +++ b/src/operator/numpy/np_moments_op.cc @@ -157,9 +157,9 @@ NNVM_REGISTER_OP(_npi_average) [](const NodeAttrs& attrs) { const auto& param = nnvm::get(attrs.parsed); - return param.weighted - ? std::vector{"a", "weights"} - : std::vector{"a"}; + return param.weighted ? + std::vector{"a", "weights"} : + std::vector{"a"}; }) .add_argument("a", "NDArray-or-Symbol", "The input") .add_argument("weights", "NDArray-or-Symbol", "The weights to calculate average") diff --git a/src/operator/numpy/np_percentile_op.cc b/src/operator/numpy/np_percentile_op.cc index a15f17602ffc..57164dbcacc3 100644 --- a/src/operator/numpy/np_percentile_op.cc +++ b/src/operator/numpy/np_percentile_op.cc @@ -95,9 +95,9 @@ NNVM_REGISTER_OP(_npi_percentile) [](const NodeAttrs& attrs) { const NumpyPercentileParam& param = nnvm::get(attrs.parsed); - return param.q_scalar.has_value() - ? std::vector{"a"} - : std::vector{"a", "q"}; + return param.q_scalar.has_value() ? + std::vector{"a"} : + std::vector{"a", "q"}; }) .set_attr("FCompute", NumpyPercentileForward) .set_attr("FResourceRequest", diff --git a/src/operator/numpy/np_true_divide.cc b/src/operator/numpy/np_true_divide.cc index 13fb72ca970a..9696f3f3ec46 100644 --- a/src/operator/numpy/np_true_divide.cc +++ b/src/operator/numpy/np_true_divide.cc @@ -54,9 +54,9 @@ bool TrueDivideType(const nnvm::NodeAttrs& attrs, const int lhs_dtype = in_attrs->at(0); const int rhs_dtype = - (num_inputs == 2) - ? in_attrs->at(1) - : (common::is_float(lhs_dtype) ? lhs_dtype : mxnet::common::GetDefaultDtype()); + (num_inputs == 2) ? + in_attrs->at(1) : + (common::is_float(lhs_dtype) ? lhs_dtype : mxnet::common::GetDefaultDtype()); TYPE_ASSIGN_CHECK(*out_attrs, 0, TrueDivideOutType(lhs_dtype, rhs_dtype)); return true; } diff --git a/src/operator/numpy/np_unique_op.cc b/src/operator/numpy/np_unique_op.cc index 0c4e7fceebe8..9c82122afab4 100644 --- a/src/operator/numpy/np_unique_op.cc +++ b/src/operator/numpy/np_unique_op.cc @@ -86,9 +86,10 @@ struct UniqueComputeMaskCPUKernel { out_data[i] = 1; } else { out_data[i] = - (std::memcmp(in_data + i * numel, in_data + (i - 1) * numel, numel * sizeof(DType)) == 0) - ? 0 - : 1; + (std::memcmp(in_data + i * numel, in_data + (i - 1) * numel, numel * sizeof(DType)) == + 0) ? + 0 : + 1; } } }; diff --git a/src/operator/numpy/random/np_bernoulli_op.cc b/src/operator/numpy/random/np_bernoulli_op.cc index 4d3546d53c69..fafd9170b2cb 100644 --- a/src/operator/numpy/random/np_bernoulli_op.cc +++ b/src/operator/numpy/random/np_bernoulli_op.cc @@ -48,9 +48,9 @@ NNVM_REGISTER_OP(_npi_bernoulli) if (param.logit.has_value() || param.prob.has_value()) { num_inputs -= 1; } - return (num_inputs == 0) - ? std::vector() - : std::vector{"input1"}; + return (num_inputs == 0) ? 
+ std::vector() : + std::vector{"input1"}; }) .set_attr_parser(ParamParser) .set_attr("FInferShape", TwoparamsDistOpShape) diff --git a/src/operator/numpy/random/np_exponential_op.cc b/src/operator/numpy/random/np_exponential_op.cc index 3d37ce5dcfaf..920cbfecffbc 100644 --- a/src/operator/numpy/random/np_exponential_op.cc +++ b/src/operator/numpy/random/np_exponential_op.cc @@ -51,9 +51,9 @@ NNVM_REGISTER_OP(_npi_exponential) if (param.scale.has_value()) { num_inputs -= 1; } - return (num_inputs == 0) - ? std::vector() - : std::vector{"input1"}; + return (num_inputs == 0) ? + std::vector() : + std::vector{"input1"}; }) .set_attr_parser(ParamParser) .set_attr("FInferShape", TwoparamsDistOpShape) diff --git a/src/operator/numpy/random/np_pareto_op.cc b/src/operator/numpy/random/np_pareto_op.cc index e0c7650b7ddf..f0c7a8d7dc17 100644 --- a/src/operator/numpy/random/np_pareto_op.cc +++ b/src/operator/numpy/random/np_pareto_op.cc @@ -51,9 +51,9 @@ NNVM_REGISTER_OP(_npi_pareto) if (param.a.has_value()) { num_inputs -= 1; } - return (num_inputs == 0) - ? std::vector() - : std::vector{"input1"}; + return (num_inputs == 0) ? + std::vector() : + std::vector{"input1"}; }) .set_attr_parser(ParamParser) .set_attr("FInferShape", TwoparamsDistOpShape) diff --git a/src/operator/numpy/random/np_power_op.cc b/src/operator/numpy/random/np_power_op.cc index 0376aa9d9f4c..336ae1502bc8 100644 --- a/src/operator/numpy/random/np_power_op.cc +++ b/src/operator/numpy/random/np_power_op.cc @@ -48,9 +48,9 @@ NNVM_REGISTER_OP(_npi_powerd) if (param.a.has_value()) { num_inputs -= 1; } - return (num_inputs == 0) - ? std::vector() - : std::vector{"input1"}; + return (num_inputs == 0) ? + std::vector() : + std::vector{"input1"}; }) .set_attr_parser(ParamParser) .set_attr("FInferShape", UnaryDistOpShape) diff --git a/src/operator/numpy/random/np_rayleigh_op.cc b/src/operator/numpy/random/np_rayleigh_op.cc index 0b0085af9cd5..37cbd11f87ea 100644 --- a/src/operator/numpy/random/np_rayleigh_op.cc +++ b/src/operator/numpy/random/np_rayleigh_op.cc @@ -51,9 +51,9 @@ NNVM_REGISTER_OP(_npi_rayleigh) if (param.scale.has_value()) { num_inputs -= 1; } - return (num_inputs == 0) - ? std::vector() - : std::vector{"input1"}; + return (num_inputs == 0) ? + std::vector() : + std::vector{"input1"}; }) .set_attr_parser(ParamParser) .set_attr("FInferShape", TwoparamsDistOpShape) diff --git a/src/operator/numpy/random/np_weibull_op.cc b/src/operator/numpy/random/np_weibull_op.cc index 6e02114040b9..f1c490a2a8c4 100644 --- a/src/operator/numpy/random/np_weibull_op.cc +++ b/src/operator/numpy/random/np_weibull_op.cc @@ -51,9 +51,9 @@ NNVM_REGISTER_OP(_npi_weibull) if (param.a.has_value()) { num_inputs -= 1; } - return (num_inputs == 0) - ? std::vector() - : std::vector{"input1"}; + return (num_inputs == 0) ? + std::vector() : + std::vector{"input1"}; }) .set_attr_parser(ParamParser) .set_attr("FInferShape", TwoparamsDistOpShape) diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 122ae8a076c0..5cc23364c0db 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -1272,9 +1272,9 @@ struct FTMLKernel { const DType clip_grad, const OpReqType req) { using namespace mshadow_op; - const DType grad_i = clip_grad >= 0.0f - ? clip::Map(rescale_grad * grad[i], clip_grad) + wd * weight[i] - : (rescale_grad * grad[i] + wd * weight[i]); + const DType grad_i = clip_grad >= 0.0f ? 
+ clip::Map(rescale_grad * grad[i], clip_grad) + wd * weight[i] : + (rescale_grad * grad[i] + wd * weight[i]); v[i] = beta2 * v[i] + (1 - beta2) * square::Map(grad_i); const DType d_t = (1 - power::Map(beta1, t)) / lr * (square_root::Map(v[i] / (1 - power::Map(beta2, t))) + epsilon); diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc index 89b50aa61e15..c3fd47dadd17 100644 --- a/src/operator/optimizer_op.cc +++ b/src/operator/optimizer_op.cc @@ -228,8 +228,8 @@ struct AdamStdDnsRspDnsKernel { const RType grad_i = (prefix_sum[i] - 1) * row_length; for (index_t j = 0; j < row_length; j++) { const index_t data_i = row_i + j; - DType grad_rescaled = non_zero ? static_cast(grad_data[grad_i + j] * rescale_grad) - : static_cast(0); + DType grad_rescaled = non_zero ? static_cast(grad_data[grad_i + j] * rescale_grad) : + static_cast(0); if (clip_gradient >= 0.0f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index f70e9fdd67cb..4c75eb0c72fc 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -163,8 +163,8 @@ struct AdamStdDnsRspDnsKernel { const bool non_zero = (row_id == 0) ? prefix_sum[0] > 0 : prefix_sum[row_id] > prefix_sum[row_id - 1]; const RType grad_offset = (prefix_sum[row_id] - 1) * row_length + col_id; - DType grad_rescaled = non_zero ? static_cast(grad_data[grad_offset] * rescale_grad) - : static_cast(0); + DType grad_rescaled = non_zero ? static_cast(grad_data[grad_offset] * rescale_grad) : + static_cast(0); if (clip_gradient >= 0.0f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } diff --git a/src/operator/random/sampler.h b/src/operator/random/sampler.h index 7ed5529dc62e..296833c93999 100644 --- a/src/operator/random/sampler.h +++ b/src/operator/random/sampler.h @@ -370,10 +370,10 @@ struct SampleGeneralizedNegativeBinomialKernel { RNG_KERNEL_LOOP(xpu, float, id, gen, N, step, { index_t nBatch(1 + (nSample - 1) / nParm); float lambda = - alpha[i / nBatch] == 0 - ? static_cast(mu[i / nBatch]) - : SampleGamma( - IType(1) / alpha[i / nBatch], alpha[i / nBatch] * mu[i / nBatch], &genImpl); + alpha[i / nBatch] == 0 ? + static_cast(mu[i / nBatch]) : + SampleGamma( + IType(1) / alpha[i / nBatch], alpha[i / nBatch] * mu[i / nBatch], &genImpl); out[i] = OType(SamplePoisson(lambda, &genImpl)); }); } diff --git a/src/operator/random/shuffle_op.cu b/src/operator/random/shuffle_op.cu index b66943e456bc..33e1ec28f9fd 100644 --- a/src/operator/random/shuffle_op.cu +++ b/src/operator/random/shuffle_op.cu @@ -76,8 +76,8 @@ void ShuffleForwardGPU(const nnvm::NodeAttrs& attrs, SortByKey(keys, out, true); } else { const size_t tmp_space_size = - req[0] == kWriteInplace ? 2 * first_axis_len * sizeof(index_t) + size * sizeof(DType) - : 2 * first_axis_len * sizeof(index_t); + req[0] == kWriteInplace ? 2 * first_axis_len * sizeof(index_t) + size * sizeof(DType) : + 2 * first_axis_len * sizeof(index_t); Tensor tmp_space = ctx.requested[1].get_space_typed(Shape1(tmp_space_size), s); char* tmp_space_ptr = tmp_space.dptr_; diff --git a/src/operator/sequence_last-inl.h b/src/operator/sequence_last-inl.h index b6cfc79e1122..c37a65f31ecc 100644 --- a/src/operator/sequence_last-inl.h +++ b/src/operator/sequence_last-inl.h @@ -181,9 +181,9 @@ class SequenceLastOp : public Operator { Tensor out = out_data[seq_last::kOut].get_with_shape(Shape2(batch, rest_size), s); Tensor indices = - param_.use_sequence_length - ? 
in_data[seq_last::kSequenceLength].get(s) - : ctx.requested[seq_last::kTempSpace].get_space_typed(Shape1(batch), s); + param_.use_sequence_length ? + in_data[seq_last::kSequenceLength].get(s) : + ctx.requested[seq_last::kTempSpace].get_space_typed(Shape1(batch), s); if (!param_.use_sequence_length) indices = max_seq_len; @@ -223,9 +223,9 @@ class SequenceLastOp : public Operator { Tensor output_grad = out_grad[seq_last::kOut].get_with_shape(Shape2(batch, rest_size), s); Tensor indices = - param_.use_sequence_length - ? in_data[seq_last::kSequenceLength].get(s) - : ctx.requested[seq_last::kTempSpace].get_space_typed(Shape1(batch), s); + param_.use_sequence_length ? + in_data[seq_last::kSequenceLength].get(s) : + ctx.requested[seq_last::kTempSpace].get_space_typed(Shape1(batch), s); if (req[seq_last::kData] == kWriteTo) data_grad = 0.0f; diff --git a/src/operator/subgraph/build_subgraph.cc b/src/operator/subgraph/build_subgraph.cc index 9af7f49178e1..ef1218b49df0 100644 --- a/src/operator/subgraph/build_subgraph.cc +++ b/src/operator/subgraph/build_subgraph.cc @@ -853,9 +853,9 @@ nnvm::Graph BuildSubgraph(nnvm::Graph&& g) { const SubgraphPropertyPtr& subg_prop = g.GetAttr("subgraph_property"); if (verbose > 1) { - const std::string& prop_name = subg_prop->HasAttr("property_name") - ? subg_prop->GetAttr("property_name") - : "partition graph"; + const std::string& prop_name = subg_prop->HasAttr("property_name") ? + subg_prop->GetAttr("property_name") : + "partition graph"; LOG(INFO) << "start to execute " << prop_name << "."; } // top sort NodeEntry of all the nodes' inputs diff --git a/src/operator/subgraph/dnnl/dnnl_conv.cc b/src/operator/subgraph/dnnl/dnnl_conv.cc index f85ece31e450..e9fab47e6f44 100644 --- a/src/operator/subgraph/dnnl/dnnl_conv.cc +++ b/src/operator/subgraph/dnnl/dnnl_conv.cc @@ -414,9 +414,10 @@ static uint32_t SgDNNLConvNumInputs(const NodeAttrs& attrs) { auto num_input = DefaultSubgraphOpNumInputs(attrs); if (param.full_conv_param.dnnl_param.quantized) return num_input + 2 + - (param.full_conv_param.dnnl_param.with_sum && !param.full_conv_param.dnnl_param.dedup_sum - ? 2 - : 0); + (param.full_conv_param.dnnl_param.with_sum && + !param.full_conv_param.dnnl_param.dedup_sum ? + 2 : + 0); else return num_input; } @@ -468,10 +469,10 @@ static void SgDNNLConvParamParser(nnvm::NodeAttrs* attrs) { } else if (node_name == "Convolution") { param_.full_conv_param.conv_param = nnvm::get(node->attrs.parsed); } else if (node_name == "Activation" || node_name == "LeakyReLU" || node_name == "clip") { - auto& post_act_param = (param_.full_conv_param.dnnl_param.with_act && !with_act) - ? param_.full_conv_param.act_param - : param_.full_conv_param.postsum_act_param; - with_act = true; + auto& post_act_param = (param_.full_conv_param.dnnl_param.with_act && !with_act) ? + param_.full_conv_param.act_param : + param_.full_conv_param.postsum_act_param; + with_act = true; if (node_name == "Activation") { const auto act_param = nnvm::get(node->attrs.parsed); post_act_param.alg = GetDNNLActAlgo(act_param); diff --git a/src/operator/subgraph/dnnl/dnnl_fc.cc b/src/operator/subgraph/dnnl/dnnl_fc.cc index c07b8f7b8835..44c1a3585156 100644 --- a/src/operator/subgraph/dnnl/dnnl_fc.cc +++ b/src/operator/subgraph/dnnl/dnnl_fc.cc @@ -670,8 +670,8 @@ NNVM_REGISTER_OP(_sg_onednn_fully_connected) }) .set_num_outputs([](const NodeAttrs& attrs) { auto const& full_param = nnvm::get(attrs.parsed); - return (full_param.dnnl_param.quantized && !full_param.dnnl_param.enable_float_output) ? 
3 - : 1; + return (full_param.dnnl_param.quantized && !full_param.dnnl_param.enable_float_output) ? 3 : + 1; }) .set_attr_parser(SgDNNLFCParamParser) .set_attr("FListInputNames", SgDNNLFCListInputNames) diff --git a/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc b/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc index 5db3bb01df8a..23131cb9792c 100644 --- a/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc +++ b/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc @@ -668,7 +668,7 @@ void ConvertConcatenate(GraphProto* graph_proto, const array_view& inputs) { NodeProto* node_proto = graph_proto->add_node(); node_proto->set_name(node_name); - const auto& _param = nnvm::get(attrs.parsed); + const auto& _param = nnvm::get(attrs.parsed); const int param_dim = _param.dim.has_value() ? _param.dim.value() : 0; node_proto->set_op_type("Concat"); node_proto->set_name(attrs.name); diff --git a/src/operator/subgraph/tensorrt/onnx_to_tensorrt.h b/src/operator/subgraph/tensorrt/onnx_to_tensorrt.h index c2b1dd215937..c145273076b2 100644 --- a/src/operator/subgraph/tensorrt/onnx_to_tensorrt.h +++ b/src/operator/subgraph/tensorrt/onnx_to_tensorrt.h @@ -73,11 +73,11 @@ class TRT_Logger : public nvinfer1::ILogger { time_t rawtime = std::time(0); char buf[256]; strftime(&buf[0], 256, "%Y-%m-%d %H:%M:%S", std::gmtime(&rawtime)); - const char* sevstr = (severity == Severity::kINTERNAL_ERROR ? " BUG" - : severity == Severity::kERROR ? " ERROR" - : severity == Severity::kWARNING ? "WARNING" - : severity == Severity::kINFO ? " INFO" - : "UNKNOWN"); + const char* sevstr = (severity == Severity::kINTERNAL_ERROR ? " BUG" : + severity == Severity::kERROR ? " ERROR" : + severity == Severity::kWARNING ? "WARNING" : + severity == Severity::kINFO ? " INFO" : + "UNKNOWN"); (*_ostream) << "[" << buf << " " << sevstr << "] " << msg << std::endl; } } diff --git a/src/operator/subgraph/tensorrt/tensorrt-inl.h b/src/operator/subgraph/tensorrt/tensorrt-inl.h index d142dc1ed358..ccfb150c838c 100644 --- a/src/operator/subgraph/tensorrt/tensorrt-inl.h +++ b/src/operator/subgraph/tensorrt/tensorrt-inl.h @@ -192,7 +192,7 @@ class TensorrtSelector : public SubgraphSelector { } if (op_name == "Concat") { - const auto& param = nnvm::get(n.attrs.parsed); + const auto& param = nnvm::get(n.attrs.parsed); const int param_dim = param.dim.has_value() ? 
param.dim.value() : 0; return (param_dim != 0); } diff --git a/src/operator/tensor/amp_cast.cc b/src/operator/tensor/amp_cast.cc index aee5f537d9bc..62e63a183e5a 100644 --- a/src/operator/tensor/amp_cast.cc +++ b/src/operator/tensor/amp_cast.cc @@ -46,9 +46,9 @@ static void AMPCastExCPU(const nnvm::NodeAttrs& attrs, dnnl::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); if (data.IsView() && data.IsDNNLData()) data = data.Reorder2Default(); - const auto i_mem = data.GetDNNLData(); - const size_t i_ndim = data.shape().ndim(); - dnnl::memory::dims i_dims = dnnl::memory::dims(i_ndim); + const auto i_mem = data.GetDNNLData(); + const size_t i_ndim = data.shape().ndim(); + dnnl::memory::dims i_dims = dnnl::memory::dims(i_ndim); for (size_t i = 0; i < i_ndim; i++) { i_dims[i] = static_cast(data.shape()[i]); } @@ -94,9 +94,9 @@ static void AMPMultiCastExCPU(const nnvm::NodeAttrs& attrs, auto data = inputs[i]; if (data.IsView() && data.IsDNNLData()) data = data.Reorder2Default(); - const auto i_mem = data.GetDNNLData(); - const size_t i_ndim = data.shape().ndim(); - dnnl::memory::dims i_dims = dnnl::memory::dims(i_ndim); + const auto i_mem = data.GetDNNLData(); + const size_t i_ndim = data.shape().ndim(); + dnnl::memory::dims i_dims = dnnl::memory::dims(i_ndim); for (size_t j = 0; j < i_ndim; j++) { i_dims[j] = static_cast(data.shape()[j]); } diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index 30f52f126166..77a81bcb646e 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -795,9 +795,9 @@ struct ReduceImplConfig { kernel_1.gridDim.x = std::min((unsigned int)kBaseGridNum, ceil_idiv(N, kernel_1.blockDim.x)); kernel_1.gridDim.y = std::min(kBaseGridNum, Mnext); - kernel_1.shMemSize = (kernel_1.blockDim.y > 1) - ? kernel_1.blockDim.x * kernel_1.blockDim.y * max_type_size * 2 - : 0; + kernel_1.shMemSize = (kernel_1.blockDim.y > 1) ? + kernel_1.blockDim.x * kernel_1.blockDim.y * max_type_size * 2 : + 0; // Maximum number of times we want TB to loop in M // Max size of M-block each TB can handle int maxMblock = kernel_1.blockDim.y * maxLoopPerTB; diff --git a/src/operator/tensor/dot-inl.h b/src/operator/tensor/dot-inl.h index 863ef28598ec..7cd9fa9988d8 100644 --- a/src/operator/tensor/dot-inl.h +++ b/src/operator/tensor/dot-inl.h @@ -251,8 +251,8 @@ inline bool DotForwardInferStorageType(const nnvm::NodeAttrs& attrs, bool rhs_rsp_or_dns = rhs_stype == kRowSparseStorage || rhs_stype == kDefaultStorage; bool hint_has_value = param.forward_stype.has_value(); NDArrayStorageType target_stype = - hint_has_value ? static_cast(param.forward_stype.value()) - : kUndefinedStorage; + hint_has_value ? static_cast(param.forward_stype.value()) : + kUndefinedStorage; if (!dispatched && lhs_stype == kDefaultStorage && rhs_stype == kDefaultStorage) { // dns, dns -> dns target_stype = hint_has_value ? target_stype : kDefaultStorage; @@ -1341,13 +1341,13 @@ inline bool DotShape(const nnvm::NodeAttrs& attrs, L[0] = mshadow::Shape1(lshape[0]); L[1] = lshape.ndim() > 1 ? mxnet::TShape(&lshape[1], lshape.end()) : mxnet::TShape(1, 1); } else { - L[0] = lshape.ndim() > 1 ? mxnet::TShape(&lshape[0], &lshape[lshape.ndim() - 1]) - : mxnet::TShape(1, 1); + L[0] = lshape.ndim() > 1 ? mxnet::TShape(&lshape[0], &lshape[lshape.ndim() - 1]) : + mxnet::TShape(1, 1); L[1] = mshadow::Shape1(lshape[lshape.ndim() - 1]); } if (Tb) { - R[0] = rshape.ndim() > 1 ? 
mxnet::TShape(&rshape[0], &rshape[rshape.ndim() - 1]) - : mxnet::TShape(1, 1); + R[0] = rshape.ndim() > 1 ? mxnet::TShape(&rshape[0], &rshape[rshape.ndim() - 1]) : + mxnet::TShape(1, 1); R[1] = mshadow::Shape1(rshape[rshape.ndim() - 1]); } else { R[0] = mshadow::Shape1(rshape[0]); diff --git a/src/operator/tensor/elemwise_binary_op-inl.h b/src/operator/tensor/elemwise_binary_op-inl.h index 9d8b43adb2af..b2d8394d71de 100644 --- a/src/operator/tensor/elemwise_binary_op-inl.h +++ b/src/operator/tensor/elemwise_binary_op-inl.h @@ -113,14 +113,14 @@ void ElemwiseBinaryOp::RspRspOp(mshadow::Stream* s, // Indices const Tensor indices_l = - lhs_is_dense ? Tensor() - : lhs.aux_data(rowsparse::kIdx).FlatTo1D(s); + lhs_is_dense ? Tensor() : + lhs.aux_data(rowsparse::kIdx).FlatTo1D(s); const Tensor indices_r = - rhs_is_dense ? Tensor() - : rhs.aux_data(rowsparse::kIdx).FlatTo1D(s); + rhs_is_dense ? Tensor() : + rhs.aux_data(rowsparse::kIdx).FlatTo1D(s); Tensor indices_out = - is_dense_result ? Tensor() - : output.aux_data(rowsparse::kIdx).FlatTo1D(s); + is_dense_result ? Tensor() : + output.aux_data(rowsparse::kIdx).FlatTo1D(s); // Data // TODO(cjolivier01): Change to get_with_shape() calls @@ -565,8 +565,8 @@ struct ElemwiseDnsCsrCsrKernel { for (int j = csr_indptr[i]; j < csr_indptr[i + 1]; ++j) { KERNEL_ASSIGN(out[j], req, - reverse ? OP::Map(dns_data[i * num_cols + csr_indices[j]], csr_data[j]) - : OP::Map(csr_data[j], dns_data[i * num_cols + csr_indices[j]])); + reverse ? OP::Map(dns_data[i * num_cols + csr_indices[j]], csr_data[j]) : + OP::Map(csr_data[j], dns_data[i * num_cols + csr_indices[j]])); } } } diff --git a/src/operator/tensor/elemwise_binary_scalar_op.h b/src/operator/tensor/elemwise_binary_scalar_op.h index 1fb241b24750..aa6b7f531f69 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.h +++ b/src/operator/tensor/elemwise_binary_scalar_op.h @@ -195,8 +195,8 @@ class BinaryScalarOp : public UnaryOp { // Split up into blocks of contiguous data and do those together const size_t row_item_start_iter = row_starts_ptr[i]; const size_t input_items_this_row = - !last_row ? static_cast(row_starts_ptr[i + 1]) - row_item_start_iter - : item_count - row_item_start_iter; + !last_row ? static_cast(row_starts_ptr[i + 1]) - row_item_start_iter : + item_count - row_item_start_iter; if (input_items_this_row) { const IType* this_row_column_indexes = column_indexes_ptr + row_item_start_iter; const DType* row_data_start = in + row_item_start_iter; diff --git a/src/operator/tensor/histogram.cc b/src/operator/tensor/histogram.cc index faa709c76e0d..d36e9e50faf0 100644 --- a/src/operator/tensor/histogram.cc +++ b/src/operator/tensor/histogram.cc @@ -161,9 +161,9 @@ Example:: [](const NodeAttrs& attrs) { const HistogramParam& params = nnvm::get(attrs.parsed); - return params.bin_cnt.has_value() - ? std::vector{"data"} - : std::vector{"data", "bins"}; + return params.bin_cnt.has_value() ? + std::vector{"data"} : + std::vector{"data", "bins"}; }) .set_attr("FResourceRequest", [](const NodeAttrs& attrs) { diff --git a/src/operator/tensor/la_op-inl.h b/src/operator/tensor/la_op-inl.h index 212d630bc016..49a3ff263ca8 100644 --- a/src/operator/tensor/la_op-inl.h +++ b/src/operator/tensor/la_op-inl.h @@ -674,10 +674,10 @@ struct gemm_backward { const nnvm::NodeAttrs& attrs) { const LaMatrixMacParam& param = nnvm::get(attrs.parsed); bool tA(param.transpose_a), tB(param.transpose_b); - (tA ? 
gemm::op(B, dD, dA, DType(param.alpha), DType(0), tB, true, s) - : gemm::op(dD, B, dA, DType(param.alpha), DType(0), false, !tB, s)); - (tB ? gemm::op(dD, A, dB, DType(param.alpha), DType(0), true, tA, s) - : gemm::op(A, dD, dB, DType(param.alpha), DType(0), !tA, false, s)); + (tA ? gemm::op(B, dD, dA, DType(param.alpha), DType(0), tB, true, s) : + gemm::op(dD, B, dA, DType(param.alpha), DType(0), false, !tB, s)); + (tB ? gemm::op(dD, A, dB, DType(param.alpha), DType(0), true, tA, s) : + gemm::op(A, dD, dB, DType(param.alpha), DType(0), !tA, false, s)); Copy(dC, dD, s); using namespace mxnet_op; Kernel::Launch(s, dC.MSize(), DType(param.beta), dC.dptr_); @@ -708,10 +708,10 @@ struct gemm2_backward { const nnvm::NodeAttrs& attrs) { const LaMatrixMultParam& param = nnvm::get(attrs.parsed); bool tA(param.transpose_a), tB(param.transpose_b); - (tA ? gemm::op(B, dC, dA, DType(param.alpha), DType(0), tB, true, s) - : gemm::op(dC, B, dA, DType(param.alpha), DType(0), false, !tB, s)); - (tB ? gemm::op(dC, A, dB, DType(param.alpha), DType(0), true, tA, s) - : gemm::op(A, dC, dB, DType(param.alpha), DType(0), !tA, false, s)); + (tA ? gemm::op(B, dC, dA, DType(param.alpha), DType(0), tB, true, s) : + gemm::op(dC, B, dA, DType(param.alpha), DType(0), false, !tB, s)); + (tB ? gemm::op(dC, A, dB, DType(param.alpha), DType(0), true, tA, s) : + gemm::op(A, dC, dB, DType(param.alpha), DType(0), !tA, false, s)); } template static void op(const Tensor& dC, @@ -824,8 +824,8 @@ struct trsm_backward { // Compute dA const bool da_left(param.rightside == param.transpose); DType scale(-1.0 / param.alpha); - (da_left ? gemm::op(dB, C, dA, scale, DType(0), param.transpose, !param.transpose, s) - : gemm::op(C, dB, dA, scale, DType(0), !param.transpose, param.transpose, s)); + (da_left ? gemm::op(dB, C, dA, scale, DType(0), param.transpose, !param.transpose, s) : + gemm::op(C, dB, dA, scale, DType(0), !param.transpose, param.transpose, s)); using namespace mxnet_op; Kernel::Launch( s, dA.MSize(), dA.size(1) * dA.stride_, dA.stride_, dA.dptr_, !param.lower); diff --git a/src/operator/tensor/la_op.h b/src/operator/tensor/la_op.h index dd993887e2c3..3d9eebdb0644 100644 --- a/src/operator/tensor/la_op.h +++ b/src/operator/tensor/la_op.h @@ -283,8 +283,8 @@ inline bool LaDiagTrianShape(const nnvm::NodeAttrs& attrs, if (ndim == 0) { return false; } - const int offset = (diag ? nnvm::get(attrs.parsed).offset - : nnvm::get(attrs.parsed).offset); + const int offset = (diag ? nnvm::get(attrs.parsed).offset : + nnvm::get(attrs.parsed).offset); std::vector oshape(extract ? ndim - 1 : ndim + 1); for (int i = 0; i < ndim - 1; ++i) { oshape[i] = (*in_attrs)[0][i]; @@ -710,8 +710,8 @@ void LaOpGemmForward(const nnvm::NodeAttrs& attrs, using namespace mshadow; CHECK_EQ(inputs.size(), inum); CHECK_EQ(outputs.size(), onum); - const int axis(inputs.size() == 2 ? nnvm::get(attrs.parsed).axis - : nnvm::get(attrs.parsed).axis); + const int axis(inputs.size() == 2 ? nnvm::get(attrs.parsed).axis : + nnvm::get(attrs.parsed).axis); MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, OType, { if (axis == -2 || axis == inputs[0].ndim() - 2) { LaOpCaller::op(inputs, outputs, attrs, ctx); @@ -732,8 +732,8 @@ void LaOpGemmBackward(const nnvm::NodeAttrs& attrs, Stream* s = ctx.get_stream(); CHECK_EQ(inputs.size(), inum); CHECK_EQ(outputs.size(), onum); - const int axis(inputs.size() == 3 ? nnvm::get(attrs.parsed).axis - : nnvm::get(attrs.parsed).axis); + const int axis(inputs.size() == 3 ? 
nnvm::get(attrs.parsed).axis : + nnvm::get(attrs.parsed).axis); MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, OType, { std::vector tspace(outputs); for (int i = 0; i < onum; ++i) { diff --git a/src/operator/tensor/matrix_op.cu b/src/operator/tensor/matrix_op.cu index fd8306a96edd..b5bd1c96d25b 100644 --- a/src/operator/tensor/matrix_op.cu +++ b/src/operator/tensor/matrix_op.cu @@ -187,8 +187,8 @@ __global__ void split_tensor_kernel(size_t input_size, LType* out_aligned = reinterpret_cast(params.outputs[section]); size_t section_size_aligned = entries_per_load > 0 ? section_size / entries_per_load : section_size; - size_t index_aligned = entries_per_load > 0 ? params.indices[section] / entries_per_load - : params.indices[section]; + size_t index_aligned = entries_per_load > 0 ? params.indices[section] / entries_per_load : + params.indices[section]; size_t output_offset_leading = (blockIdx.x / blocks_last_axis) * section_size_aligned; size_t output_position = output_offset_leading + position_last_axis_aligned - index_aligned; out_aligned[output_position] = input_data; @@ -330,9 +330,9 @@ inline void SplitOpForwardGPU(const nnvm::NodeAttrs& attrs, if (splitting_last_axis) { // may not be possible to include whole axis if too many sections last_axis_elements = - entries_per_load > 0 - ? ((params.indices[params.num_sections] - params.indices[0]) / entries_per_load) - : 0; + entries_per_load > 0 ? + ((params.indices[params.num_sections] - params.indices[0]) / entries_per_load) : + 0; } while (block_size < last_axis_elements && (block_size < max_threads_block)) { block_size += 32; diff --git a/src/operator/tensor/reduce_rtc.cc b/src/operator/tensor/reduce_rtc.cc index 5b6d89ebf774..bfa5d0a50e28 100644 --- a/src/operator/tensor/reduce_rtc.cc +++ b/src/operator/tensor/reduce_rtc.cc @@ -362,9 +362,9 @@ void RTCReduceImpl(Stream* s, args.emplace_back(¶m); args.emplace_back(&config.Mnext); - const auto& function_code = (lhs == nullptr) - ? (use_index ? reduce_function_index_code : reduce_function_code) - : reduce_function_use_input_code; + const auto& function_code = (lhs == nullptr) ? + (use_index ? reduce_function_index_code : reduce_function_code) : + reduce_function_use_input_code; const auto& kernel_name = (config.Mnext > 1) ? "reduce_kernel_multi" : "reduce_kernel_single"; auto reduce_kernel_func = get_function(code + function_code, kernel_name, reduce_kernel_code, dev_id); @@ -497,9 +497,9 @@ void RTCReduceM1Impl(Stream* s, args.emplace_back(&small.dptr_); args.emplace_back(¶m); - const auto& function_code = (lhs == nullptr) - ? (use_index ? reduce_function_index_code : reduce_function_code) - : reduce_function_use_input_code; + const auto& function_code = (lhs == nullptr) ? + (use_index ? 
reduce_function_index_code : reduce_function_code) : + reduce_function_use_input_code; auto reduce_kernel_M1_func = get_function(code + function_code, "reduce_kernel_M1", reduce_kernel_M1_code, dev_id); launch(reduce_kernel_M1_func, config.kernel_1.gridDim, config.kernel_1.blockDim, 0, s, &args); diff --git a/src/operator/tensor/square_sum.cc b/src/operator/tensor/square_sum.cc index 0ce48c6843f5..05917b6c1382 100644 --- a/src/operator/tensor/square_sum.cc +++ b/src/operator/tensor/square_sum.cc @@ -27,7 +27,7 @@ namespace mxnet { namespace op { template <> -void CheckSameIdx(const OpContext& ctx, const TBlob& ograd_row_idx, const TBlob& in_row_idx) { +void CheckSameIdx(const OpContext& ctx, const TBlob& ograd_row_idx, const TBlob& in_row_idx) { MSHADOW_IDX_TYPE_SWITCH(ograd_row_idx.type_flag_, IType, { diff --git a/src/operator/tensor/square_sum.cu b/src/operator/tensor/square_sum.cu index 92042e54206e..d41f0aa02918 100644 --- a/src/operator/tensor/square_sum.cu +++ b/src/operator/tensor/square_sum.cu @@ -27,7 +27,7 @@ namespace mxnet { namespace op { template <> -void CheckSameIdx(const OpContext& ctx, const TBlob& ograd_row_idx, const TBlob& in_row_idx) { +void CheckSameIdx(const OpContext& ctx, const TBlob& ograd_row_idx, const TBlob& in_row_idx) { MSHADOW_IDX_TYPE_SWITCH(ograd_row_idx.type_flag_, IType, { diff --git a/src/profiler/aggregate_stats.cc b/src/profiler/aggregate_stats.cc index 2cb8759e095a..78ae4d35ffb2 100644 --- a/src/profiler/aggregate_stats.cc +++ b/src/profiler/aggregate_stats.cc @@ -131,9 +131,9 @@ void AggregateStats::DumpTable(std::ostream& os, int sort_by, int ascending) { << " " << std::fixed << std::setw(16) << std::setprecision(4) << std::right << (is_memory ? ByteToKilobyte(data.max_aggregate_) : MicroToMilli(data.max_aggregate_)) << " " << std::fixed << std::setw(16) << std::setprecision(4) << std::right - << (data.type_ == AggregateStats::StatData::kCounter - ? ByteToKilobyte((data.max_aggregate_ - data.min_aggregate_) / 2) - : MicroToMilli(static_cast(data.total_aggregate_) / data.total_count_)); + << (data.type_ == AggregateStats::StatData::kCounter ? + ByteToKilobyte((data.max_aggregate_ - data.min_aggregate_) / 2) : + MicroToMilli(static_cast(data.total_aggregate_) / data.total_count_)); os << std::endl; } heap.pop(); @@ -181,9 +181,9 @@ void AggregateStats::DumpJson(std::ostream& os, int sort_by, int ascending) { << (is_memory ? ByteToKilobyte(data.max_aggregate_) : MicroToMilli(data.max_aggregate_)) << "," << std::endl << " \"Avg\": " << std::setprecision(4) - << (data.type_ == AggregateStats::StatData::kCounter - ? ByteToKilobyte((data.max_aggregate_ - data.min_aggregate_) / 2) - : MicroToMilli(static_cast(data.total_aggregate_) / data.total_count_)) + << (data.type_ == AggregateStats::StatData::kCounter ? + ByteToKilobyte((data.max_aggregate_ - data.min_aggregate_) / 2) : + MicroToMilli(static_cast(data.total_aggregate_) / data.total_count_)) << std::endl << " }" << std::endl; } diff --git a/src/runtime/container.cc b/src/runtime/container.cc index 50a284af56f7..2197c10abb3e 100644 --- a/src/runtime/container.cc +++ b/src/runtime/container.cc @@ -93,8 +93,8 @@ MXNET_REGISTER_GLOBAL("container._MapGetItem").set_body([](MXNetArgs args, MXNet CHECK(ptr->IsInstance()); auto* n = static_cast(ptr); - auto it = n->find(String::CanConvertFrom(args[1]) ? args[1].operator String() - : args[1].operator ObjectRef()); + auto it = n->find(String::CanConvertFrom(args[1]) ?
args[1].operator String() : + args[1].operator ObjectRef()); CHECK(it != n->end()) << "cannot find the corresponding key in the Map"; *rv = (*it).second; }); diff --git a/src/serialization/cnpy.cc b/src/serialization/cnpy.cc index 0534b3ae7459..bcd525c5e351 100644 --- a/src/serialization/cnpy.cc +++ b/src/serialization/cnpy.cc @@ -743,8 +743,8 @@ std::pair, std::vector> load_arrays( arrays.push_back(array); return_names.emplace_back(dirname.size() ? // Exclude "/" - dirname.substr(0, dirname.size() - 1) - : dirname); + dirname.substr(0, dirname.size() - 1) : + dirname); } else { throw std::runtime_error("Loading " + format + " sparse matrix format is unsupported."); @@ -881,8 +881,8 @@ std::pair, std::vector> load_arrays( arrays.push_back(array); return_names.emplace_back(dirname.size() ? // Exclude "/" - dirname.substr(0, dirname.size() - 1) - : dirname); + dirname.substr(0, dirname.size() - 1) : + dirname); } else { throw std::runtime_error("Loading " + format + " sparse matrix format is unsupported."); diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h index f6e60c56fbf8..9d1c3900ace1 100644 --- a/src/storage/pooled_storage_manager.h +++ b/src/storage/pooled_storage_manager.h @@ -207,7 +207,7 @@ void PooledStorageManager::Alloc(Storage::Hand #if MXNET_USE_CUDA dev_type_ == Context::kGPU ? cudaGetErrorString(static_cast(e)) : #endif - std::strerror(errno)); + std::strerror(errno)); LOG(FATAL) << "Memory allocation failed " << err; } From 2b58452402a6ad0f8cf9734c7b10673ccdf883f3 Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Thu, 4 Nov 2021 09:01:02 +0100 Subject: [PATCH 02/10] [TEST] Re-format .cc .h files --- tests/cpp/engine/engine_shutdown_test.cc | 17 +- tests/cpp/engine/omp_test.cc | 35 +- tests/cpp/engine/thread_local_test.cc | 64 +- tests/cpp/engine/threaded_engine_test.cc | 272 +++--- tests/cpp/include/test_core_op.h | 192 ++-- tests/cpp/include/test_legacy_op.h | 245 ++--- tests/cpp/include/test_ndarray_utils.h | 115 ++- tests/cpp/include/test_op.h | 90 +- tests/cpp/include/test_op_runner.h | 143 ++- tests/cpp/include/test_perf.h | 120 ++- tests/cpp/include/test_tune.h | 122 +-- tests/cpp/include/test_util.h | 250 +++-- tests/cpp/kvstore/gpu_topology_test.cc | 278 +++--- tests/cpp/misc/base.cc | 30 +- tests/cpp/operator/activation_perf.cc | 69 +- tests/cpp/operator/batchnorm_test.cc | 874 +++++++++--------- tests/cpp/operator/coreop_perf.cc | 61 +- tests/cpp/operator/dnnl_operator_test.cc | 4 +- tests/cpp/operator/dropout_perf.cc | 58 +- tests/cpp/operator/fully_conn_perf.cc | 62 +- tests/cpp/operator/krprod_test.cc | 115 ++- .../operator/runner/core_op_runner_test.cc | 196 ++-- tests/cpp/operator/slice_channel_perf.cc | 52 +- tests/cpp/operator/tune/operator_tune_test.cc | 66 +- tests/cpp/storage/storage_test.cc | 26 +- tests/cpp/test_main.cc | 23 +- 26 files changed, 1719 insertions(+), 1860 deletions(-) diff --git a/tests/cpp/engine/engine_shutdown_test.cc b/tests/cpp/engine/engine_shutdown_test.cc index 893d08502c3a..98830796e2cf 100644 --- a/tests/cpp/engine/engine_shutdown_test.cc +++ b/tests/cpp/engine/engine_shutdown_test.cc @@ -21,7 +21,7 @@ * Copyright (c) 2019 by Contributors * \file engine_shutdown_test.cc * \brief Tests engine shutdown for possible crashes -*/ + */ #include #include "../src/engine/engine_impl.h" @@ -29,13 +29,14 @@ /** * This test will help ensure we don't crash during engine shutdown. - * The crash happens during a static destructor call, so this test may pass and then cause a test-run process crash. 
+ * The crash happens during a static destructor call, so this test may pass and then cause a + * test-run process crash. */ TEST(EngineShutdown, stop_without_crashing) { - static std::unique_ptr ndArray; - { - auto engine = mxnet::Engine::_GetSharedRef(); - ndArray = std::make_unique(mxnet::Context::CPU()); - engine->Stop(); - } + static std::unique_ptr ndArray; + { + auto engine = mxnet::Engine::_GetSharedRef(); + ndArray = std::make_unique(mxnet::Context::CPU()); + engine->Stop(); + } } diff --git a/tests/cpp/engine/omp_test.cc b/tests/cpp/engine/omp_test.cc index f4ef421a8595..c6cb5c0470c6 100644 --- a/tests/cpp/engine/omp_test.cc +++ b/tests/cpp/engine/omp_test.cc @@ -28,24 +28,23 @@ #include #include - TEST(OMPBehaviour, after_fork) { - /* - * Check that after fork, OMP is disabled, and the recommended thread count is 1 to prevent - * process fanout. - */ - using namespace mxnet::engine; - auto openmp = OpenMP::Get(); - pid_t pid = fork(); - if (pid == 0) { - EXPECT_FALSE(openmp->enabled()); - EXPECT_EQ(openmp->GetRecommendedOMPThreadCount(), 1); - } else if (pid > 0) { - int status; - int ret = waitpid(pid, &status, 0); - CHECK_EQ(ret, pid) << "waitpid failed"; - } else { - CHECK(false) << "fork failed"; - } + /* + * Check that after fork, OMP is disabled, and the recommended thread count is 1 to prevent + * process fanout. + */ + using namespace mxnet::engine; + auto openmp = OpenMP::Get(); + pid_t pid = fork(); + if (pid == 0) { + EXPECT_FALSE(openmp->enabled()); + EXPECT_EQ(openmp->GetRecommendedOMPThreadCount(), 1); + } else if (pid > 0) { + int status; + int ret = waitpid(pid, &status, 0); + CHECK_EQ(ret, pid) << "waitpid failed"; + } else { + CHECK(false) << "fork failed"; + } } #endif diff --git a/tests/cpp/engine/thread_local_test.cc b/tests/cpp/engine/thread_local_test.cc index 6801b377ef83..a30577ef263d 100644 --- a/tests/cpp/engine/thread_local_test.cc +++ b/tests/cpp/engine/thread_local_test.cc @@ -21,7 +21,7 @@ * Copyright (c) 2019 by Contributors * \file engine_thread_local_test.cc * \brief Tests thread safety and lifetime of thread local store -*/ + */ #include #include #include @@ -37,44 +37,42 @@ #include struct A { - std::vector a; + std::vector a; }; -int num_threads = 10; +int num_threads = 10; int num_elements = num_threads * 10; static int ThreadSafetyTest(int num, std::vector* tmp_inputs, std::vector* res) { - A *ret = dmlc::ThreadLocalStore::Get(); - for (size_t i = num * 10; i < num * 10 + 10; ++i) { - (*tmp_inputs)[i] = i; - } - ret->a.clear(); - ret->a.reserve(10); - for (size_t i = num * 10; i < num * 10 + 10; ++i) { - ret->a.push_back((*tmp_inputs)[i]); - } - (*res)[num] = dmlc::BeginPtr(ret->a); - return 0; + A* ret = dmlc::ThreadLocalStore::Get(); + for (size_t i = num * 10; i < num * 10 + 10; ++i) { + (*tmp_inputs)[i] = i; + } + ret->a.clear(); + ret->a.reserve(10); + for (size_t i = num * 10; i < num * 10 + 10; ++i) { + ret->a.push_back((*tmp_inputs)[i]); + } + (*res)[num] = dmlc::BeginPtr(ret->a); + return 0; } TEST(ThreadLocal, VerifyThreadSafety) { - std::vector tmp_inputs; - tmp_inputs.resize(num_elements); - std::vector outputs; - outputs.resize(num_threads); - auto func = [&](int num) { - ThreadSafetyTest(num, &tmp_inputs, &outputs); - }; - std::vector worker_threads(num_threads); - int count = 0; - for (auto&& i : worker_threads) { - i = std::thread(func, count); - count++; - } - for (auto&& i : worker_threads) { - i.join(); - } + std::vector tmp_inputs; + tmp_inputs.resize(num_elements); + std::vector outputs; + outputs.resize(num_threads); + 
auto func = [&](int num) { ThreadSafetyTest(num, &tmp_inputs, &outputs); }; + std::vector worker_threads(num_threads); + int count = 0; + for (auto&& i : worker_threads) { + i = std::thread(func, count); + count++; + } + for (auto&& i : worker_threads) { + i.join(); + } - for (size_t i = 0; i < num_elements; i++) { - CHECK(outputs[i/10][i%10] == i); - } + for (size_t i = 0; i < num_elements; i++) { + CHECK(outputs[i / 10][i % 10] == i); + } } diff --git a/tests/cpp/engine/threaded_engine_test.cc b/tests/cpp/engine/threaded_engine_test.cc index 465e387b8d42..5b86781d6838 100644 --- a/tests/cpp/engine/threaded_engine_test.cc +++ b/tests/cpp/engine/threaded_engine_test.cc @@ -21,7 +21,7 @@ * Copyright (c) 2017 by Contributors * \file threaded_engine_test.cc * \brief threaded engine tests -*/ + */ #include #include #include @@ -57,9 +57,12 @@ static uint32_t seed_ = 0xdeadbeef; /** * generate a list of workloads */ -void GenerateWorkload(int num_workloads, int num_var, - int min_read, int max_read, - int min_time, int max_time, +void GenerateWorkload(int num_workloads, + int num_var, + int min_read, + int max_read, + int min_time, + int max_time, std::vector* workloads) { workloads->clear(); workloads->resize(num_workloads); @@ -68,8 +71,8 @@ void GenerateWorkload(int num_workloads, int num_var, std::uniform_int_distribution distribution_time(min_time, max_time - 1); std::uniform_int_distribution distribution_read(min_read, max_read - 1); for (int i = 0; i < num_workloads; ++i) { - auto& wl = workloads->at(i); - wl.write = distribution_var(generator); + auto& wl = workloads->at(i); + wl.write = distribution_var(generator); int num_read = distribution_read(generator); for (int j = 0; j < num_read; ++j) { wl.reads.push_back(distribution_var(generator)); @@ -83,7 +86,8 @@ void GenerateWorkload(int num_workloads, int num_var, */ void EvaluateWorkload(const Workload& wl, std::vector* data) { double tmp = 0; - for (int i : wl.reads) tmp += data->at(i); + for (int i : wl.reads) + tmp += data->at(i); data->at(wl.write) = tmp / (wl.reads.size() + 1); if (wl.time > 0) { std::this_thread::sleep_for(std::chrono::microseconds(wl.time)); @@ -106,7 +110,8 @@ double EvaluateWorkloads(const std::vector& workloads, } for (const auto& wl : workloads) { - if (wl.reads.size() == 0) continue; + if (wl.reads.size() == 0) + continue; if (engine == nullptr) { EvaluateWorkload(wl, data); } else { @@ -119,7 +124,8 @@ double EvaluateWorkloads(const std::vector& workloads, }; std::vector reads; for (auto i : wl.reads) { - if (i != wl.write) reads.push_back(vars[i]); + if (i != wl.write) + reads.push_back(vars[i]); } engine->PushAsync(func, Context::CPU(), reads, {vars[wl.write]}); } @@ -134,9 +140,9 @@ double EvaluateWorkloads(const std::vector& workloads, TEST(Engine, start_stop) { const int num_engine = 3; std::vector engine(num_engine); - engine[0] = mxnet::engine::CreateNaiveEngine(); - engine[1] = mxnet::engine::CreateThreadedEnginePooled(); - engine[2] = mxnet::engine::CreateThreadedEnginePerDevice(); + engine[0] = mxnet::engine::CreateNaiveEngine(); + engine[1] = mxnet::engine::CreateThreadedEnginePooled(); + engine[2] = mxnet::engine::CreateThreadedEnginePerDevice(); std::string type_names[3] = {"NaiveEngine", "ThreadedEnginePooled", "ThreadedEnginePerDevice"}; for (int i = 0; i < num_engine; ++i) { @@ -150,7 +156,7 @@ TEST(Engine, start_stop) { TEST(Engine, RandSumExpr) { std::vector workloads; - int num_repeat = 5; + int num_repeat = 5; const int num_engine = 4; std::vector t(num_engine, 0.0); @@ -172,19 +178,21 
@@ TEST(Engine, RandSumExpr) { } for (int i = 1; i < num_engine; ++i) { - for (int j = 0; j < num_var; ++j) EXPECT_EQ(data[0][j], data[i][j]); + for (int j = 0; j < num_var; ++j) + EXPECT_EQ(data[0][j], data[i][j]); } LOG(INFO) << "data: " << data[0][1] << " " << data[0][2] << "..."; } - - LOG(INFO) << "baseline\t\t" << t[0] << " sec"; - LOG(INFO) << "NaiveEngine\t\t" << t[1] << " sec"; + LOG(INFO) << "baseline\t\t" << t[0] << " sec"; + LOG(INFO) << "NaiveEngine\t\t" << t[1] << " sec"; LOG(INFO) << "ThreadedEnginePooled\t" << t[2] << " sec"; LOG(INFO) << "ThreadedEnginePerDevice\t" << t[3] << " sec"; } -void Foo(mxnet::RunContext, int i) { printf("The fox says %d\n", i); } +void Foo(mxnet::RunContext, int i) { + printf("The fox says %d\n", i); +} void FooAsyncFunc(void*, void*, void* cb_ptr, void* param) { if (param == nullptr) { @@ -222,7 +230,7 @@ TEST(Engine, PushFunc) { // Test #1 LOG(INFO) << "===== Test #1: PushAsync param and deleter ====="; - int* a = new int(100); + int* a = new int(100); int res = MXEnginePushAsync(FooAsyncFunc, a, FooFuncDeleter, &ctx, &var, 1, nullptr, 0); EXPECT_EQ(res, 0); @@ -244,7 +252,7 @@ TEST(Engine, PushFunc) { // Test #5 LOG(INFO) << "===== Test #5: PushSync param and deleter ====="; int* b = new int(101); - res = MXEnginePushSync(FooSyncFunc, b, FooFuncDeleter, &ctx, &var, 1, nullptr, 0); + res = MXEnginePushSync(FooSyncFunc, b, FooFuncDeleter, &ctx, &var, 1, nullptr, 0); EXPECT_EQ(res, 0); // Test #6 @@ -268,82 +276,121 @@ TEST(Engine, PushFuncND) { std::vector nds; const int num_nds = 5; for (int i = 0; i < num_nds; ++i) { - mxnet::NDArray *pnd = new mxnet::NDArray(ctx); - nds.push_back(pnd); + mxnet::NDArray* pnd = new mxnet::NDArray(ctx); + nds.push_back(pnd); } for (int num_const_nds = 0; num_const_nds <= num_nds; ++num_const_nds) { - int num_mutable_nds = num_nds - num_const_nds; - void** const_nds_handle = num_const_nds > 0 ? - reinterpret_cast(nds.data()) : nullptr; - void** mutable_nds_handle = num_mutable_nds > 0 ? 
- reinterpret_cast(nds.data() + num_const_nds) : nullptr; - - // Test #1 - LOG(INFO) << "===== Test #1: PushAsyncND param and deleter ====="; - int* a = new int(100); - int res = MXEnginePushAsyncND(FooAsyncFunc, a, FooFuncDeleter, &ctx, - const_nds_handle, num_const_nds, - mutable_nds_handle, num_mutable_nds); - EXPECT_EQ(res, 0); - - // Test #2 - LOG(INFO) << "===== Test #2: PushAsyncND NULL param and NULL deleter ====="; - res = MXEnginePushAsyncND(FooAsyncFunc, nullptr, nullptr, &ctx, - const_nds_handle, num_const_nds, - mutable_nds_handle, num_mutable_nds); - EXPECT_EQ(res, 0); - - // Test #3 - LOG(INFO) << "===== Test #3: PushAsyncND invalid number of const nds ====="; - res = MXEnginePushAsyncND(FooAsyncFunc, nullptr, nullptr, &ctx, - const_nds_handle, -1, - mutable_nds_handle, num_mutable_nds); - EXPECT_EQ(res, -1); - - // Test #4 - LOG(INFO) << "===== Test #4: PushAsyncND invalid number of mutable nds ====="; - res = MXEnginePushAsyncND(FooAsyncFunc, nullptr, nullptr, &ctx, - const_nds_handle, num_const_nds, - mutable_nds_handle, -1); - EXPECT_EQ(res, -1); - - // Test #5 - LOG(INFO) << "===== Test #5: PushSyncND param and deleter ====="; - int* b = new int(101); - res = MXEnginePushSyncND(FooSyncFunc, b, FooFuncDeleter, &ctx, - const_nds_handle, num_const_nds, - mutable_nds_handle, num_mutable_nds); - EXPECT_EQ(res, 0); - - // Test #6 - LOG(INFO) << "===== Test #6: PushSyncND NULL param and NULL deleter ====="; - res = MXEnginePushSyncND(FooSyncFunc, nullptr, nullptr, &ctx, - const_nds_handle, num_const_nds, - mutable_nds_handle, num_mutable_nds); - EXPECT_EQ(res, 0); - - // Test #7 - LOG(INFO) << "===== Test #7: PushSyncND invalid number of const nds ====="; - res = MXEnginePushSyncND(FooSyncFunc, nullptr, nullptr, &ctx, - const_nds_handle, -1, - mutable_nds_handle, num_mutable_nds); - EXPECT_EQ(res, -1); - - // Test #8 - LOG(INFO) << "===== Test #8: PushSyncND invalid number of mutable nds ====="; - res = MXEnginePushSyncND(FooSyncFunc, nullptr, nullptr, &ctx, - const_nds_handle, num_const_nds, - mutable_nds_handle, -1); - EXPECT_EQ(res, -1); + int num_mutable_nds = num_nds - num_const_nds; + void** const_nds_handle = num_const_nds > 0 ? reinterpret_cast(nds.data()) : nullptr; + void** mutable_nds_handle = + num_mutable_nds > 0 ? 
reinterpret_cast(nds.data() + num_const_nds) : nullptr; + + // Test #1 + LOG(INFO) << "===== Test #1: PushAsyncND param and deleter ====="; + int* a = new int(100); + int res = MXEnginePushAsyncND(FooAsyncFunc, + a, + FooFuncDeleter, + &ctx, + const_nds_handle, + num_const_nds, + mutable_nds_handle, + num_mutable_nds); + EXPECT_EQ(res, 0); + + // Test #2 + LOG(INFO) << "===== Test #2: PushAsyncND NULL param and NULL deleter ====="; + res = MXEnginePushAsyncND(FooAsyncFunc, + nullptr, + nullptr, + &ctx, + const_nds_handle, + num_const_nds, + mutable_nds_handle, + num_mutable_nds); + EXPECT_EQ(res, 0); + + // Test #3 + LOG(INFO) << "===== Test #3: PushAsyncND invalid number of const nds ====="; + res = MXEnginePushAsyncND(FooAsyncFunc, + nullptr, + nullptr, + &ctx, + const_nds_handle, + -1, + mutable_nds_handle, + num_mutable_nds); + EXPECT_EQ(res, -1); + + // Test #4 + LOG(INFO) << "===== Test #4: PushAsyncND invalid number of mutable nds ====="; + res = MXEnginePushAsyncND(FooAsyncFunc, + nullptr, + nullptr, + &ctx, + const_nds_handle, + num_const_nds, + mutable_nds_handle, + -1); + EXPECT_EQ(res, -1); + + // Test #5 + LOG(INFO) << "===== Test #5: PushSyncND param and deleter ====="; + int* b = new int(101); + res = MXEnginePushSyncND(FooSyncFunc, + b, + FooFuncDeleter, + &ctx, + const_nds_handle, + num_const_nds, + mutable_nds_handle, + num_mutable_nds); + EXPECT_EQ(res, 0); + + // Test #6 + LOG(INFO) << "===== Test #6: PushSyncND NULL param and NULL deleter ====="; + res = MXEnginePushSyncND(FooSyncFunc, + nullptr, + nullptr, + &ctx, + const_nds_handle, + num_const_nds, + mutable_nds_handle, + num_mutable_nds); + EXPECT_EQ(res, 0); + + // Test #7 + LOG(INFO) << "===== Test #7: PushSyncND invalid number of const nds ====="; + res = MXEnginePushSyncND(FooSyncFunc, + nullptr, + nullptr, + &ctx, + const_nds_handle, + -1, + mutable_nds_handle, + num_mutable_nds); + EXPECT_EQ(res, -1); + + // Test #8 + LOG(INFO) << "===== Test #8: PushSyncND invalid number of mutable nds ====="; + res = MXEnginePushSyncND(FooSyncFunc, + nullptr, + nullptr, + &ctx, + const_nds_handle, + num_const_nds, + mutable_nds_handle, + -1); + EXPECT_EQ(res, -1); } for (mxnet::NDArray* pnd : nds) { - delete pnd; + delete pnd; } } TEST(Engine, basics) { auto&& engine = mxnet::Engine::Get(); - auto&& var = engine->NewVariable(); + auto&& var = engine->NewVariable(); std::vector oprs; // Test #1 @@ -460,9 +507,9 @@ TEST(Engine, basics) { TEST(Engine, VarVersion) { const size_t num_engines = 3; std::vector engines(num_engines); - engines[0] = mxnet::engine::CreateNaiveEngine(); - engines[1] = mxnet::engine::CreateThreadedEnginePooled(); - engines[2] = mxnet::engine::CreateThreadedEnginePerDevice(); + engines[0] = mxnet::engine::CreateNaiveEngine(); + engines[1] = mxnet::engine::CreateThreadedEnginePooled(); + engines[2] = mxnet::engine::CreateThreadedEnginePerDevice(); std::string type_names[3] = {"NaiveEngine", "ThreadedEnginePooled", "ThreadedEnginePerDevice"}; for (size_t k = 0; k < num_engines; ++k) { auto engine = engines[k]; @@ -534,7 +581,7 @@ struct TestSaveAndRestoreOMPState { omp_set_dynamic(dynamic_); } const int nthreads_ = omp_get_max_threads(); - const int dynamic_ = omp_get_dynamic(); + const int dynamic_ = omp_get_dynamic(); }; /*! 
@@ -542,8 +589,8 @@ struct TestSaveAndRestoreOMPState { */ TEST(Engine, omp_threading_count_scope) { TestSaveAndRestoreOMPState omp_state; - const int THREAD_COUNT = 10; - std::shared_ptr ready = std::make_shared(); + const int THREAD_COUNT = 10; + std::shared_ptr ready = std::make_shared(); std::shared_ptr threads = std::make_shared(); std::atomic counter(0), correct(0); omp_set_dynamic(0); @@ -551,24 +598,27 @@ TEST(Engine, omp_threading_count_scope) { std::string name = "thread: "; name += std::to_string(x + 1); ++counter; - threads->create(name, false, - [x, &counter, &correct](std::shared_ptr ready_ptr) -> int { - const int thread_count = x + 1; - omp_set_num_threads(thread_count); - --counter; - ready_ptr->wait(); - CHECK_EQ(omp_get_max_threads(), thread_count); - #pragma omp parallel for - for (int i = 0; i < 100; ++i) { - if (i == 50) { - const int current_threads = omp_get_num_threads(); - if (current_threads == thread_count) { - ++correct; - } - } - } - return 0; - }, ready); + threads->create( + name, + false, + [x, &counter, &correct](std::shared_ptr ready_ptr) -> int { + const int thread_count = x + 1; + omp_set_num_threads(thread_count); + --counter; + ready_ptr->wait(); + CHECK_EQ(omp_get_max_threads(), thread_count); +#pragma omp parallel for + for (int i = 0; i < 100; ++i) { + if (i == 50) { + const int current_threads = omp_get_num_threads(); + if (current_threads == thread_count) { + ++correct; + } + } + } + return 0; + }, + ready); } while (counter.load() > 0) { usleep(100); diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h index ecbfcd5d7d3a..0ff089cc5666 100644 --- a/tests/cpp/include/test_core_op.h +++ b/tests/cpp/include/test_core_op.h @@ -34,16 +34,13 @@ namespace test { namespace op { // Tried making this a struct w/constexpr, but getting undefined reference on gcc 5.4.1 -#define COREOP_FWD_OP_NAME_KEY "fwd_op_name" -#define COREOP_BWD_OP_NAME_KEY "bwd_op_name" -#define COREOP_BWD_OP_NAME_VALUE_NONE "[none]" +#define COREOP_FWD_OP_NAME_KEY "fwd_op_name" +#define COREOP_BWD_OP_NAME_KEY "bwd_op_name" +#define COREOP_BWD_OP_NAME_VALUE_NONE "[none]" -enum TimingDirection { - kForward, - kBackward -}; +enum TimingDirection { kForward, kBackward }; -inline const char *TimingDirectionAsString(const TimingDirection td) { +inline const char* TimingDirectionAsString(const TimingDirection td) { switch (td) { case kForward: return "Forward"; @@ -59,9 +56,9 @@ inline const char *TimingDirectionAsString(const TimingDirection td) { * Low-noise operator executor * @tparam DType Data type for the operator executions */ -template -class CoreOpExecutor : public test::op::OperatorDataInitializer - , public test::op::OperatorExecutorTiming { +template +class CoreOpExecutor : public test::op::OperatorDataInitializer, + public test::op::OperatorExecutorTiming { /*! \brief Performance timing categories */ /*! 
* \brief Parse additional arguments into NodeAttrs structure @@ -69,13 +66,13 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer * \param args vector of string pairs representing argument key/value pairs * \return Constructed NodeAttrs structure */ - static nnvm::NodeAttrs ParseAttrs(const nnvm::Op *op, const kwargs_t& args) { + static nnvm::NodeAttrs ParseAttrs(const nnvm::Op* op, const kwargs_t& args) { const size_t count = args.size(); - std::vector keys, values; + std::vector keys, values; keys.reserve(count); values.reserve(count); - for (kwargs_t::const_iterator i_iter = args.begin(), e_iter = args.end(); - i_iter != e_iter; ++i_iter) { + for (kwargs_t::const_iterator i_iter = args.begin(), e_iter = args.end(); i_iter != e_iter; + ++i_iter) { keys.emplace_back(i_iter->first.c_str()); values.emplace_back(i_iter->second.c_str()); } @@ -89,7 +86,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer * \return Reference to the supplied vector of TBlob results */ static inline std::vector& CollectBlobs(const std::vector& src, - std::vector *dest) { + std::vector* dest) { dest->resize(0); dest->reserve(dest->size() + src.size()); for (size_t i = 0, n = src.size(); i < n; ++i) { @@ -128,7 +125,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer nnvm::ObjectPtr MakeNode() const { nnvm::ObjectPtr node = nnvm::Node::Create(); - node->attrs = attrs_; + node->attrs = attrs_; return node; } @@ -138,7 +135,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer */ std::vector, std::string>> GetBackward() { std::vector, std::string>> res; - static auto gradient = nnvm::Op::GetAttr("FGradient"); + static auto gradient = nnvm::Op::GetAttr("FGradient"); nnvm::FGradient grad_fun = gradient.get(op_, nullptr); if (grad_fun) { auto n = MakeNode(); @@ -154,8 +151,8 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer std::cout << node_entry.node->op()->name << std::endl; } std::shared_ptr pOp = std::make_shared( - ctx().run_ctx.ctx.dev_type == Context::kGPU, ShapesOf(outputs())); - res.push_back({ pOp, node_entry.node->op()->name }); + ctx().run_ctx.ctx.dev_type == Context::kGPU, ShapesOf(outputs())); + res.push_back({pOp, node_entry.node->op()->name}); } } return res; @@ -167,10 +164,10 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer * \param attrs NodeAttrs structure (node attributes) * \param op Pointer to nnvm Operator object */ - void AttachResources(OpContext *ctx, const nnvm::NodeAttrs& attrs, const nnvm::Op *op) { + void AttachResources(OpContext* ctx, const nnvm::NodeAttrs& attrs, const nnvm::Op* op) { std::vector reqs; std::vector& requested = ctx->requested; - static auto& fresource = nnvm::Op::GetAttr("FResourceRequest"); + static auto& fresource = nnvm::Op::GetAttr("FResourceRequest"); if (fresource.count(op) != 0) { reqs = fresource[op](attrs); } else { @@ -218,7 +215,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer } public: - typedef DType DataType; + typedef DType DataType; typedef AccReal AccRealType; /*! 
\brief Add 'fwd_op_name' to kwargs and return the new kwargs */ @@ -233,9 +230,9 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer new_args.emplace_back(a); } } - new_args.push_back({ COREOP_FWD_OP_NAME_KEY, fwd_op_name}); + new_args.push_back({COREOP_FWD_OP_NAME_KEY, fwd_op_name}); if (!bwd_op_name.empty()) { - new_args.push_back({ COREOP_BWD_OP_NAME_KEY, bwd_op_name}); + new_args.push_back({COREOP_BWD_OP_NAME_KEY, bwd_op_name}); } return new_args; } @@ -267,11 +264,10 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer * \param shapes Array of input shapes */ CoreOpExecutor(const bool isGPU, const mxnet::ShapeVector& shapes) - : input_shapes_(shapes) - , op_(nullptr) { - ctx_.is_train = true; - ctx_.run_ctx.ctx.dev_id = 0; - ctx_.run_ctx.stream = nullptr; + : input_shapes_(shapes), op_(nullptr) { + ctx_.is_train = true; + ctx_.run_ctx.ctx.dev_id = 0; + ctx_.run_ctx.stream = nullptr; ctx_.run_ctx.ctx.dev_type = Context::kCPU; #if MXNET_USE_CUDA if (isGPU) { @@ -300,7 +296,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer } nnvm::ObjectPtr GetBackwardDependency(const nnvm::ObjectPtr& node, - std::map* index2array) const { + std::map* index2array) const { index2array->clear(); static auto& fgradient = nnvm::Op::GetAttr("FGradient"); @@ -331,9 +327,9 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer return nullptr; } - nnvm::ObjectPtr CalcBackwardPass(std::map *index2array) const { + nnvm::ObjectPtr CalcBackwardPass(std::map* index2array) const { nnvm::ObjectPtr node = nnvm::Node::Create(); - node->attrs = attrs_; + node->attrs = attrs_; return GetBackwardDependency(node, index2array); } @@ -343,11 +339,10 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer * \param inputs Optional input data (otherwise, random data will be used as input) */ void Init(const kwargs_t& in_args, - const std::vector& inputs = {}, - const std::vector& outputs = {}, - const CoreOpExecutor *backward_for_op = nullptr, - nnvm::ObjectPtr bwd_node_ptr = nullptr - ) { + const std::vector& inputs = {}, + const std::vector& outputs = {}, + const CoreOpExecutor* backward_for_op = nullptr, + nnvm::ObjectPtr bwd_node_ptr = nullptr) { if (!initialized_) { initialized_ = true; @@ -356,7 +351,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer CHECK(op_name.empty() == false); CHECK(!backward_for_op || bwd_op_name.empty()) - << "Backward op should not be supplied another backward operator"; + << "Backward op should not be supplied another backward operator"; if (verbose_ && backward_for_op) { std::cout << "Backward op: " << op_name; @@ -365,7 +360,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer op_ = nnvm::Op::Get(op_name); CHECK_NOTNULL(op_); - std::map index2array; + std::map index2array; nnvm::ObjectPtr bwd_node_ptr; if (backward_for_op) { bwd_node_ptr = backward_for_op->CalcBackwardPass(&index2array); @@ -400,12 +395,12 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer std::vector input_shapes; if (!input_shapes_.empty()) { for (size_t i = 0, n = num_inputs; i < n; ++i) { - input_shapes.emplace_back(i < input_shapes_.size() ? input_shapes_[i] - : input_shapes_[input_shapes_.size() - - 1]); + input_shapes.emplace_back(i < input_shapes_.size() ? 
+ input_shapes_[i] : + input_shapes_[input_shapes_.size() - 1]); } } - std::vector inputs_p, outputs_p; + std::vector inputs_p, outputs_p; if (!outputs.empty()) { CHECK_EQ(outputs.size(), static_cast(inferred_num_outputs)); @@ -438,9 +433,9 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer const int map_key = bwd_node_ptr->inputs[i].index; CHECK(index2array.find(map_key) != index2array.end()); const int dtype = index2array[map_key]->dtype(); - input_types[i] = dtype; + input_types[i] = dtype; } - for (const auto &fwd_inp : backward_for_op->inputs()) { + for (const auto& fwd_inp : backward_for_op->inputs()) { const int dtype = fwd_inp.data().type_flag_; output_types.emplace_back(dtype); } @@ -448,7 +443,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer for (int x = 0; x < num_inputs; ++x) { input_types.emplace_back(default_dtype()); } - for (const auto &fwd_inp : backward_for_op->inputs()) { + for (const auto& fwd_inp : backward_for_op->inputs()) { const int dtype = fwd_inp.data().type_flag_; output_types.emplace_back(dtype); } @@ -482,7 +477,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer for (int i = 0; i < num_inputs; ++i) { const int map_key = bwd_node_ptr->inputs[i].index; CHECK(index2array.find(map_key) != index2array.end()); - const mxnet::TShape &shp = index2array[map_key]->shape(); + const mxnet::TShape& shp = index2array[map_key]->shape(); input_shapes.push_back(shp); const mxnet::TShape ss = input_shapes[i]; } @@ -503,22 +498,21 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer for (size_t i = 0; i < static_cast(inferred_num_outputs); ++i) { // If supplied and valid, pass from the supplied outputs vector // Otherwise use empty for forward pass, or zero-filled for backward pass - outputs_.emplace_back(i < outputs.size() ? outputs[i] - : (backward_for_op - ? CreateZeroArray(output_shapes[i], - ctx_.run_ctx, - output_types[i]) - : NDArray())); + outputs_.emplace_back( + i < outputs.size() ? + outputs[i] : + (backward_for_op ? + CreateZeroArray(output_shapes[i], ctx_.run_ctx, output_types[i]) : + NDArray())); outputs_p.emplace_back(&*outputs_.rbegin()); } } for (size_t i = 0; i < static_cast(num_inputs); ++i) { CHECK_LT(i, static_cast(input_shapes.size())); - inputs_.emplace_back(i < inputs.size() - ? inputs[i] : CreateRandArray(input_shapes[i], - ctx_.run_ctx, - input_types[i])); + inputs_.emplace_back(i < inputs.size() ? 
+ inputs[i] : + CreateRandArray(input_shapes[i], ctx_.run_ctx, input_types[i])); inputs_p.emplace_back(&*inputs_.rbegin()); } @@ -533,15 +527,15 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer CollectBlobs(inputs_, &blob_inputs_); CollectBlobs(outputs_, &blob_outputs_); - function_ = common::GetFCompute(op_, "FCompute", ctx_.run_ctx.ctx); + function_ = common::GetFCompute(op_, "FCompute", ctx_.run_ctx.ctx); functionex_ = common::GetFCompute(op_, "FComputeEx", ctx_.run_ctx.ctx); - stateful_function_ = common::GetFCompute(op_, "FStatefulCompute", - ctx_.run_ctx.ctx); + stateful_function_ = + common::GetFCompute(op_, "FStatefulCompute", ctx_.run_ctx.ctx); AttachResources(&ctx_, attrs_, op_); auto& is_layer_backward = Op::GetAttr("TIsLayerOpBackward"); - auto& createop = nnvm::Op::GetAttr("FCreateOpState"); + auto& createop = nnvm::Op::GetAttr("FCreateOpState"); if (createop.count(op_) || is_layer_backward.get(op_, false)) { if (backward_for_op) { state_ = backward_for_op->state_; @@ -562,7 +556,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer if (bwd_op_name != COREOP_BWD_OP_NAME_VALUE_NONE) { // Backward op was specified std::shared_ptr pOp = std::make_shared( - ctx().run_ctx.ctx.dev_type == Context::kGPU, ShapesOf(this->outputs())); + ctx().run_ctx.ctx.dev_type == Context::kGPU, ShapesOf(this->outputs())); bwd.push_back({pOp, bwd_op_name}); } else { no_backward = true; @@ -573,9 +567,9 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer } if (!no_backward) { CHECK_GE(bwd.size(), 1U) - << "Can't automatically determine backward op name. Please specify"; + << "Can't automatically determine backward op name. Please specify"; - for (std::pair, std::string> &bw_item : bwd) { + for (std::pair, std::string>& bw_item : bwd) { bw_item.first->set_verbose(verbose_); backward_.emplace_back(bw_item.first); bw_item.first->Init(ArgsWithOpName(args, bw_item.second), {}, {}, this); @@ -585,15 +579,15 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer } } - template - inline bool initForward(const OpProp &opProp, std::vector *in_type) { + template + inline bool initForward(const OpProp& opProp, std::vector* in_type) { Init(opProp.GetArgs()); resetForward(); return true; } - template - inline bool initBackward(const OpProp &opProp, std::vector *in_type) { + template + inline bool initBackward(const OpProp& opProp, std::vector* in_type) { resetBackward(); return true; } @@ -670,7 +664,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer CHECK(HasBackward()); if (!backward_.empty()) { // Avoid locked ref count here - for (std::shared_ptr &p : backward_) { + for (std::shared_ptr& p : backward_) { p->Execute(); } return true; @@ -686,7 +680,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer CHECK(HasBackward()); if (!backward_.empty()) { // Avoid locked ref count here - for (std::shared_ptr &p : backward_) { + for (std::shared_ptr& p : backward_) { p->ExecuteEx(); } return true; @@ -702,7 +696,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer CHECK(HasBackward()); if (!backward_.empty()) { // Avoid locked ref count here - for (std::shared_ptr &p : backward_) { + for (std::shared_ptr& p : backward_) { p->ExecuteStateful(); } return true; @@ -714,19 +708,35 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer * \brief Access input NDArray vector * \return reference to NDArray vector of forward inputs */ - std::vector& inputs() { return inputs_; } - const std::vector& inputs() 
const { return inputs_; }
-  std::vector& input_blobs() { return blob_inputs_; }
-  const std::vector& input_blobs() const { return blob_inputs_; }
+  std::vector& inputs() {
+    return inputs_;
+  }
+  const std::vector& inputs() const {
+    return inputs_;
+  }
+  std::vector& input_blobs() {
+    return blob_inputs_;
+  }
+  const std::vector& input_blobs() const {
+    return blob_inputs_;
+  }

   /*!
    * \brief Access input NDArray vector
    * \return reference to NDArray vector of forward outputs
    */
-  std::vector& outputs() { return outputs_; }
-  const std::vector& outputs() const { return outputs_; }
-  std::vector& output_blobs() { return blob_outputs_; }
-  const std::vector& output_blobs() const { return blob_outputs_; }
+  std::vector& outputs() {
+    return outputs_;
+  }
+  const std::vector& outputs() const {
+    return outputs_;
+  }
+  std::vector& output_blobs() {
+    return blob_outputs_;
+  }
+  const std::vector& output_blobs() const {
+    return blob_outputs_;
+  }

   /*!
    * \brief Backward inputs (i.e. output grad)
@@ -792,7 +802,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer
   /*
    * \brief Pointer to the operator object
    */
-  const nnvm::Op *op_;
+  const nnvm::Op* op_;
   /*!
    * \brief Operator attributes
    */
@@ -838,17 +848,21 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer

 class CoreOpProp {
  public:
-  virtual void Init(const kwargs_t& kwargs) { kwargs_ = kwargs; }
-  const kwargs_t& GetArgs() const { return kwargs_; }
+  virtual void Init(const kwargs_t& kwargs) {
+    kwargs_ = kwargs;
+  }
+  const kwargs_t& GetArgs() const {
+    return kwargs_;
+  }
   virtual ~CoreOpProp() {}
+
  private:
-  kwargs_t kwargs_;
+  kwargs_t kwargs_;
 };

-template
+template
 using CoreOperatorRunner = test::OperatorRunner>;

-
 /*!
  * \brief Run a core op forward and backward
  * \tparam DType Data type
  * \tparam AccReal
  * \param isGPU true if operation is to be run against gpu
  * \param op_kwargs Operator parameters
  * \param shapes Array of input shapes
  * \param op_name Name of operator
  * \param backward_op_name Name of backward operator (optional)
  */
-template
+template
 inline void BasicRunCoreOpBidirectional(const bool isGPU,
                                         bool verbose,
                                         const kwargs_t& op_kwargs,
                                         const mxnet::ShapeVector& shapes,
-                                        const char *op_name,
-                                        const char *backward_op_name = "") {
+                                        const char* op_name,
+                                        const char* backward_op_name = "") {
   test::op::CoreOpExecutor op(isGPU, shapes);
   op.set_verbose(verbose);
diff --git a/tests/cpp/include/test_legacy_op.h b/tests/cpp/include/test_legacy_op.h
index fdb52cf6e4e0..4259751b71c4 100644
--- a/tests/cpp/include/test_legacy_op.h
+++ b/tests/cpp/include/test_legacy_op.h
@@ -60,8 +60,8 @@ namespace op {
  * \tparam DType
  */
 template
-class LegacyOperatorExecutor : public OperatorDataInitializer
-                             , public OperatorExecutorTiming {
+class LegacyOperatorExecutor : public OperatorDataInitializer,
+                               public OperatorExecutorTiming {
  public:
   typedef DType DataType;
   typedef AccReal AccRealType;
@@ -69,14 +69,17 @@ class LegacyOperatorExecutor : public OperatorDataInitializer
   /*!
\brief Manage test blobs and context */ LegacyOperatorExecutor(const bool isGPU, const mxnet::ShapeVector& topShapes) #if !MXNET_USE_CUDA - : isGPU_(false) + : isGPU_(false) #else - : isGPU_(isGPU) + : isGPU_(isGPU) #endif - , initializeForward_(0) // unit testing may call inits in any order based - , initializeBackward_(0) // upon its use-case (ie may not want to run forward pass first) - , initializeCallback_(0) { - opContext_.is_train = true; + , + initializeForward_(0) // unit testing may call inits in any order based + , + initializeBackward_(0) // upon its use-case (ie may not want to run forward pass first) + , + initializeCallback_(0) { + opContext_.is_train = true; opContext_.run_ctx.stream = nullptr; CHECK(!topShapes.empty()); shape_input_vec_ = topShapes; @@ -93,14 +96,14 @@ class LegacyOperatorExecutor : public OperatorDataInitializer virtual void resetBackward() {} /*! \brief Initialize auxiliary and output blobs */ - template - bool initForward(const OperatorPropertyType &opProp, std::vector *in_type) { + template + bool initForward(const OperatorPropertyType& opProp, std::vector* in_type) { if (!initializeForward_++) { shape_input_vec_.resize(opProp.ListArguments().size()); op_.reset(opProp.CreateOperatorEx(getContext(), &shape_input_vec_, in_type)); if (op_) { const size_t output_count = opProp.ListOutputs().size(); - const size_t aux_count = opProp.ListAuxiliaryStates().size(); + const size_t aux_count = opProp.ListAuxiliaryStates().size(); // Figure out what sort of blobs we need to allocate mxnet::ShapeVector out_shape, aux_shape; out_shape.resize(output_count); @@ -150,19 +153,23 @@ class LegacyOperatorExecutor : public OperatorDataInitializer } /*! \brief Initialize auxiliary and output blobs */ - template - bool initBackward(const OperatorPropertyType &opProp, std::vector *in_type) { + template + bool initBackward(const OperatorPropertyType& opProp, std::vector* in_type) { initForward(opProp, in_type); if (!initializeBackward_++) { for (size_t x = 0, n = static_cast(opProp.NumVisibleOutputs()); x < n; ++x) { CHECK_LT(x, c_.blob_output_vec_.size()); - allocateBlob(&c_.blob_out_grad_, c_.blob_output_vec_[x].shape_, - false, c_.blob_output_vec_[x].type_flag_); + allocateBlob(&c_.blob_out_grad_, + c_.blob_output_vec_[x].shape_, + false, + c_.blob_output_vec_[x].type_flag_); } for (size_t x = 0, n = c_.blob_input_vec_.size(); x < n; ++x) { - allocateBlob(&c_.blob_in_grad_, c_.blob_input_vec_[x].shape_, - false, c_.blob_input_vec_[x].type_flag_); + allocateBlob(&c_.blob_in_grad_, + c_.blob_input_vec_[x].shape_, + false, + c_.blob_input_vec_[x].type_flag_); } // Get the resource of temporal space @@ -180,18 +187,14 @@ class LegacyOperatorExecutor : public OperatorDataInitializer void forward(const size_t count = 1) { const std::vector req(c_.blob_output_vec_.size(), kWriteTo); // Possibly move data to/from CPU and GPU (outside of timing scope) - MXNET_CUDA_ONLY(std::unique_ptr gpuData(isGPU_ ? - new GPUOpData(c_, &opContext_) : nullptr)); - perf::TimingItem timeF(&OperatorExecutorTiming::GetTiming(), Forward, - "Forward", count); + MXNET_CUDA_ONLY( + std::unique_ptr gpuData(isGPU_ ? 
new GPUOpData(c_, &opContext_) : nullptr)); + perf::TimingItem timeF(&OperatorExecutorTiming::GetTiming(), Forward, "Forward", count); if (!isGPU_) { mxnet::profiler::vtune::VTuneResume profile; // VTune sample only this scope for (size_t x = 0; x < count; ++x) { - op()->Forward(opContext_, - c_.blob_input_vec_, - req, - c_.blob_output_vec_, - c_.blob_aux_states_); + op()->Forward( + opContext_, c_.blob_input_vec_, req, c_.blob_output_vec_, c_.blob_aux_states_); } } else { for (size_t x = 0; x < count; ++x) { @@ -208,10 +211,9 @@ class LegacyOperatorExecutor : public OperatorDataInitializer void backward(const size_t count = 1) { const std::vector req(c_.blob_in_grad_.size(), kWriteTo); // Possibly move data to/from CPU and GPU (outside of timing scope) - MXNET_CUDA_ONLY(std::unique_ptr gpuData(isGPU_ ? - new GPUOpData(c_, &opContext_) : nullptr)); - perf::TimingItem timeB(&OperatorExecutorTiming::GetTiming(), Backward, - "Backward", count); + MXNET_CUDA_ONLY( + std::unique_ptr gpuData(isGPU_ ? new GPUOpData(c_, &opContext_) : nullptr)); + perf::TimingItem timeB(&OperatorExecutorTiming::GetTiming(), Backward, "Backward", count); if (!isGPU_) { mxnet::profiler::vtune::VTuneResume profile; // VTune sample only this scope for (size_t x = 0; x < count; ++x) { @@ -240,25 +242,26 @@ class LegacyOperatorExecutor : public OperatorDataInitializer * \brief Test if operator has a backward pass * \return true if this operator has a backward pass */ - MSHADOW_CINLINE bool HasBackward() const { return true; } + MSHADOW_CINLINE bool HasBackward() const { + return true; + } /*! \brief Getter functions for the operator */ - inline Operator *op() { return op_.get(); } - inline const Operator *op() const { return op_.get(); } - - enum BlobVectorType { - kInput, - kOutput, - kAux, - kInGrad, - kOutGrad, - kBlobVectorTypeCount - }; + inline Operator* op() { + return op_.get(); + } + inline const Operator* op() const { + return op_.get(); + } -#define CASE_STR(__v$) case (__v$): return #__v$ + enum BlobVectorType { kInput, kOutput, kAux, kInGrad, kOutGrad, kBlobVectorTypeCount }; + +#define CASE_STR(__v$) \ + case (__v$): \ + return #__v$ /*! \brief Convert BlobVectorType enum into a string */ - static inline const char *bvt2String(const BlobVectorType bvt) { + static inline const char* bvt2String(const BlobVectorType bvt) { switch (bvt) { CASE_STR(kInput); CASE_STR(kOutput); @@ -298,11 +301,11 @@ class LegacyOperatorExecutor : public OperatorDataInitializer * After that, you can compare with the "actual" operator state (BasicOperatorData) of * the operator that you are testing. 
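   * A minimal usage sketch (an editorial illustration only; the executor set-up,
   * shapes and label below are hypothetical and not part of this patch):
   *   LegacyOperatorExecutor<float, float> exec(false, {mxnet::TShape({2, 3})});
   *   exec.initForward(prop, &in_types);  // assumes a configured 'prop' and 'in_types'
   *   exec.forward();
   *   exec.dumpC(&std::cout, "my_test");  // paste the emitted arrays, reload via load()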
*/ - template - inline void dumpC(Stream *_os, const std::string& label) { + template + inline void dumpC(Stream* _os, const std::string& label) { Stream& os = *_os; - os << "static const std::vector< std::vector< std::vector > > ___" - << label << "_data_shape_"; + os << "static const std::vector< std::vector< std::vector > > ___" << label + << "_data_shape_"; const mxnet::TShape& shape = shape_input_vec_[0]; for (size_t i = 0, n = shape.ndim(); i < n; ++i) { os << shape[i] << "_"; @@ -329,10 +332,12 @@ class LegacyOperatorExecutor : public OperatorDataInitializer os << "};" << std::endl; } - static inline void copy(const TBlob& blob, const DType array[], - const size_t start, const size_t end) { + static inline void copy(const TBlob& blob, + const DType array[], + const size_t start, + const size_t end) { const size_t blobSize = blob.Size(); - DType *p = blob.dptr(); + DType* p = blob.dptr(); for (size_t i = 0, n = end - start; i < n; ++i) { CHECK_LT(i, blobSize); p[i] = array[i + start]; @@ -342,63 +347,75 @@ class LegacyOperatorExecutor : public OperatorDataInitializer /*! \brief Runtime load of the C++ data code generated by dumpC() */ void load(const std::vector>>& cData) { for (size_t i = 0, ni = cData.size(); i < ni; ++i) { - for (size_t j = 0, nj = cData[i].size(); j < nj; ++j) { - const TBlob& blob = getBlobVect(BlobVectorType(i))[j]; + for (size_t j = 0, nj = cData[i].size(); j < nj; ++j) { + const TBlob& blob = getBlobVect(BlobVectorType(i))[j]; const size_t sourceDataSize = cData[i][j].size(); CHECK_EQ(sourceDataSize, blob.Size()); - const DType *sourceData = &cData[i][j][0]; + const DType* sourceData = &cData[i][j][0]; copy(blob, sourceData, 0, sourceDataSize); } } } /*! \brief Runtime load of the C++ data code generated by dumpC() */ - void load(const std::vector>>& cData, - const BlobVectorType type) { + void load(const std::vector>>& cData, const BlobVectorType type) { CHECK_LT(type, cData.size()); - for (size_t j = 0, nj = cData[type].size(); j < nj; ++j) { - const TBlob& blob = getBlobVect(type)[j]; + for (size_t j = 0, nj = cData[type].size(); j < nj; ++j) { + const TBlob& blob = getBlobVect(type)[j]; const size_t sourceDataSize = cData[type][j].size(); CHECK_EQ(sourceDataSize, blob.Size()); - const DType *sourceData = &cData[type][j][0]; + const DType* sourceData = &cData[type][j][0]; copy(blob, sourceData, 0, sourceDataSize); } } /*! 
\brief Runtime load of the C++ data code generated by dumpC() */ void load(const std::vector>>& cData, - const BlobVectorType type, const int idx) { + const BlobVectorType type, + const int idx) { CHECK_LT(type, cData.size()); CHECK_LT(idx, cData[type].size()); - const TBlob& blob = getBlobVect(type)[idx]; + const TBlob& blob = getBlobVect(type)[idx]; const size_t sourceDataSize = cData[type][idx].size(); CHECK_EQ(sourceDataSize, blob.Size()); - const DType *sourceData = &cData[type][idx][0]; + const DType* sourceData = &cData[type][idx][0]; copy(blob, sourceData, 0, sourceDataSize); } -// void FillRandom() { -// for (size_t j = 0, jn = this->c_.all_blob_vects_.size(); j < jn; ++j) { -// std::vector *data_vect = this->c_.all_blob_vects_[j]; -// if (data_vect) { -// for (size_t i = 0, n = data_vect->size(); i < n; ++i) { -// OperatorDataInitializer::FillRandom((*data_vect)[i]); -// } -// } -// } -// } - - std::vector& inputs() { return c_.blob_input_vec_; } - const std::vector& inputs() const { return c_.blob_input_vec_; } - std::vector& outputs() { return c_.blob_output_vec_; } - const std::vector& outputs() const { return c_.blob_output_vec_; } - std::vector& bwd_inputs() { return c_.blob_out_grad_; } - std::vector& bwd_outputs() { return c_.blob_in_grad_; } + // void FillRandom() { + // for (size_t j = 0, jn = this->c_.all_blob_vects_.size(); j < jn; ++j) { + // std::vector *data_vect = this->c_.all_blob_vects_[j]; + // if (data_vect) { + // for (size_t i = 0, n = data_vect->size(); i < n; ++i) { + // OperatorDataInitializer::FillRandom((*data_vect)[i]); + // } + // } + // } + // } + + std::vector& inputs() { + return c_.blob_input_vec_; + } + const std::vector& inputs() const { + return c_.blob_input_vec_; + } + std::vector& outputs() { + return c_.blob_output_vec_; + } + const std::vector& outputs() const { + return c_.blob_output_vec_; + } + std::vector& bwd_inputs() { + return c_.blob_out_grad_; + } + std::vector& bwd_outputs() { + return c_.blob_in_grad_; + } /*! 
\brief Input and output blobs */ - OpContext opContext_; + OpContext opContext_; - mxnet::ShapeVector shape_input_vec_; + mxnet::ShapeVector shape_input_vec_; struct OpData { std::vector blob_input_vec_; @@ -407,7 +424,7 @@ class LegacyOperatorExecutor : public OperatorDataInitializer std::vector blob_in_grad_; std::vector blob_out_grad_; // Remaining err (loss) pushing back upstream - std::vector *> all_blob_vects_; + std::vector*> all_blob_vects_; inline OpData() { all_blob_vects_.emplace_back(&blob_input_vec_); all_blob_vects_.emplace_back(&blob_output_vec_); @@ -420,31 +437,30 @@ class LegacyOperatorExecutor : public OperatorDataInitializer #if MXNET_USE_CUDA class GPUOpData : public OpData { - GPUOpData() = delete; + GPUOpData() = delete; GPUOpData(const GPUOpData& o) = delete; public: - inline GPUOpData(const OpData& cpuData, OpContext *opContext) - : cpuData_(cpuData) - , allocGPUStream_(opContext) { + inline GPUOpData(const OpData& cpuData, OpContext* opContext) + : cpuData_(cpuData), allocGPUStream_(opContext) { // Copy CPU->GPU CHECK_EQ(gpuBlobs_.size(), 0U); CHECK_EQ(cpuData_.all_blob_vects_.size(), this->all_blob_vects_.size()); for (size_t bvt = 0, nbvt = cpuData_.all_blob_vects_.size(); bvt < nbvt; ++bvt) { - std::vector& bv_src = *cpuData_.all_blob_vects_[bvt]; + std::vector& bv_src = *cpuData_.all_blob_vects_[bvt]; std::vector& bvt_dest = *this->all_blob_vects_[bvt]; for (size_t i = 0, n = bv_src.size(); i < n; ++i) { const TBlob& srcBlob = bv_src[i]; - TBlob *destBlob = allocateBlob(&gpuBlobs_, &bvt_dest, srcBlob.shape_, - true, srcBlob.type_flag_); + TBlob* destBlob = + allocateBlob(&gpuBlobs_, &bvt_dest, srcBlob.shape_, true, srcBlob.type_flag_); Context cpu_ctx, gpu_ctx; cpu_ctx.dev_type = Context::kCPU; gpu_ctx.dev_type = Context::kGPU; cpu_ctx.dev_id = gpu_ctx.dev_id = 0; - mxnet::ndarray::Copy(srcBlob, destBlob, cpu_ctx, - gpu_ctx, allocGPUStream_.opContext_.run_ctx); + mxnet::ndarray::Copy( + srcBlob, destBlob, cpu_ctx, gpu_ctx, allocGPUStream_.opContext_.run_ctx); } } cudaDeviceSynchronize(); @@ -453,19 +469,19 @@ class LegacyOperatorExecutor : public OperatorDataInitializer // Copy GPU->CPU cudaDeviceSynchronize(); for (size_t bvt = 0, nbvt = this->all_blob_vects_.size(); bvt < nbvt; ++bvt) { - std::vector& bv_src = *this->all_blob_vects_[bvt]; + std::vector& bv_src = *this->all_blob_vects_[bvt]; std::vector& bvt_dest = *cpuData_.all_blob_vects_[bvt]; for (size_t i = 0, n = bv_src.size(); i < n; ++i) { const TBlob& srcBlob = bv_src[i]; - TBlob *destBlob = &bvt_dest[i]; + TBlob* destBlob = &bvt_dest[i]; Context cpu_ctx, gpu_ctx; cpu_ctx.dev_type = Context::kCPU; gpu_ctx.dev_type = Context::kGPU; cpu_ctx.dev_id = gpu_ctx.dev_id = 0; - mxnet::ndarray::Copy(srcBlob, destBlob, gpu_ctx, - cpu_ctx, allocGPUStream_.opContext_.run_ctx); + mxnet::ndarray::Copy( + srcBlob, destBlob, gpu_ctx, cpu_ctx, allocGPUStream_.opContext_.run_ctx); } } gpuBlobs_.clear(); // Force deallocation of the GPU blob data @@ -483,7 +499,7 @@ class LegacyOperatorExecutor : public OperatorDataInitializer #endif // MXNET_USE_CUDA protected: - OpData c_; + OpData c_; /*! \brief Allocate the operator's resource requests */ void allocateResources(const std::vector& reqs) { @@ -491,7 +507,7 @@ class LegacyOperatorExecutor : public OperatorDataInitializer Context ctx; ctx.dev_type = isGPU_ ? 
Context::kGPU : Context::kCPU; - ctx.dev_id = 0; + ctx.dev_id = 0; for (const ResourceRequest& req : reqs) { switch (req.type) { @@ -513,7 +529,7 @@ class LegacyOperatorExecutor : public OperatorDataInitializer Resource rm = ResourceManager::Get()->Request(ctx, req); if (ctx.dev_mask() == Context::kCPU) { common::random::RandGenerator::AllocState( - rm.get_parallel_random()); + rm.get_parallel_random()); } opContext_.requested.emplace_back(rm); break; @@ -531,47 +547,46 @@ class LegacyOperatorExecutor : public OperatorDataInitializer } /*! \brief Locally allocate a managed TBlob and insert into the supplied vector */ - static TBlob *allocateBlob(std::list> *standalone_blobs, - std::vector *dest, + static TBlob* allocateBlob(std::list>* standalone_blobs, + std::vector* dest, const mxnet::TShape& shape, const bool isGPU, const int dtype) { - test::StandaloneBlob *blob = new test::StandaloneBlob(shape, isGPU, dtype); - CHECK_NE(blob, static_cast(nullptr)); + test::StandaloneBlob* blob = new test::StandaloneBlob(shape, isGPU, dtype); + CHECK_NE(blob, static_cast(nullptr)); standalone_blobs->emplace_back(std::unique_ptr(blob)); (*dest).emplace_back(*blob); return blob; } /*! \brief Locally allocate a managed TBlob and insert into the supplied vector */ - inline TBlob *allocateBlob(std::vector *dest, const mxnet::TShape& shape, - const bool isGPU, const int dtype) { + inline TBlob* allocateBlob(std::vector* dest, + const mxnet::TShape& shape, + const bool isGPU, + const int dtype) { return allocateBlob(&standalone_blobs_, dest, shape, isGPU, dtype); } /*! \brief Performance timing categories */ - enum TimingId { - Forward, - Backward - }; + enum TimingId { Forward, Backward }; /*! \brief The operator */ - std::unique_ptr op_; + std::unique_ptr op_; /*! \brief Is this for a GPU? */ - const bool isGPU_; + const bool isGPU_; /*! \brief Assure that the Forward initialized only once */ - std::atomic initializeForward_; + std::atomic initializeForward_; /*! \brief Assure that the Forward initialized only once */ - std::atomic initializeBackward_; + std::atomic initializeBackward_; /*! \brief Assure that the callback is initialized only once */ - std::atomic initializeCallback_; + std::atomic initializeCallback_; /*! \brief scoped lifecycle management of allocated blobs */ std::list> standalone_blobs_; }; -template +template using LegacyOpRunner = -mxnet::test::OperatorRunner>; + mxnet::test::OperatorRunner>; } // namespace op } // namespace test diff --git a/tests/cpp/include/test_ndarray_utils.h b/tests/cpp/include/test_ndarray_utils.h index 8a53298f4811..5656d2003d0a 100644 --- a/tests/cpp/include/test_ndarray_utils.h +++ b/tests/cpp/include/test_ndarray_utils.h @@ -41,8 +41,8 @@ using namespace mxnet; #define TEST_DTYPE float #define TEST_ITYPE int32_t -inline void CheckDataRegion(const TBlob &src, const TBlob &dst) { - auto size = src.shape_.Size() * mshadow::mshadow_sizeof(src.type_flag_); +inline void CheckDataRegion(const TBlob& src, const TBlob& dst) { + auto size = src.shape_.Size() * mshadow::mshadow_sizeof(src.type_flag_); auto equals = memcmp(src.dptr_, dst.dptr_, size); EXPECT_EQ(equals, 0); } @@ -55,13 +55,14 @@ inline unsigned gen_rand_seed() { inline float RandFloat() { static unsigned seed = gen_rand_seed(); - double v = rand_r(&seed) * 1.0 / RAND_MAX; + double v = rand_r(&seed) * 1.0 / RAND_MAX; return static_cast(v); } // Get an NDArray with provided indices, prepared for a RowSparse NDArray. 
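 // For illustration only (hypothetical values, not from this patch): calling
 // RspIdxND(mshadow::Shape1(3), Context::CPU(-1), {0, 2, 5}) builds the aux index
 // array that marks rows 0, 2 and 5 as the occupied rows of a RowSparse NDArray.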
-inline NDArray RspIdxND(const mxnet::TShape shape, const Context ctx, - const std::vector &values) { +inline NDArray RspIdxND(const mxnet::TShape shape, + const Context ctx, + const std::vector& values) { NDArray nd(shape, ctx, false, ROW_SPARSE_IDX_TYPE); size_t num_val = values.size(); MSHADOW_TYPE_SWITCH(nd.dtype(), DType, { @@ -92,8 +93,8 @@ inline NDArray DnsND(const mxnet::TShape shape, const Context ctx, std::vector -static void inline CopyBlob(mshadow::Stream *s, +template +static void inline CopyBlob(mshadow::Stream* s, const TBlob& dest_blob, const TBlob& src_blob) { using namespace mshadow; @@ -125,10 +126,9 @@ inline NDArray RspND(const mxnet::TShape shape, print(&std::cout, "data", data); // create result nd mxnet::ShapeVector aux_shapes = {mshadow::Shape1(num_rows)}; - NDArray nd(kRowSparseStorage, shape, ctx, false, mshadow::default_type_flag, - {}, aux_shapes); + NDArray nd(kRowSparseStorage, shape, ctx, false, mshadow::default_type_flag, {}, aux_shapes); - mshadow::Stream *s = nullptr; + mshadow::Stream* s = nullptr; CopyBlob(s, nd.aux_data(rowsparse::kIdx), index.data()); CopyBlob(s, nd.data(), data.data()); @@ -137,15 +137,15 @@ inline NDArray RspND(const mxnet::TShape shape, } /*! \brief Array - utility class to construct sparse arrays - * \warning This class is not meant to run in a production environment. Since it is for unit tests only, - * simplicity has been chosen over performance. + * \warning This class is not meant to run in a production environment. Since it is for unit tests + *only, simplicity has been chosen over performance. **/ -template +template class Array { typedef std::map > TItems; static constexpr double EPSILON = 1e-5; - static const char *st2str(const NDArrayStorageType storageType) { + static const char* st2str(const NDArrayStorageType storageType) { switch (storageType) { case kDefaultStorage: return "kDefaultStorage"; @@ -163,15 +163,13 @@ class Array { /*! \brief Remove all zero entries */ void Prune() { - for (typename TItems::iterator i = items_.begin(), e = items_.end(); - i != e;) { - const size_t y = i->first; - std::map &m = i->second; + for (typename TItems::iterator i = items_.begin(), e = items_.end(); i != e;) { + const size_t y = i->first; + std::map& m = i->second; ++i; - for (typename std::map::const_iterator j = m.begin(), jn = m.end(); - j != jn;) { + for (typename std::map::const_iterator j = m.begin(), jn = m.end(); j != jn;) { const size_t x = j->first; - const DType v = j->second; + const DType v = j->second; ++j; if (IsZero(v)) { m.erase(x); @@ -186,20 +184,19 @@ class Array { /*! 
\brief Create a dense NDArray from our mapped data */ NDArray CreateDense(const Context& ctx) const { NDArray array(shape_, Context::CPU(-1)); - TBlob data = array.data(); - DType *p_data = data.dptr(); + TBlob data = array.data(); + DType* p_data = data.dptr(); memset(p_data, 0, array.shape().Size() * sizeof(DType)); - for (typename TItems::const_iterator i = items_.begin(), e = items_.end(); - i != e; ++i) { - const size_t y = i->first; - const std::map &m = i->second; - for (typename std::map::const_iterator j = m.begin(), jn = m.end(); - j != jn; ++j) { + for (typename TItems::const_iterator i = items_.begin(), e = items_.end(); i != e; ++i) { + const size_t y = i->first; + const std::map& m = i->second; + for (typename std::map::const_iterator j = m.begin(), jn = m.end(); j != jn; + ++j) { const size_t x = j->first; - const DType v = j->second; + const DType v = j->second; if (!IsZero(v)) { const size_t offset = mxnet::test::offset(shape_, {y, x}); - p_data[offset] = v; + p_data[offset] = v; } } } @@ -215,11 +212,9 @@ class Array { public: Array() = default; - explicit Array(const mxnet::TShape &shape) - : shape_(shape) {} + explicit Array(const mxnet::TShape& shape) : shape_(shape) {} - explicit Array(const NDArray &arr) - : shape_(arr.shape()) { + explicit Array(const NDArray& arr) : shape_(arr.shape()) { Load(arr); } @@ -228,19 +223,25 @@ class Array { shape_ = mxnet::TShape(0); } - static inline bool IsNear(const DType v1, const DType v2) { return fabs(v2 - v1) <= EPSILON; } - static inline bool IsZero(const DType v) { return IsNear(v, DType(0)); } + static inline bool IsNear(const DType v1, const DType v2) { + return fabs(v2 - v1) <= EPSILON; + } + static inline bool IsZero(const DType v) { + return IsNear(v, DType(0)); + } /*! Index into value maps via: [y][x] (row, col) */ - std::map &operator[](const size_t idx) { return items_[idx]; } + std::map& operator[](const size_t idx) { + return items_[idx]; + } - const std::map &operator[](const size_t idx) const { + const std::map& operator[](const size_t idx) const { typename TItems::const_iterator i = items_.find(idx); if (i != items_.end()) { return i->second; } CHECK(false) << "Attempt to access a non-existent key in a constant map"; - return *static_cast *>(nullptr); + return *static_cast*>(nullptr); } bool Contains(const size_t row, const size_t col) const { @@ -255,12 +256,12 @@ class Array { } /*! \brief Convert from one storage type NDArray to another */ - static NDArray Convert(const Context& ctx, const NDArray& src, + static NDArray Convert(const Context& ctx, + const NDArray& src, const NDArrayStorageType storageType) { - std::unique_ptr pArray( - storageType == kDefaultStorage - ? new NDArray(src.shape(), ctx) - : new NDArray(storageType, src.shape(), ctx)); + std::unique_ptr pArray(storageType == kDefaultStorage ? 
+                                             new NDArray(src.shape(), ctx) :
+                                             new NDArray(storageType, src.shape(), ctx));
     OpContext opContext;
     MXNET_CUDA_ONLY(std::unique_ptr gpuScope;);
     switch (ctx.dev_type) {
@@ -269,7 +270,7 @@
         gpuScope.reset(new test::op::GPUStreamScope(&opContext));
         mxnet::op::CastStorageComputeImpl(s, src, dest);
         break;
-#endif // MNXNET_USE_CUDA
+#endif  // MXNET_USE_CUDA
       default: {  // CPU
         OpContext op_ctx;
         mxnet::op::CastStorageComputeImpl(op_ctx, src, *pArray);
@@ -308,7 +309,7 @@
     }
 #endif  // MXNET_USE_CUDA
     const TBlob blob = array.data();
-    DType *p = blob.dptr();
+    DType* p         = blob.dptr();
     CHECK_EQ(shape_.ndim(), 2U);
     for (size_t row = 0, nrow = shape_[0]; row < nrow; ++row) {
       for (size_t col = 0, ncol = shape_[1]; col < ncol; ++col) {
@@ -321,15 +322,14 @@
   }

   void print() const {
-    for (typename TItems::const_iterator i = items_.begin(), e = items_.end();
-         i != e; ++i) {
-      const size_t y = i->first;
-      const std::map &m = i->second;
+    for (typename TItems::const_iterator i = items_.begin(), e = items_.end(); i != e; ++i) {
+      const size_t y = i->first;
+      const std::map& m = i->second;
       CHECK_EQ(m.empty(), false);  // How did it get to have an empty map?
-      for (typename std::map::const_iterator j = m.begin(), jn = m.end();
-           j != jn; ++j) {
+      for (typename std::map::const_iterator j = m.begin(), jn = m.end(); j != jn;
+           ++j) {
         const size_t x = j->first;
-        const DType v = j->second;
+        const DType v  = j->second;
         if (!IsZero(v)) {
           std::cout << "[row=" << y << ", col=" << x << "]: " << v << std::endl;
         }
@@ -343,11 +343,10 @@
   TItems items_;
 };

-template
-inline StreamType& print_dense(StreamType *_os, const std::string& label, const NDArray& arr) {
+template
+inline StreamType& print_dense(StreamType* _os, const std::string& label, const NDArray& arr) {
   MSHADOW_TYPE_SWITCH(arr.data().type_flag_, DType, {
-    print(_os, label, test::Array(arr).Save(arr.ctx(), kDefaultStorage))
-        << std::endl;
+    print(_os, label, test::Array(arr).Save(arr.ctx(), kDefaultStorage)) << std::endl;
   });
   return *_os;
 }
diff --git a/tests/cpp/include/test_op.h b/tests/cpp/include/test_op.h
index c80255d72f20..141c5975f993 100644
--- a/tests/cpp/include/test_op.h
+++ b/tests/cpp/include/test_op.h
@@ -67,17 +67,14 @@ namespace op {
  * \brief Maintain the lifecycle of a GPU stream
  */
 struct GPUStreamScope {
-  explicit inline GPUStreamScope(OpContext *opContext)
-    : opContext_(*opContext) {
-    CHECK_EQ(opContext_.run_ctx.stream == nullptr, true)
-      << "Invalid runtime context stream state";
+  explicit inline GPUStreamScope(OpContext* opContext) : opContext_(*opContext) {
+    CHECK_EQ(opContext_.run_ctx.stream == nullptr, true) << "Invalid runtime context stream state";
     opContext_.run_ctx.stream = mshadow::NewStream(true, true, opContext_.run_ctx.ctx.dev_id);
-    CHECK_EQ(opContext_.run_ctx.stream != nullptr, true)
-      << "Unable to allocate a GPU stream";
+    CHECK_EQ(opContext_.run_ctx.stream != nullptr, true) << "Unable to allocate a GPU stream";
   }
   inline ~GPUStreamScope() {
     if (opContext_.run_ctx.stream) {
-      mshadow::DeleteStream(static_cast *>(opContext_.run_ctx.stream));
+      mshadow::DeleteStream(static_cast*>(opContext_.run_ctx.stream));
       opContext_.run_ctx.stream = nullptr;
     }
   }
@@ -88,12 +85,10 @@ struct GPUStreamScope {
 /*!
* \brief Base class for operator test-data classes */ -template +template class OperatorDataInitializer { public: - OperatorDataInitializer() - : generator_(new std::mt19937()) { - } + OperatorDataInitializer() : generator_(new std::mt19937()) {} virtual ~OperatorDataInitializer() {} /*! @@ -132,7 +127,9 @@ class OperatorDataInitializer { * \brief mt19937 generator for random number generator * \return reference to mt19937 generator object */ - std::mt19937& generator() const { return *generator_; } + std::mt19937& generator() const { + return *generator_; + } /*! \brief Per-test generator */ std::unique_ptr generator_; @@ -140,7 +137,9 @@ class OperatorDataInitializer { class OperatorExecutorTiming { public: - inline test::perf::TimingInstrument& GetTiming() { return timing_; } + inline test::perf::TimingInstrument& GetTiming() { + return timing_; + } private: /*! Timing instrumentation */ @@ -148,10 +147,10 @@ class OperatorExecutorTiming { }; /*! \brief Top-level operator test state info structure */ -template +template struct OpInfo { /*! \brief The operator data */ - std::shared_ptr< OperatorExecutor > executor_; + std::shared_ptr executor_; /*! \brief The operator prop class */ std::shared_ptr prop_; /*! \brief The input type(s) */ @@ -159,16 +158,16 @@ struct OpInfo { }; /*! \brief Pair of op info objects, generally for validating ops against each other */ -template +template struct OpInfoPair { /*! \brief Operator item 1 */ - test::op::OpInfo info_1_; + test::op::OpInfo info_1_; /*! \brief Operator item 2 */ - test::op::OpInfo info_2_; + test::op::OpInfo info_2_; }; /*! \brief Base validator class for validating test data */ -template +template class Validator { public: static inline DType ERROR_BOUND() { @@ -180,10 +179,10 @@ class Validator { } } - static inline DType ErrorBound(const TBlob *blob) { + static inline DType ErrorBound(const TBlob* blob) { // Due to eps, for a small number of entries, the error will be a bit higher for one pass if (blob->shape_.ndim() >= 3) { - if (blob->Size() / blob->shape_[1] <=4) { + if (blob->Size() / blob->shape_[1] <= 4) { return ERROR_BOUND() * 15; } else { return ERROR_BOUND(); @@ -195,11 +194,11 @@ class Validator { } /*! \brief Adjusted error based upon significant digits */ - template - static inline DType ErrorBound(const TBlob *blob, const DTypeX v1, const DTypeX v2) { + template + static inline DType ErrorBound(const TBlob* blob, const DTypeX v1, const DTypeX v2) { const DType initialErrorBound = ErrorBound(blob); DType kErrorBound = initialErrorBound; // This error is based upon the range [0.1x, 0.9x] - DTypeX avg = static_cast((fabs(v1) + fabs(v2)) / 2); + DTypeX avg = static_cast((fabs(v1) + fabs(v2)) / 2); if (avg >= 1) { uint64_t vv = static_cast(avg + 0.5); do { @@ -209,19 +208,21 @@ class Validator { return kErrorBound; } - template + template static bool isNear(const DTypeX v1, const DTypeX v2, const AccReal error) { return error >= fabs(v2 - v1); } /*! 
\brief Convenient setpoint for macro-expanded failures */ - template - static void on_failure(const size_t i, const size_t n, - const Type1 v1, const Type1 v2, const Type2 kErrorBound) { - LOG(WARNING) - << "Near test failure: at i = " << i << ", n = " - << n << ", kErrorBound = " << kErrorBound << std::endl - << std::flush; + template + static void on_failure(const size_t i, + const size_t n, + const Type1 v1, + const Type1 v2, + const Type2 kErrorBound) { + LOG(WARNING) << "Near test failure: at i = " << i << ", n = " << n + << ", kErrorBound = " << kErrorBound << std::endl + << std::flush; } /*! \brief Compare blob data */ @@ -229,12 +230,12 @@ class Validator { if (b1.shape_ == b2.shape_) { CHECK_EQ(b1.type_flag_, b2.type_flag_) << "Can't compare blobs of different data types"; MSHADOW_REAL_TYPE_SWITCH(b1.type_flag_, DTypeX, { - const DTypeX *d1 = b1.dptr(); - const DTypeX *d2 = b2.dptr(); + const DTypeX* d1 = b1.dptr(); + const DTypeX* d2 = b2.dptr(); CHECK_NE(d1, d2); // don't compare the same memory for (size_t i = 0, n = b1.Size(), warningCount = 0; i < n; ++i) { - const DTypeX v1 = *d1++; - const DTypeX v2 = *d2++; + const DTypeX v1 = *d1++; + const DTypeX v2 = *d2++; const DType kErrorBound = ErrorBound(&b1, v1, v2); EXPECT_NEAR(v1, v2, kErrorBound); if (!isNear(v1, v2, kErrorBound) && !warningCount++) { @@ -249,9 +250,9 @@ class Validator { } /*! \brief Compare blob data to a pointer to data */ - template - static bool compare(const TBlob& b1, const DTypeX *valuePtr) { - const DTypeX *d1 = b1.dptr(); + template + static bool compare(const TBlob& b1, const DTypeX* valuePtr) { + const DTypeX* d1 = b1.dptr(); CHECK_NE(d1, valuePtr); // don't compare the same memory const DType kErrorBound = ErrorBound(&b1); for (size_t i = 0, n = b1.Size(), warningCount = 0; i < n; ++i) { @@ -270,16 +271,13 @@ class Validator { typedef std::vector > kwargs_t; /*! \brief Create operator data, prop, the operator itself and init default forward input */ -template< - typename OperatorProp, - typename OperatorExecutor, - typename ...Args> -static test::op::OpInfo createOpAndInfoF(const kwargs_t &kwargs, +template +static test::op::OpInfo createOpAndInfoF(const kwargs_t& kwargs, Args... 
args) {
   test::op::OpInfo info;
   info.executor_ = std::make_shared(args...);
-  info.prop_ = std::make_shared();
-  info.in_type_ = { mshadow::DataType::kFlag };
+  info.prop_     = std::make_shared();
+  info.in_type_  = {mshadow::DataType::kFlag};
   info.prop_->Init(kwargs);
   info.executor_->initForward(*info.prop_, &info.in_type_);
   return info;
diff --git a/tests/cpp/include/test_op_runner.h b/tests/cpp/include/test_op_runner.h
index b46065bb5cdb..bf641ca24ba4 100644
--- a/tests/cpp/include/test_op_runner.h
+++ b/tests/cpp/include/test_op_runner.h
@@ -21,7 +21,7 @@
  * \file test_op_runner.h
  * \brief Run a generic operator
  * \author Chris Olivier
-*/
+ */
 #ifndef TEST_OP_RUNNER_H_
 #define TEST_OP_RUNNER_H_

@@ -39,10 +39,10 @@ namespace test {
  * \tparam OperatorExecutor Data container for forward and backward passes for some given
  *         data types
 */
-template
+template
 class OperatorRunner {
  public:
-  typedef typename OperatorExecutor::DataType DType;
+  typedef typename OperatorExecutor::DataType DType;

   OperatorRunner() {
 #ifdef NDEBUG
@@ -61,21 +61,20 @@ class OperatorRunner {
    * \param count Number of times to run in each direction
    * \return OpInfo object for further operator analysis
    */
-  test::op::OpInfo
-  RunGenericOperatorForward(
-      bool isGPU,
-      const mxnet::ShapeVector& inputShapes,
-      const std::vector > &kwargs,
-      const size_t count = 1) {
+  test::op::OpInfo RunGenericOperatorForward(
+      bool isGPU,
+      const mxnet::ShapeVector& inputShapes,
+      const std::vector >& kwargs,
+      const size_t count = 1) {
 #if MXNET_USE_CUDA
     if (isGPU && !test::unitTestsWithCuda) {
       LOG(INFO) << "GPU not found, running test as non-GPU";
     }
 #else
-    isGPU = false;
+    isGPU = false;
 #endif
     test::op::OpInfo info =
-        test::op::createOpAndInfoF(kwargs, isGPU, inputShapes);
+        test::op::createOpAndInfoF(kwargs, isGPU, inputShapes);
     info.executor_->initForward(*info.prop_, &info.in_type_);
     info.executor_->forward(count);
     return info;
@@ -88,8 +87,8 @@ class OperatorRunner {
    * \return OpInfo object for further operator analysis
    */
   test::op::OpInfo RunGenericOperatorBackward(
-      test::op::OpInfo *info,
-      const size_t count = 1) {
+      test::op::OpInfo* info,
+      const size_t count = 1) {
     CHECK(info->executor_->HasBackward());
     info->executor_->initBackward(*info->prop_, &info->in_type_);
     info->executor_->backward(count);
@@ -106,12 +105,12 @@ class OperatorRunner {
    * \return
    */
   test::op::OpInfo RunBidirectional(
-      bool isGPU,
-      const mxnet::ShapeVector& inputShapes,
-      const std::vector > &kwargs,
-      const size_t count = 1) {
+      bool isGPU,
+      const mxnet::ShapeVector& inputShapes,
+      const std::vector >& kwargs,
+      const size_t count = 1) {
     test::op::OpInfo info =
-        RunGenericOperatorForward(isGPU, inputShapes, kwargs, count);
+        RunGenericOperatorForward(isGPU, inputShapes, kwargs, count);
     if (info.executor_->HasBackward()) {
       return RunGenericOperatorBackward(&info, count);
     }
@@ -130,18 +129,18 @@ class OperatorRunner {
    * \param dim Data dimensions
    * \param count Number of times to run in each direction
    */
-  std::unordered_map
-  TimingTest(const std::string& label,
-             const bool isGPU,
-             const bool stochastic,
-             const test::op::kwargs_t& kwargs,
-             int dim = 0,
-             size_t count = 1,
-             const mxnet::ShapeVector& timing_shapes = {},
-             bool backward = true) {
+  std::unordered_map TimingTest(
+      const std::string& label,
+      const bool isGPU,
+      const bool stochastic,
+      const test::op::kwargs_t& kwargs,
+      int dim = 0,
+      size_t count = 1,
+      const mxnet::ShapeVector& timing_shapes = {},
+      bool backward = true) {
     if (mxnet::test::quick_test) {
      total_iterations_ = 
2; - count = 1; + count = 1; } test::perf::TimingInstrument timing; @@ -168,18 +167,18 @@ class OperatorRunner { for (size_t i = 0; i < total_iterations_; ++i) { index_t batchSize = 1; - index_t channels = 1; - index_t depth = 1; - index_t height = 1; - index_t width = 1; + index_t channels = 1; + index_t depth = 1; + index_t height = 1; + index_t width = 1; if (timing_shapes.empty()) { do { batchSize = stochastic ? test::rangedRand(1U, TEST_BATCH_SIZE * 2U) : TIMING_BATCH_SIZE; - channels = stochastic ? test::rangedRand(1U, TEST_CHANNELS * 2U) : TIMING_CHANNELS; - depth = stochastic ? test::rangedRand(1U, TEST_DEPTH * 2U) : TIMING_DEPTH; - height = stochastic ? test::rangedRand(1U, TEST_DH * 2U) : TIMING_DH; - width = stochastic ? test::rangedRand(1U, TEST_DW * 2U) : TIMING_DW; + channels = stochastic ? test::rangedRand(1U, TEST_CHANNELS * 2U) : TIMING_CHANNELS; + depth = stochastic ? test::rangedRand(1U, TEST_DEPTH * 2U) : TIMING_DEPTH; + height = stochastic ? test::rangedRand(1U, TEST_DH * 2U) : TIMING_DH; + width = stochastic ? test::rangedRand(1U, TEST_DW * 2U) : TIMING_DW; } while (stochastic && (height * width) == 1U); } else { dim = timing_shapes[0].ndim() - 1; @@ -190,37 +189,31 @@ class OperatorRunner { test::op::OpInfo info; switch (D) { case 0: - info = RunGenericOperatorForward(isGPU, - !timing_shapes.empty() - ? timing_shapes - : mxnet::ShapeVector({mxnet::TShape({batchSize, - channels, - width})}), - kwargs, - count); + info = RunGenericOperatorForward( + isGPU, + !timing_shapes.empty() ? + timing_shapes : + mxnet::ShapeVector({mxnet::TShape({batchSize, channels, width})}), + kwargs, + count); break; case 1: - info = RunGenericOperatorForward(isGPU, - !timing_shapes.empty() - ? timing_shapes - : mxnet::ShapeVector({ mxnet::TShape({batchSize, - channels, - height, - width})}), - kwargs, - count); + info = RunGenericOperatorForward( + isGPU, + !timing_shapes.empty() ? + timing_shapes : + mxnet::ShapeVector({mxnet::TShape({batchSize, channels, height, width})}), + kwargs, + count); break; case 2: - info = RunGenericOperatorForward(isGPU, - !timing_shapes.empty() - ? timing_shapes - : mxnet::ShapeVector({ mxnet::TShape({batchSize, - channels, - depth, - height, - width})}), - kwargs, - count); + info = RunGenericOperatorForward( + isGPU, + !timing_shapes.empty() ? 
+                timing_shapes :
+                mxnet::ShapeVector({mxnet::TShape({batchSize, channels, depth, height, width})}),
+            kwargs,
+            count);
          break;
        default:
          CHECK(false) << "Unsupported dimension count: " << (D + 1);
@@ -240,22 +233,26 @@
     return timing.data();
   }

-  void set_verbose(bool verbose) { verbose_ = verbose; }
+  void set_verbose(bool verbose) {
+    verbose_ = verbose;
+  }

-  void set_total_iterations(size_t iterations) { total_iterations_ = iterations; }
+  void set_total_iterations(size_t iterations) {
+    total_iterations_ = iterations;
+  }

  protected:
   static constexpr int TEST_BATCH_SIZE = 5;
-  static constexpr int TEST_CHANNELS = 3;
-  static constexpr int TEST_DEPTH = 2;
-  static constexpr int TEST_DH = 2;
-  static constexpr int TEST_DW = 3;
+  static constexpr int TEST_CHANNELS   = 3;
+  static constexpr int TEST_DEPTH      = 2;
+  static constexpr int TEST_DH         = 2;
+  static constexpr int TEST_DW         = 3;

   static constexpr int TIMING_BATCH_SIZE = 128;
-  static constexpr int TIMING_CHANNELS = 3;
-  static constexpr int TIMING_DEPTH = 2;
-  static constexpr int TIMING_DH = 64;
-  static constexpr int TIMING_DW = 64;
+  static constexpr int TIMING_CHANNELS   = 3;
+  static constexpr int TIMING_DEPTH      = 2;
+  static constexpr int TIMING_DH         = 64;
+  static constexpr int TIMING_DW         = 64;

   /*! \brief verbose output */
   bool verbose_ = true;
   /*! \brief Total iterations */
diff --git a/tests/cpp/include/test_perf.h b/tests/cpp/include/test_perf.h
index 2daee316da12..2f215b5f68ee 100644
--- a/tests/cpp/include/test_perf.h
+++ b/tests/cpp/include/test_perf.h
@@ -21,7 +21,7 @@
  * \file test_perf.h
  * \brief operator unit test utility functions
  * \author Chris Olivier
-*/
+ */
 #ifndef TEST_PERF_H_
 #define TEST_PERF_H_

@@ -83,35 +83,32 @@ inline uint64_t getNannoTickCount() {
 #endif
 }

-#define MICRO2MS(__micro$) (((__micro$) + 500)/1000)
-#define MICRO2MSF(__micro$) (static_cast(__micro$)/1000)
-#define MICRO2MSF(__micro$) (static_cast(__micro$)/1000)
-#define MS2MICRO(__ms$) ((__ms$) * 1000)
-#define NANO2MSF(__nano$) (static_cast(__nano$)/1000000)
-#define MICRO2S(__micro$) (((__micro$) + 500000)/1000000)
-#define MICRO2SF(__micro$) (MICRO2MSF(__micro$)/1000)
+#define MICRO2MS(__micro$) (((__micro$) + 500) / 1000)
+#define MICRO2MSF(__micro$) (static_cast(__micro$) / 1000)
+#define MICRO2MSF(__micro$) (static_cast(__micro$) / 1000)
+#define MS2MICRO(__ms$) ((__ms$)*1000)
+#define NANO2MSF(__nano$) (static_cast(__nano$) / 1000000)
+#define MICRO2S(__micro$) (((__micro$) + 500000) / 1000000)
+#define MICRO2SF(__micro$) (MICRO2MSF(__micro$) / 1000)

 /*! \brief Calculate time between construction and destruction */
 class TimedScope {
-  std::string label_;
-  uint64_t startTime_;
-  uint64_t stopTime_;
-  const size_t count_;
+  std::string label_;
+  uint64_t startTime_;
+  uint64_t stopTime_;
+  const size_t count_;

  public:
-  explicit inline TimedScope(const char *msg = nullptr, size_t count = 1, const bool start = true)
-    : startTime_(start ? getMicroTickCount() : 0)
-    , stopTime_(0)
-    , count_(count) {
+  explicit inline TimedScope(const char* msg = nullptr, size_t count = 1, const bool start = true)
+      : startTime_(start ? getMicroTickCount() : 0), stopTime_(0), count_(count) {
     CHECK_NE(count, 0U);
     if (msg && *msg) {
       label_ = msg;
     }
   }

-  explicit inline TimedScope(const std::string &msg, size_t count = 1, const bool start = true)
-    : startTime_(start ? getMicroTickCount() : 0)
-    , count_(count) {
+  explicit inline TimedScope(const std::string& msg, size_t count = 1, const bool start = true)
+      : startTime_(start ? 
getMicroTickCount() : 0), count_(count) { CHECK_NE(count, 0U); if (!msg.empty()) { label_ = msg; @@ -127,7 +124,8 @@ class TimedScope { } inline void stop() { - stopTime_ = getMicroTickCount();; + stopTime_ = getMicroTickCount(); + ; } inline float elapsedMilliseconds() const { @@ -145,8 +143,7 @@ class TimedScope { if (!label_.empty()) { ss << label_ << " "; } - ss << "elapsed time: " - << std::setprecision(4) << std::fixed << MICRO2MSF(diff) << " ms"; + ss << "elapsed time: " << std::setprecision(4) << std::fixed << MICRO2MSF(diff) << " ms"; if (count_ != 0 && count_ != 1) { const float microSecondsEach = static_cast(diff) / count_; ss << " ( " << MICRO2MSF(microSecondsEach) << " ms each )"; @@ -158,10 +155,8 @@ class TimedScope { /*! \brief Accumulate separate timing values mapped by label/id -> total time spent */ class TimingInstrument { public: - explicit TimingInstrument(const char *name = "") - : name_(name) { - } - void startTiming(int id, const char *s) { + explicit TimingInstrument(const char* name = "") : name_(name) {} + void startTiming(int id, const char* s) { std::unique_lock lk(mutex_); auto i = data_.find(id); if (i == data_.end()) { @@ -189,11 +184,11 @@ class TimingInstrument { std::unique_lock lk(mutex_); auto i = data_.find(id); if (i != data_.end()) { - const Info &info = i->second; - const uint64_t duration = info.nestingCount_.load() - ? info.duration_.load() + - (getMicroTickCount() - info.baseTime_.load()) - : info.duration_.load(); + const Info& info = i->second; + const uint64_t duration = + info.nestingCount_.load() ? + info.duration_.load() + (getMicroTickCount() - info.baseTime_.load()) : + info.duration_.load(); return duration; } return 0; @@ -206,21 +201,19 @@ class TimingInstrument { return false; } - template - void print(StreamType *os, const std::string &label_, bool doReset = false) { + template + void print(StreamType* os, const std::string& label_, bool doReset = false) { std::unique_lock lk(mutex_); // Sorted output std::map data(data_.begin(), data_.end()); - for (std::map::const_iterator i = data.begin(), e = data.end(); - i != e; ++i) { - const Info &info = i->second; + for (std::map::const_iterator i = data.begin(), e = data.end(); i != e; ++i) { + const Info& info = i->second; const uint64_t duration = getDuration(i->first); *os << label_ << ": " << name_ << " Timing [" << info.name_ << "] " - << (info.nestingCount_.load() ? "*" : "") - << MICRO2MSF(duration) << " ms"; + << (info.nestingCount_.load() ? 
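Aside: the MICRO2MS family of macros above converts microseconds to milliseconds with round-to-nearest rather than truncation, by adding half the divisor before the integer division. A minimal standalone sketch of the same idiom (plain C++, no mxnet headers; the function name is ours):

#include <cstdint>
#include <iostream>

// Round-to-nearest integer division: adding half the divisor before
// dividing rounds 1500 us up to 2 ms instead of truncating to 1 ms.
static inline uint64_t micro_to_ms(uint64_t micro) {
  return (micro + 500) / 1000;
}

int main() {
  std::cout << micro_to_ms(1499) << "\n";  // 1
  std::cout << micro_to_ms(1500) << "\n";  // 2 (plain division would give 1)
}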
"*" : "") << MICRO2MSF(duration) << " ms"; if (info.cycleCount_.load()) { - *os << ", avg: " << (MICRO2MSF(duration) / info.cycleCount_) - << " ms X " << info.cycleCount_ << " passes"; + *os << ", avg: " << (MICRO2MSF(duration) / info.cycleCount_) << " ms X " << info.cycleCount_ + << " passes"; } *os << std::endl; } @@ -232,9 +225,8 @@ class TimingInstrument { void reset() { std::unique_lock lk(mutex_); - for (auto i = data_.begin(), e = data_.end(); - i != e; ++i) { - const int id = i->first; + for (auto i = data_.begin(), e = data_.end(); i != e; ++i) { + const int id = i->first; const bool wasTiming = isTiming(id); if (wasTiming) { stopTiming(id); @@ -248,12 +240,11 @@ class TimingInstrument { } } - TimingInstrument &operator+=(const TimingInstrument &o) { - for (auto i = o.data_.begin(), e = o.data_.end(); - i != e; ++i) { + TimingInstrument& operator+=(const TimingInstrument& o) { + for (auto i = o.data_.begin(), e = o.data_.end(); i != e; ++i) { auto j = data_.find(i->first); if (j != data_.end()) { - const Info &oInfo = i->second; + const Info& oInfo = i->second; CHECK_EQ(oInfo.nestingCount_, 0U); j->second.duration_ += oInfo.duration_; j->second.cycleCount_ += oInfo.cycleCount_; @@ -265,23 +256,19 @@ class TimingInstrument { } struct Info { - explicit inline Info(const char *s) - : name_(s ? s : "") - , baseTime_(0) - , nestingCount_(0) - , cycleCount_(0) - , duration_(0) {} + explicit inline Info(const char* s) + : name_(s ? s : ""), baseTime_(0), nestingCount_(0), cycleCount_(0), duration_(0) {} inline Info(const Info& o) - : name_(o.name_) - , baseTime_(o.baseTime_.load()) - , nestingCount_(o.nestingCount_.load()) - , cycleCount_(o.cycleCount_.load()) - , duration_(o.duration_.load()) { + : name_(o.name_), + baseTime_(o.baseTime_.load()), + nestingCount_(o.nestingCount_.load()), + cycleCount_(o.cycleCount_.load()), + duration_(o.duration_.load()) { CHECK_EQ(o.nestingCount_, 0U); } - inline Info& operator = (const Info& o) { + inline Info& operator=(const Info& o) { name_ = o.name_; baseTime_.store(baseTime_.load()); nestingCount_.store(nestingCount_.load()); @@ -298,7 +285,7 @@ class TimingInstrument { return static_cast(duration_) / cycleCount_.load() / 1000.0f; } - std::string name_; + std::string name_; std::atomic baseTime_; std::atomic nestingCount_; std::atomic cycleCount_; // Note that nesting may skew averages @@ -307,7 +294,7 @@ class TimingInstrument { typedef std::unordered_map timing_map_t; - const timing_map_t &data() const { + const timing_map_t& data() const { return data_; } @@ -322,13 +309,11 @@ using timing_map_t = TimingInstrument::timing_map_t; /*! 
\brief Accumulated scoped timing, indexed by ID */ class TimingItem { public: - inline TimingItem(TimingInstrument *ti, + inline TimingItem(TimingInstrument* ti, int id, - const char *name, + const char* name, const size_t subIterationCount = 1) - : ti_(ti) - , id_(id) - , subIterationCount_(subIterationCount) { + : ti_(ti), id_(id), subIterationCount_(subIterationCount) { if (ti_) { ti_->startTiming(id, name); } @@ -340,12 +325,11 @@ class TimingItem { } private: - TimingInstrument *ti_; + TimingInstrument* ti_; const int id_; const size_t subIterationCount_; }; - } // namespace perf } // namespace test } // namespace mxnet diff --git a/tests/cpp/include/test_tune.h b/tests/cpp/include/test_tune.h index 9f5a2e04c54e..3b2310f68fa5 100644 --- a/tests/cpp/include/test_tune.h +++ b/tests/cpp/include/test_tune.h @@ -21,7 +21,7 @@ * \file test_tune.h * \brief operator tuning tester * \author Chris Olivier -*/ + */ #ifndef TEST_TUNE_H_ #define TEST_TUNE_H_ @@ -60,19 +60,19 @@ namespace tune { * trunk unless you've verified the performance characteristics for that chunk of code * \tparam DType Data type to test */ -template +template class TuningTester { public: using kwargs_t = test::op::kwargs_t; using bool_mode_pair = std::pair; - using shape_vect = mxnet::ShapeVector; + using shape_vect = mxnet::ShapeVector; using shape_vec_to_bool_map = std::map; private: using ShapesToPerfTimingMap = - std::map; + std::map; /*! * \brief Run timing test on various data shapes and sizes @@ -83,13 +83,13 @@ class TuningTester { * \return ShapesToPerfTimingMap map holsing timing data for shapes */ ShapesToPerfTimingMap RunCoreOpTimingTest(const bool isGPU, - const kwargs_t &op_kwargs, + const kwargs_t& op_kwargs, const std::vector& shapes, - const char *op_name, - const char *backward_op_name = "") { + const char* op_name, + const char* backward_op_name = "") { ShapesToPerfTimingMap res; - const kwargs_t kwargs = test::op::CoreOpExecutor::ArgsWithOpName( - op_kwargs, op_name, backward_op_name); + const kwargs_t kwargs = + test::op::CoreOpExecutor::ArgsWithOpName(op_kwargs, op_name, backward_op_name); // prime code and cache before the performance runs test::op::CoreOperatorRunner runner; @@ -98,11 +98,14 @@ class TuningTester { runner.RunBidirectional(false, {{10, 3, 18, 128}}, kwargs, 1); // Do the performance runs - const char *pu = isGPU ? "GPU" : "CPU"; - for (const mxnet::ShapeVector &this_run_shapes : shapes) { + const char* pu = isGPU ? "GPU" : "CPU"; + for (const mxnet::ShapeVector& this_run_shapes : shapes) { test::perf::timing_map_t tmap = runner.TimingTest(std::string(op_name) + " Operator " + pu, - isGPU, false, kwargs, - 0, calls_per_iteration_, + isGPU, + false, + kwargs, + 0, + calls_per_iteration_, this_run_shapes); CHECK(res.find(this_run_shapes) == res.end()); res[this_run_shapes] = tmap; @@ -110,9 +113,9 @@ class TuningTester { return res; } - using tuned_timing_t = std::map< - shape_vect, - std::map<::mxnet::op::tune::TuningMode, test::perf::timing_map_t>, test::less_shapevect>; + using tuned_timing_t = std::map, + test::less_shapevect>; using modesort_t = std::multimap; @@ -125,7 +128,7 @@ class TuningTester { * have made the correct decision, and the TuningMode which was closest in timing to * the Auto mode. 
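Aside: TimingItem above is the RAII half of the pair: constructing one starts a named timer and its destructor stops it, so a block is timed by a single local declaration. A stripped-down sketch of the same pattern with std::chrono (class and label names are ours):

#include <chrono>
#include <iostream>
#include <string>
#include <utility>

// Timing starts in the constructor and stops in the destructor, so the
// lifetime of the local object is exactly the span being measured.
class ScopedTimer {
  std::string label_;
  std::chrono::steady_clock::time_point start_;

 public:
  explicit ScopedTimer(std::string label)
      : label_(std::move(label)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedTimer() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_).count();
    std::cout << label_ << ": " << us << " us\n";
  }
};

int main() {
  ScopedTimer t("busy loop");
  volatile long sink = 0;
  for (long i = 0; i < 1000000; ++i) sink += i;
}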
*/ - static bool_mode_pair CheckCorrectTuning(const modesort_t &mode_sort, + static bool_mode_pair CheckCorrectTuning(const modesort_t& mode_sort, const double closeness_factor = 0.25) { CHECK_EQ(mode_sort.size(), 3U); @@ -145,9 +148,9 @@ class TuningTester { for (auto i = mode_sort.begin(), e = mode_sort.end(); i != e; ++i) { mode2time[i->second] = i->first; } - const double time_auto = mode2time[::mxnet::op::tune::kAuto]; + const double time_auto = mode2time[::mxnet::op::tune::kAuto]; const double time_no_omp = mode2time[::mxnet::op::tune::kNeverOMP]; - const double time_omp = mode2time[::mxnet::op::tune::kAlwaysOMP]; + const double time_omp = mode2time[::mxnet::op::tune::kAlwaysOMP]; // Figure out which one we are closest to and return that to help in the analysis ::mxnet::op::tune::TuningMode closest_to; @@ -160,11 +163,10 @@ class TuningTester { // If difference between OMP and no OMP is < closeness_factor of largest of the two, // then we just want to make sure we are close to both of these const double fastest_standard_time = std::min(time_no_omp, time_omp); - const double allowed_difference = closeness_factor * fastest_standard_time; - const double mustbe_asfast = fastest_standard_time + allowed_difference; + const double allowed_difference = closeness_factor * fastest_standard_time; + const double mustbe_asfast = fastest_standard_time + allowed_difference; - return { time_auto <= mustbe_asfast || closest_to == fastest_standard_mode, - closest_to }; + return {time_auto <= mustbe_asfast || closest_to == fastest_standard_mode, closest_to}; } public: @@ -183,38 +185,37 @@ class TuningTester { } shape_vec_to_bool_map results; // Incredibly inefficient method of grouping the results - for (const auto &i : timing_) { + for (const auto& i : timing_) { // print shapes - const shape_vect &shapes = i.first; + const shape_vect& shapes = i.first; if (verbose || test::csv) { if (!test::csv) { for (size_t x = 0, n = shapes.size(); x < n; ++x) { - const mxnet::TShape &shape = shapes[x]; + const mxnet::TShape& shape = shapes[x]; if (x) { std::cout << ", "; } std::cout << shape; } - const mxnet::TShape &lhs_shape = shapes[0]; + const mxnet::TShape& lhs_shape = shapes[0]; std::cout << " lhs=" << test::pretty_num(lhs_shape.Size()) << " items"; std::cout << "\t(" << TimingDirectionAsString(direction) << ")" << std::endl; } else { std::cout << test::pretty_num(shapes[0].Size()) << ","; } } - const auto &mode2timing = i.second; + const auto& mode2timing = i.second; modesort_t mode_sort; - for (const auto &j : mode2timing) { + for (const auto& j : mode2timing) { const ::mxnet::op::tune::TuningMode mode = j.first; - const test::perf::timing_map_t &tm = j.second; + const test::perf::timing_map_t& tm = j.second; if (tm.find(direction) != tm.end()) { - const test::perf::TimingInstrument::Info &info = tm.find(direction)->second; - double duration = info.TimeEach(); + const test::perf::TimingInstrument::Info& info = tm.find(direction)->second; + double duration = info.TimeEach(); mode_sort.insert({duration, mode}); if (test::csv) { std::cout << TimingDirectionAsString(direction) << "," - << ::mxnet::op::tune::TuningModeToString(mode) << "," - << duration << ","; + << ::mxnet::op::tune::TuningModeToString(mode) << "," << duration << ","; } } } @@ -225,9 +226,9 @@ class TuningTester { // Now we have modes sorted by performance, fastest to slowest const bool_mode_pair result = CheckCorrectTuning(mode_sort); if (verbose && !test::csv) { - for (const auto &k : mode_sort) { - std::cout << "\t" << 
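Aside: the acceptance rule in CheckCorrectTuning above reduces to simple arithmetic: Auto passes if its time is within closeness_factor of the faster of the two fixed modes. A worked sketch with made-up timings:

#include <algorithm>
#include <iostream>

// Times in milliseconds; the values are invented for illustration.
int main() {
  const double time_auto = 1.15, time_no_omp = 1.1, time_omp = 2.3;
  const double closeness_factor = 0.25;
  const double fastest_standard = std::min(time_no_omp, time_omp);          // 1.1
  const double must_be_as_fast = fastest_standard * (1.0 + closeness_factor);  // 1.375
  std::cout << (time_auto <= must_be_as_fast ? "tuning OK" : "tuning suspect") << "\n";
}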
::mxnet::op::tune::TuningModeToString(k.second) - << ": " << k.first << " ms"; + for (const auto& k : mode_sort) { + std::cout << "\t" << ::mxnet::op::tune::TuningModeToString(k.second) << ": " << k.first + << " ms"; if (k.second == ::mxnet::op::tune::kAuto) { std::cout << " (" << ::mxnet::op::tune::TuningModeToString(result.second) << ")"; } @@ -251,34 +252,31 @@ class TuningTester { * \param op_name Name by which the operator is registered with nnvm * \param backward_op_name Backward operator name */ - void TestTunedOperator(const kwargs_t &kwargs, + void TestTunedOperator(const kwargs_t& kwargs, const bool verbose, const std::vector& shapevec_vectors, - const char *op_name, - const char *backward_op_name = COREOP_BWD_OP_NAME_VALUE_NONE) { + const char* op_name, + const char* backward_op_name = COREOP_BWD_OP_NAME_VALUE_NONE) { timing_.clear(); using namespace mxnet::op; tuned_timing_t timing; for (int x = 0; x < 1; ++x) { for (auto mode : {::mxnet::op::tune::kNeverOMP, ::mxnet::op::tune::kAuto, - ::mxnet::op::tune::kAlwaysOMP - }) { + ::mxnet::op::tune::kAlwaysOMP}) { if (verbose && !test::csv) { - std::cout << std::endl << ::mxnet::op::tune::TuningModeToString(mode) - << std::endl << std::flush; + std::cout << std::endl + << ::mxnet::op::tune::TuningModeToString(mode) << std::endl + << std::flush; } mxnet::op::OperatorTune::set_tuning_mode(mode); - const ShapesToPerfTimingMap shapes2perfmap = RunCoreOpTimingTest(false, - kwargs, - shapevec_vectors, - op_name, - backward_op_name); - for (const auto &item : shapes2perfmap) { - const shape_vect &shapes = item.first; - const test::perf::timing_map_t &tm = item.second; - timing_[shapes][mode] = tm; + const ShapesToPerfTimingMap shapes2perfmap = + RunCoreOpTimingTest(false, kwargs, shapevec_vectors, op_name, backward_op_name); + for (const auto& item : shapes2perfmap) { + const shape_vect& shapes = item.first; + const test::perf::timing_map_t& tm = item.second; + timing_[shapes][mode] = tm; } } } @@ -292,14 +290,14 @@ class TuningTester { * \return Success rate ratio (#success/#TOTAL) (0.0-1.0) */ float CalculateSuccessRate(std::vector directions = {}, - bool verbose = true) const { + bool verbose = true) const { size_t count = 0, success = 0; if (directions.empty()) { directions = {test::op::kForward, test::op::kBackward}; } for (const test::op::TimingDirection direction : directions) { typename test::tune::TuningTester::shape_vec_to_bool_map res_fwd = - CalculateModeSort(direction, verbose); + CalculateModeSort(direction, verbose); for (auto iter = res_fwd.begin(), e = res_fwd.end(); iter != e; ++iter) { ++count; if (iter->second.first) { @@ -319,16 +317,20 @@ class TuningTester { size_t calls_per_iteration(size_t calls_per_iterations) const { return calls_per_iteration_; } - void set_total_iterations(size_t iterations) { total_iterations_ = iterations; } - size_t total_iterations(size_t iterations) const { return total_iterations_; } + void set_total_iterations(size_t iterations) { + total_iterations_ = iterations; + } + size_t total_iterations(size_t iterations) const { + return total_iterations_; + } private: /*! \brief Number of iterations */ - size_t total_iterations_ = 10; + size_t total_iterations_ = 10; /*! \brief Calls per iteration */ - size_t calls_per_iteration_ = 50; + size_t calls_per_iteration_ = 50; /*! 
\brief Raw timing data */ - tuned_timing_t timing_; + tuned_timing_t timing_; }; } // namespace tune diff --git a/tests/cpp/include/test_util.h b/tests/cpp/include/test_util.h index 8e270834bbcc..9b495388955c 100644 --- a/tests/cpp/include/test_util.h +++ b/tests/cpp/include/test_util.h @@ -21,7 +21,7 @@ * \file test_util.h * \brief unit test performance analysis functions * \author Chris Olivier -*/ + */ #ifndef TEST_UTIL_H_ #define TEST_UTIL_H_ @@ -49,7 +49,7 @@ extern bool performance_run; extern bool csv; extern bool thread_safety_force_cpu; -template +template inline size_t shapeMemorySize(const mxnet::TShape& shape) { return shape.Size() * sizeof(DType); } @@ -62,11 +62,11 @@ class BlobMemory { inline ~BlobMemory() { Free(); } - void *Alloc(const size_t size) { + void* Alloc(const size_t size) { CHECK_GT(size, 0U); // You've probably made a mistake mxnet::Context context = isGPU_ ? mxnet::Context::GPU(0) : mxnet::Context{}; - Storage *storage = mxnet::Storage::Get(); - handle_ = storage->Alloc(size, context); + Storage* storage = mxnet::Storage::Get(); + handle_ = storage->Alloc(size, context); return handle_.dptr; } void Free() { @@ -79,17 +79,17 @@ class BlobMemory { } private: - const bool isGPU_; + const bool isGPU_; Storage::Handle handle_; }; class StandaloneBlob : public TBlob { public: inline StandaloneBlob(const mxnet::TShape& shape, const bool isGPU, const int dtype) - : TBlob(nullptr, shape, isGPU ? gpu::kDevMask : cpu::kDevMask, dtype) - , memory_(std::make_shared(isGPU)) { - MSHADOW_TYPE_SWITCH(dtype, DType, { - this->dptr_ = memory_->Alloc(shapeMemorySize(shape)); }); + : TBlob(nullptr, shape, isGPU ? gpu::kDevMask : cpu::kDevMask, dtype), + memory_(std::make_shared(isGPU)) { + MSHADOW_TYPE_SWITCH( + dtype, DType, { this->dptr_ = memory_->Alloc(shapeMemorySize(shape)); }); } inline ~StandaloneBlob() { this->dptr_ = nullptr; @@ -100,7 +100,7 @@ class StandaloneBlob : public TBlob { private: /*! \brief Locally allocated memory block for this blob */ - std::shared_ptr memory_; + std::shared_ptr memory_; }; /*! 
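Aside: shapeMemorySize<DType>() above is just "element count times element size". A standalone equivalent with std::vector standing in for mxnet::TShape:

#include <cstddef>
#include <iostream>
#include <vector>

// The byte footprint of a dense tensor is the product of its extents
// multiplied by the size of one element.
template <typename DType>
size_t shape_memory_size(const std::vector<size_t>& shape) {
  size_t elems = 1;
  for (size_t d : shape) elems *= d;
  return elems * sizeof(DType);
}

int main() {
  std::cout << shape_memory_size<float>({128, 3, 64, 64}) << " bytes\n";  // 6291456
}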
@@ -111,16 +111,14 @@ class StandaloneBlob : public TBlob { class CAccessAsCPU { public: CAccessAsCPU(const RunContext& run_ctx, const TBlob& src, bool copy_back_result = true) - : run_ctx_(run_ctx) - , src_(src) - , copy_back_result_(copy_back_result) { + : run_ctx_(run_ctx), src_(src), copy_back_result_(copy_back_result) { #if MXNET_USE_CUDA if (run_ctx.ctx.dev_type == Context::kCPU) { blob_ = src; } else { Context cpu_ctx, gpu_ctx = run_ctx.ctx; cpu_ctx.dev_type = Context::kCPU; - cpu_ctx.dev_id = 0; + cpu_ctx.dev_id = 0; NDArray on_cpu(src.shape_, cpu_ctx, false, src_.type_flag_); on_cpu.CheckAndAlloc(); blob_ = on_cpu.data(); @@ -140,7 +138,7 @@ class CAccessAsCPU { if (run_ctx_.ctx.dev_type == Context::kGPU) { Context cpu_ctx, gpu_ctx = run_ctx_.ctx; cpu_ctx.dev_type = Context::kCPU; - cpu_ctx.dev_id = 0; + cpu_ctx.dev_id = 0; run_ctx_.get_stream()->Wait(); mxnet::ndarray::Copy(blob_, &src_, gpu_ctx, cpu_ctx, run_ctx_); run_ctx_.get_stream()->Wait(); @@ -148,7 +146,7 @@ class CAccessAsCPU { } #endif } - inline const TBlob& operator ()() const { + inline const TBlob& operator()() const { return blob_; } @@ -168,16 +166,14 @@ class CAccessAsCPU { * \param cb Callback Function to call with CPU-data NDArray */ template -inline void AccessAsCPU(const NDArray &src, - const RunContext &run_ctx, - CallbackFunction cb) { +inline void AccessAsCPU(const NDArray& src, const RunContext& run_ctx, CallbackFunction cb) { #if MXNET_USE_CUDA if (src.ctx().dev_type == Context::kCPU) { cb(src); } else { Context cpu_ctx, gpu_ctx = src.ctx(); cpu_ctx.dev_type = Context::kCPU; - cpu_ctx.dev_id = 0; + cpu_ctx.dev_id = 0; NDArray on_cpu(src.shape(), cpu_ctx, false, src.dtype()); on_cpu.CheckAndAlloc(); TBlob tmp1 = on_cpu.data(); @@ -202,9 +198,7 @@ inline void AccessAsCPU(const NDArray &src, * \param cb Callback Function to call with CPU-data TBlob */ template -inline void AccessAsCPU(const TBlob& src, - const RunContext &run_ctx, - CallbackFunction cb) { +inline void AccessAsCPU(const TBlob& src, const RunContext& run_ctx, CallbackFunction cb) { #if MXNET_USE_CUDA if (run_ctx.ctx.dev_type == Context::kCPU) { cb(src); @@ -217,11 +211,11 @@ inline void AccessAsCPU(const TBlob& src, } constexpr const size_t MPRINT_PRECISION = 5; -template -inline void fill(const RunContext &run_ctx, const TBlob& _blob, const DType val) { +template +inline void fill(const RunContext& run_ctx, const TBlob& _blob, const DType val) { AccessAsCPU(_blob, run_ctx, [val](const TBlob& blob) { MSHADOW_TYPE_SWITCH(blob.type_flag_, DTypeX, { - DTypeX *p1 = blob.dptr(); + DTypeX* p1 = blob.dptr(); for (size_t i = 0, n = blob.Size(); i < n; ++i) { *p1++ = val; } @@ -229,16 +223,16 @@ inline void fill(const RunContext &run_ctx, const TBlob& _blob, const DType val) }); } -template -inline void try_fill(const RunContext &run_ctx, const TBlob *blob, const DType val) { +template +inline void try_fill(const RunContext& run_ctx, const TBlob* blob, const DType val) { if (blob) { fill(run_ctx, *blob, val); } } -template -inline void dump(Stream *os, const TBlob& blob, const char *suffix = "f") { - DType *p1 = blob.dptr(); +template +inline void dump(Stream* os, const TBlob& blob, const char* suffix = "f") { + DType* p1 = blob.dptr(); for (size_t i = 0, n = blob.Size(); i < n; ++i) { if (i) { *os << ", "; @@ -257,7 +251,6 @@ inline void dump(Stream *os, const TBlob& blob, const char *suffix = "f") { } } - /*! 
\brief Return reference to data at position indexes */ inline index_t getMult(const mxnet::TShape& shape, const index_t axis) { return axis < shape.ndim() ? shape[axis] : 1; @@ -279,18 +272,19 @@ inline index_t offset(const mxnet::TShape& shape, const std::vector& ind } /*! \brief Return reference to data at position indexes */ -template -inline const DType& data_at(const TBlob *blob, const std::vector& indices) { +template +inline const DType& data_at(const TBlob* blob, const std::vector& indices) { return blob->dptr()[offset(blob->shape_, indices)]; } /*! \brief Set data at position indexes */ -template -inline DType& data_ref(const TBlob *blob, const std::vector& indices) { +template +inline DType& data_ref(const TBlob* blob, const std::vector& indices) { return blob->dptr()[offset(blob->shape_, indices)]; } -inline std::string repeatedStr(const char *s, const signed int count, +inline std::string repeatedStr(const char* s, + const signed int count, const bool trailSpace = false) { if (count <= 0) { return std::string(); @@ -311,9 +305,11 @@ inline std::string repeatedStr(const char *s, const signed int count, } /*! \brief Pretty print a shape with optional label */ -template -inline StreamType& print_shape(StreamType *_os, const std::string& label, - const mxnet::TShape& shape, const bool add_endl = true) { +template +inline StreamType& print_shape(StreamType* _os, + const std::string& label, + const mxnet::TShape& shape, + const bool add_endl = true) { if (!label.empty()) { *_os << label << ": "; } @@ -334,21 +330,21 @@ inline StreamType& print_shape(StreamType *_os, const std::string& label, } /*! \brief Pretty print a 1D, 2D, or 3D blob */ -template +template inline StreamType& print_blob_(const RunContext& ctx, - StreamType *_os, - const TBlob &blob, + StreamType* _os, + const TBlob& blob, const bool doChannels = true, - const bool doBatches = true, - const bool add_endl = true) { + const bool doBatches = true, + const bool add_endl = true) { #if MXNET_USE_CUDA if (blob.dev_mask() == gpu::kDevMask) { - return print_blob_(ctx, _os, CAccessAsCPU(ctx, blob, false)(), doChannels, - doBatches, add_endl); + return print_blob_( + ctx, _os, CAccessAsCPU(ctx, blob, false)(), doChannels, doBatches, add_endl); } #endif // MXNET_USE_CUDA - StreamType &os = *_os; + StreamType& os = *_os; const size_t dim = static_cast(blob.ndim()); if (dim == 1) { @@ -372,9 +368,9 @@ inline StreamType& print_blob_(const RunContext& ctx, const size_t batchSize = blob.size(0); size_t channels = 1; - size_t depth = 1; - size_t height = 1; - size_t width = 1; + size_t depth = 1; + size_t height = 1; + size_t width = 1; if (dim > 1) { channels = blob.size(1); if (dim > 2) { @@ -382,7 +378,7 @@ inline StreamType& print_blob_(const RunContext& ctx, width = blob.size(2); } else if (dim == 4) { height = blob.size(2); - width = blob.size(3); + width = blob.size(3); } else { depth = blob.size(2); if (dim > 3) { @@ -434,8 +430,8 @@ inline StreamType& print_blob_(const RunContext& ctx, break; } os << repeatedStr("(", dd); - os << std::fixed << std::setw(7) << std::setprecision(MPRINT_PRECISION) - << std::right << val << " "; + os << std::fixed << std::setw(7) << std::setprecision(MPRINT_PRECISION) << std::right + << val << " "; os << repeatedStr(")", dd, true); } } @@ -447,7 +443,8 @@ inline StreamType& print_blob_(const RunContext& ctx, if (!doBatches) { break; } else { - os << " |" << std::flush;; + os << " |" << std::flush; + ; } } if (r < height - 1) { @@ -468,34 +465,38 @@ inline StreamType& print_blob_(const 
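Aside: offset() above flattens an N-dimensional index into a row-major position. The rule is Horner-style: multiply the running offset by each dimension's extent, then add that dimension's index. A self-contained version (names are ours):

#include <cstddef>
#include <iostream>
#include <vector>

// Row-major flattening: walk the indices left to right, scaling by each
// extent before adding the next coordinate.
size_t row_major_offset(const std::vector<size_t>& shape,
                        const std::vector<size_t>& idx) {
  size_t off = 0;
  for (size_t d = 0; d < idx.size(); ++d) {
    off = off * shape[d] + idx[d];
  }
  return off;
}

int main() {
  // Element (1, 2, 3) of a 2 x 4 x 5 tensor -> 1*20 + 2*5 + 3 = 33.
  std::cout << row_major_offset({2, 4, 5}, {1, 2, 3}) << "\n";
}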
RunContext& ctx, return os; } -template +template inline StreamType& print(const RunContext& ctx, - StreamType *_os, - const TBlob &blob, + StreamType* _os, + const TBlob& blob, const bool doChannels = true, - const bool doBatches = true, - const bool add_endl = true) { + const bool doBatches = true, + const bool add_endl = true) { MSHADOW_TYPE_SWITCH(blob.type_flag_, DType, { print_blob_(ctx, _os, blob, doChannels, doBatches, add_endl); }); return *_os; } -template -inline StreamType& print(const RunContext& ctx, StreamType *_os, const std::string &label, - const TBlob &blob, +template +inline StreamType& print(const RunContext& ctx, + StreamType* _os, + const std::string& label, + const TBlob& blob, const bool doChannels = true, - bool doBatches = true, - const bool add_endl = true) { + bool doBatches = true, + const bool add_endl = true) { if (!label.empty()) { *_os << label << ": "; } return print(ctx, _os, blob, doChannels, doBatches, add_endl); } -template -inline StreamType& print(const RunContext& ctx, StreamType *_os, - const std::string& label, const NDArray& arr) { +template +inline StreamType& print(const RunContext& ctx, + StreamType* _os, + const std::string& label, + const NDArray& arr) { if (!label.empty()) { *_os << label << ": "; } @@ -505,7 +506,7 @@ inline StreamType& print(const RunContext& ctx, StreamType *_os, const mxnet::TShape& shape = arr.shape(); print_shape(_os, "[row_sparse] main shape", shape, false); const mxnet::TShape& storage_shape = arr.storage_shape(); - const bool is_one_row = storage_shape[0] < 2; + const bool is_one_row = storage_shape[0] < 2; print_shape(_os, "storage shape", storage_shape, false); print(ctx, _os, arr.data(), true, true, !is_one_row); @@ -520,7 +521,7 @@ inline StreamType& print(const RunContext& ctx, StreamType *_os, const mxnet::TShape& shape = arr.shape(); print_shape(_os, "[CSR] main shape", shape, false); const mxnet::TShape& storage_shape = arr.storage_shape(); - const bool is_one_row = storage_shape[0] < 2; + const bool is_one_row = storage_shape[0] < 2; print_shape(_os, "storage shape", storage_shape, false); print(ctx, _os, arr.data(), true, true, !is_one_row); @@ -539,7 +540,7 @@ inline StreamType& print(const RunContext& ctx, StreamType *_os, case kDefaultStorage: { // data const mxnet::TShape& shape = arr.shape(); - const bool is_one_row = shape[0] < 2; + const bool is_one_row = shape[0] < 2; print_shape(_os, "[dense] main shape", shape, !is_one_row); print(ctx, _os, arr.data(), true, true, !is_one_row) << std::endl; break; @@ -575,26 +576,30 @@ inline void print(const RunContext& ctx, } } -inline std::string demangle(const char *name) { +inline std::string demangle(const char* name) { #if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION) int status = -4; // some arbitrary value to eliminate the compiler warning - std::unique_ptr res { - abi::__cxa_demangle(name, nullptr, nullptr, &status), - &std::free - }; + std::unique_ptr res{abi::__cxa_demangle(name, nullptr, nullptr, &status), + &std::free}; return status ? 
name : res.get(); #else return name; #endif } -template -inline std::string type_name() { return demangle(typeid(T).name()); } +template +inline std::string type_name() { + return demangle(typeid(T).name()); +} -#define PRINT_NDARRAYS(__ctx$, __var) test::print(__ctx$, __FUNCTION__, #__var, __var) -#define PRINT_OP_AND_ARRAYS(__ctx$, __op, __var) test::print(__ctx$, __FUNCTION__, \ - static_cast(&(std::stringstream() << #__var << \ - "<" << type_name<__op>() << ">"))->str(), __var) +#define PRINT_NDARRAYS(__ctx$, __var) test::print(__ctx$, __FUNCTION__, #__var, __var) +#define PRINT_OP_AND_ARRAYS(__ctx$, __op, __var) \ + test::print(__ctx$, \ + __FUNCTION__, \ + static_cast( \ + &(std::stringstream() << #__var << "<" << type_name<__op>() << ">")) \ + ->str(), \ + __var) #define PRINT_OP2_AND_ARRAYS(__ctx$, __op1, __op2, __var) test::print(__ctx$, __FUNCTION__, \ static_cast(&(std::stringstream() << #__var << \ "<" << type_name<__op1>().name()) << ", " \ @@ -606,18 +611,18 @@ inline std::string type_name() { return demangle(typeid(T).name()); } * 2D: batch item -> channel -> row -> col * 3D: batch item -> channel -> col */ -template +template static inline void patternFill(const RunContext& run_ctx, - const TBlob *_blob, + const TBlob* _blob, GetNextData getNextData) { AccessAsCPU(*_blob, run_ctx, [getNextData](const TBlob& blob) { const size_t dim = static_cast(blob.ndim()); CHECK_LE(dim, 5U) << "Will need to handle above 3 dimensions (another for loop)"; - const size_t num = blob.size(0); - const size_t channels = dim > 1 ? blob.size(1) : 1; - const size_t depth = dim > 2 ? blob.size(2) : 1; - const size_t height = dim > 3 ? blob.size(3) : 1; - const size_t width = dim > 4 ? blob.size(4) : 1; + const size_t num = blob.size(0); + const size_t channels = dim > 1 ? blob.size(1) : 1; + const size_t depth = dim > 2 ? blob.size(2) : 1; + const size_t height = dim > 3 ? blob.size(3) : 1; + const size_t width = dim > 4 ? 
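Aside: demangle()/type_name() above rely on abi::__cxa_demangle, which is specific to the GCC/Clang C++ runtimes (hence the __GLIBCXX__/_LIBCPP_VERSION guard). A minimal usage sketch (the wrapper name is ours):

#include <cxxabi.h>

#include <cstdlib>
#include <iostream>
#include <memory>
#include <string>
#include <typeinfo>
#include <vector>

// typeid(T).name() yields a mangled string on GCC/Clang;
// abi::__cxa_demangle recovers the source-level spelling.
template <typename T>
std::string pretty_type_name() {
  int status = -4;
  std::unique_ptr<char, void (*)(void*)> res{
      abi::__cxa_demangle(typeid(T).name(), nullptr, nullptr, &status), &std::free};
  return status == 0 ? std::string(res.get()) : std::string(typeid(T).name());
}

int main() {
  std::cout << pretty_type_name<std::vector<float>>() << "\n";
}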
blob.size(4) : 1; const size_t numberOfIndexes = blob.shape_.Size(); for (size_t n = 0; n < num; ++n) { if (dim > 1) { @@ -632,8 +637,8 @@ static inline void patternFill(const RunContext& run_ctx, const size_t idx = test::offset(blob.shape_, {n, ch, d, row, col}); CHECK_LT(idx, numberOfIndexes); MSHADOW_TYPE_SWITCH(blob.type_flag_, ThisDataType, { - ThisDataType &f = blob.dptr()[idx]; - f = getNextData(); + ThisDataType& f = blob.dptr()[idx]; + f = getNextData(); }); } else { CHECK(dim <= 5) << "Unimplemented dimension: " << dim; @@ -643,8 +648,8 @@ static inline void patternFill(const RunContext& run_ctx, const size_t idx = test::offset(blob.shape_, {n, ch, d, row}); CHECK_LT(idx, numberOfIndexes); MSHADOW_TYPE_SWITCH(blob.type_flag_, ThisDataType, { - ThisDataType &f = blob.dptr()[idx]; - f = getNextData(); + ThisDataType& f = blob.dptr()[idx]; + f = getNextData(); }); } } @@ -652,8 +657,8 @@ static inline void patternFill(const RunContext& run_ctx, const size_t idx = test::offset(blob.shape_, {n, ch, d}); CHECK_LT(idx, numberOfIndexes); MSHADOW_TYPE_SWITCH(blob.type_flag_, ThisDataType, { - ThisDataType &f = blob.dptr()[idx]; - f = getNextData(); + ThisDataType& f = blob.dptr()[idx]; + f = getNextData(); }); } } @@ -661,8 +666,8 @@ static inline void patternFill(const RunContext& run_ctx, const size_t idx = test::offset(blob.shape_, {n, ch}); CHECK_LT(idx, numberOfIndexes); MSHADOW_TYPE_SWITCH(blob.type_flag_, ThisDataType, { - ThisDataType &f = blob.dptr()[idx]; - f = getNextData(); + ThisDataType& f = blob.dptr()[idx]; + f = getNextData(); }); } } @@ -670,8 +675,8 @@ static inline void patternFill(const RunContext& run_ctx, const size_t idx = test::offset(blob.shape_, {n}); CHECK_LT(idx, numberOfIndexes); MSHADOW_TYPE_SWITCH(blob.type_flag_, ThisDataType, { - ThisDataType &f = blob.dptr()[idx]; - f = getNextData(); + ThisDataType& f = blob.dptr()[idx]; + f = getNextData(); }); } } @@ -679,12 +684,10 @@ static inline void patternFill(const RunContext& run_ctx, } /*! \brief Return a random number within a given range (inclusive) */ -template +template inline ScalarType rangedRand(const ScalarType min, const ScalarType max) { - uint64_t num_bins = static_cast(max + 1), - num_rand = static_cast(RAND_MAX), - bin_size = num_rand / num_bins, - defect = num_rand % num_bins; + uint64_t num_bins = static_cast(max + 1), num_rand = static_cast(RAND_MAX), + bin_size = num_rand / num_bins, defect = num_rand % num_bins; ScalarType x; do { x = std::rand(); @@ -700,7 +703,7 @@ inline ScalarType rangedRand(const ScalarType min, const ScalarType max) { * \param s2 Second shape * \return true if s1 is less than s2 */ -inline bool operator < (const mxnet::TShape &s1, const mxnet::TShape &s2) { +inline bool operator<(const mxnet::TShape& s1, const mxnet::TShape& s2) { if (s1.Size() == s2.Size()) { if (s1.ndim() == s2.ndim()) { for (size_t i = 0, n = s1.ndim(); i < n; ++i) { @@ -723,8 +726,7 @@ inline bool operator < (const mxnet::TShape &s1, const mxnet::TShape &s2) { * \param v2 Second vector of shapes * \return true if v1 is less than v2 */ -inline bool operator < (const std::vector& v1, - const std::vector& v2) { +inline bool operator<(const std::vector& v1, const std::vector& v2) { if (v1.size() == v2.size()) { for (size_t i = 0, n = v1.size(); i < n; ++i) { if (v1[i] == v2[i]) { @@ -774,25 +776,23 @@ inline std::string pretty_num(uint64_t val) { } /*! 
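Aside: rangedRand() above avoids the classic modulo bias: taking rand() % n makes low values slightly more likely whenever the generator's range is not a multiple of n, so draws that fall in the uneven tail are rejected and retried. The same idea with std::mt19937 (function name is ours):

#include <cstdint>
#include <iostream>
#include <random>

// Rejection sampling: discard raw draws from the tail that would make some
// buckets one draw "fatter" than others, instead of taking a biased modulo.
uint32_t uniform_in_range(std::mt19937* gen, uint32_t min, uint32_t max) {
  const uint64_t num_bins = static_cast<uint64_t>(max - min) + 1;
  const uint64_t num_rand = static_cast<uint64_t>(std::mt19937::max()) + 1;
  const uint64_t bin_size = num_rand / num_bins;
  const uint64_t limit = bin_size * num_bins;  // draws >= limit are rejected
  uint64_t x;
  do {
    x = (*gen)();
  } while (x >= limit);
  return min + static_cast<uint32_t>(x / bin_size);
}

int main() {
  std::mt19937 gen(1);
  for (int i = 0; i < 5; ++i)
    std::cout << uniform_in_range(&gen, 1, 6) << " ";  // fair die rolls
  std::cout << "\n";
}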
\brief Change a value during the scope of this declaration */ -template +template struct ScopeSet { - inline ScopeSet(T *var, const T tempValue) - : var_(*var) - , saveValue_(var) { + inline ScopeSet(T* var, const T tempValue) : var_(*var), saveValue_(var) { *var = tempValue; } inline ~ScopeSet() { var_ = saveValue_; } T& var_; - T saveValue_; + T saveValue_; }; - -static void AssertEqual(const std::vector &in_arrs, - const std::vector &out_arrs, - float rtol = 1e-5, float atol = 1e-8, - bool test_first_only = false) { +static void AssertEqual(const std::vector& in_arrs, + const std::vector& out_arrs, + float rtol = 1e-5, + float atol = 1e-8, + bool test_first_only = false) { for (size_t j = 0; j < in_arrs.size(); ++j) { // When test_all is fir if (test_first_only && j == 1) { @@ -811,12 +811,10 @@ static void AssertEqual(const std::vector &in_arrs, tmp2 = tmp2.Reorder2Default(); #endif EXPECT_EQ(tmp1.shape().Size(), tmp2.shape().Size()); - TBlob blob1 = tmp1.data(); - TBlob blob2 = tmp2.data(); - mshadow::default_real_t *d1 = - static_cast(blob1.dptr_); - mshadow::default_real_t *d2 = - static_cast(blob2.dptr_); + TBlob blob1 = tmp1.data(); + TBlob blob2 = tmp2.data(); + mshadow::default_real_t* d1 = static_cast(blob1.dptr_); + mshadow::default_real_t* d2 = static_cast(blob2.dptr_); for (int i = 0; i < tmp1.shape().Size(); i++) { float abs_err = fabs((d1[i]) - (d2[i])); ASSERT_LE(abs_err, (atol + rtol * fabs(d2[i]))) @@ -825,8 +823,6 @@ static void AssertEqual(const std::vector &in_arrs, } } - - } // namespace test } // namespace mxnet @@ -836,7 +832,7 @@ inline void usleep(__int64 usec) { LARGE_INTEGER ft; // Convert to 100 nanosecond interval, negative value indicates relative time - ft.QuadPart = -(10*usec); + ft.QuadPart = -(10 * usec); timer = CreateWaitableTimer(NULL, TRUE, NULL); SetWaitableTimer(timer, &ft, 0, NULL, NULL, 0); diff --git a/tests/cpp/kvstore/gpu_topology_test.cc b/tests/cpp/kvstore/gpu_topology_test.cc index d26894c21ea7..49c32502cc44 100644 --- a/tests/cpp/kvstore/gpu_topology_test.cc +++ b/tests/cpp/kvstore/gpu_topology_test.cc @@ -21,7 +21,7 @@ * Copyright (c) 2018 by Contributors * \file gpu_topology_test.cc * \brief gpu topology tests -*/ + */ #if MXNET_USE_CUDA @@ -33,17 +33,17 @@ void GenerateMatrix(std::vector* W, int num_gpus, std::mt19937* gen) { std::uniform_real_distribution<> dis(0., 1.); for (int row = 0; row < num_gpus; ++row) { - for (int col = row+1; col < num_gpus; ++col) { + for (int col = row + 1; col < num_gpus; ++col) { double sample = dis(*gen); if (sample < 0.33) { - (*W)[row*num_gpus+col] = 1.; - (*W)[col*num_gpus+row] = 1.; + (*W)[row * num_gpus + col] = 1.; + (*W)[col * num_gpus + row] = 1.; } else if (sample < 0.66f) { - (*W)[row*num_gpus+col] = 2.; - (*W)[col*num_gpus+row] = 2.; + (*W)[row * num_gpus + col] = 2.; + (*W)[col * num_gpus + row] = 2.; } else { - (*W)[row*num_gpus+col] = 3.; - (*W)[col*num_gpus+row] = 3.; + (*W)[row * num_gpus + col] = 3.; + (*W)[col * num_gpus + row] = 3.; } } } @@ -53,7 +53,7 @@ bool IsSatisfactory(const std::vector& W, int num_gpus, int depth) { for (int row = 0; row < num_gpus; ++row) { int out_edges = 0; for (int col = 0; col < num_gpus; ++col) { - if (W[row*num_gpus+col] > 0.f) + if (W[row * num_gpus + col] > 0.f) out_edges++; } if (out_edges < depth) @@ -63,11 +63,10 @@ bool IsSatisfactory(const std::vector& W, int num_gpus, int depth) { } // Generates random link topology matrix using random number generator -void TestComputeTreesRandomized(int num_gpus, float alpha, int backtrack, - std::mt19937* 
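Aside: AssertEqual above compares element-wise with the usual mixed tolerance, |a - b| <= atol + rtol * |b|: an absolute floor for values near zero plus a term that scales with the reference magnitude. A compact sketch of the predicate:

#include <cassert>
#include <cmath>

// Mixed absolute/relative comparison, matching the bound AssertEqual checks.
bool close(float a, float b, float rtol = 1e-5f, float atol = 1e-8f) {
  return std::fabs(a - b) <= atol + rtol * std::fabs(b);
}

int main() {
  assert(close(1000.0f, 1000.005f));  // within the relative term
  assert(!close(0.0f, 0.001f));       // near zero only atol applies
}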
gen) { +void TestComputeTreesRandomized(int num_gpus, float alpha, int backtrack, std::mt19937* gen) { std::uniform_real_distribution<> dis(0.f, 1.f); bool satisfied = false; - std::vector W(num_gpus*num_gpus, 0.f); + std::vector W(num_gpus * num_gpus, 0.f); int depth = mxnet::kvstore::ComputeDepth(num_gpus); GenerateMatrix(&W, num_gpus, gen); satisfied = IsSatisfactory(W, num_gpus, depth); @@ -82,7 +81,7 @@ void TestComputeTreesRandomized(int num_gpus, float alpha, int backtrack, mxnet::kvstore::ComputeTrees(W, num_gpus, alpha, backtrack, &topo, &scan); unsigned correct_topo_size = (1 << (depth + 1)) - 1; - unsigned correct_scan_size = depth+2; + unsigned correct_scan_size = depth + 2; ASSERT_EQ(topo.size(), static_cast(num_gpus)); for (unsigned i = 0; i < topo.size(); ++i) { ASSERT_EQ(correct_topo_size, topo[i].size()); @@ -92,26 +91,24 @@ void TestComputeTreesRandomized(int num_gpus, float alpha, int backtrack, // Permutes matrix W using permutation vector P and stores output in matrix A // Assumption: W is square and symmetric -void PermuteMatrix(const std::vector& W, - const std::vector& P, - std::vector* A) { +void PermuteMatrix(const std::vector& W, const std::vector& P, std::vector* A) { int nrows = P.size(); - std::vector temp(nrows*nrows, 0); + std::vector temp(nrows * nrows, 0); int count = 0; - for (int row=0; row < nrows; ++row) { - for (int col=0; col < nrows; ++col) { + for (int row = 0; row < nrows; ++row) { + for (int col = 0; col < nrows; ++col) { int row_start = P[row]; - temp[count] = W[row_start*nrows+col]; + temp[count] = W[row_start * nrows + col]; count++; } } count = 0; - for (int row=0; row < nrows; ++row) { - for (int col=0; col < nrows; ++col) { + for (int row = 0; row < nrows; ++row) { + for (int col = 0; col < nrows; ++col) { int col_index = P[col]; - (*A)[count] = temp[row*nrows+col_index]; + (*A)[count] = temp[row * nrows + col_index]; count++; } } @@ -121,7 +118,7 @@ TEST(GpuTopology, TestFormTopology) { std::vector state0 = {3, 2, 1, 5, 0, 0, 4, 6}; std::vector topo0; std::vector scan0; - std::vector correct0 = {3, 3, 0, 3, 1, 0, 4, 3, 2, 1, 5, 0, 0, 4, 6}; + std::vector correct0 = {3, 3, 0, 3, 1, 0, 4, 3, 2, 1, 5, 0, 0, 4, 6}; std::vector correct_scan0 = {0, 1, 3, 7, 15}; mxnet::kvstore::FormTopology(state0, &topo0, &scan0, 3); ASSERT_EQ(topo0.size(), correct0.size()); @@ -134,7 +131,7 @@ TEST(GpuTopology, TestFormTopology) { std::vector state1 = {3, 2, 0, 4, 1, 1, 5, 6}; std::vector topo1; std::vector scan1; - std::vector correct1 = {3, 3, 1, 3, 0, 1, 5, 3, 2, 0, 4, 1, 1, 5, 6}; + std::vector correct1 = {3, 3, 1, 3, 0, 1, 5, 3, 2, 0, 4, 1, 1, 5, 6}; std::vector correct_scan1 = {0, 1, 3, 7, 15}; mxnet::kvstore::FormTopology(state1, &topo1, &scan1, 3); ASSERT_EQ(topo1.size(), correct1.size()); @@ -146,13 +143,8 @@ TEST(GpuTopology, TestFormTopology) { } TEST(GpuTopology, TestComputeTreeWeight) { - std::vector W = {0, 2, 2, 3, 3, 0, 0, - 2, 0, 3, 2, 0, 3, 0, - 2, 3, 0, 3, 0, 0, 2, - 3, 2, 3, 0, 0, 0, 0, - 3, 0, 0, 0, 0, 2, 2, - 0, 3, 0, 0, 2, 0, 3, - 0, 0, 2, 0, 2, 3, 0}; + std::vector W = {0, 2, 2, 3, 3, 0, 0, 2, 0, 3, 2, 0, 3, 0, 2, 3, 0, 3, 0, 0, 2, 3, 2, 3, 0, + 0, 0, 0, 3, 0, 0, 0, 0, 2, 2, 0, 3, 0, 0, 2, 0, 3, 0, 0, 2, 0, 2, 3, 0}; std::vector state0 = {3, 2, 1, 5, 0, 0, 4, 6}; ASSERT_EQ(mxnet::kvstore::ComputeTreeWeight(W, state0, 7, 3, false), 16); @@ -180,7 +172,7 @@ TEST(GpuTopology, TestPostprocess) { for (unsigned i = 0; i < correct2.size(); ++i) ASSERT_EQ(result2[i], correct2[i]); - std::vector result3 = {10, 10, 0, 0, 0, 0, 0, 1, 2, 3, 6, 4, 
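Aside: PermuteMatrix above relabels the vertices of a symmetric weight matrix so that A[i][j] == W[P[i]][P[j]]; its two passes (rows via a temporary, then columns) compute exactly that. A direct one-pass sketch:

#include <iostream>
#include <vector>

// Relabel vertices of a symmetric matrix: vertex i takes the role of P[i].
void permute(const std::vector<int>& W, const std::vector<int>& P,
             std::vector<int>* A) {
  const int n = static_cast<int>(P.size());
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < n; ++j)
      (*A)[i * n + j] = W[P[i] * n + P[j]];
}

int main() {
  const std::vector<int> W = {0, 1, 2,
                              1, 0, 3,
                              2, 3, 0};
  const std::vector<int> P = {2, 0, 1};
  std::vector<int> A(9, 0);
  permute(W, P, &A);
  for (int i = 0; i < 3; ++i) {
    for (int j = 0; j < 3; ++j) std::cout << A[i * 3 + j] << " ";
    std::cout << "\n";  // stays symmetric with a zero diagonal
  }
}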
7, 5, 8, 9}; + std::vector result3 = {10, 10, 0, 0, 0, 0, 0, 1, 2, 3, 6, 4, 7, 5, 8, 9}; std::vector correct3 = {10, 10, 10, 10, 0, 0, 0, 1, 2, 3, 6, 4, 7, 5, 8, 9}; mxnet::kvstore::Postprocess(&result3, 11, 4); for (unsigned i = 0; i < correct3.size(); ++i) @@ -198,13 +190,8 @@ TEST(GpuTopology, TestDepth) { } TEST(GpuTopology, TestIsValid) { - std::vector W = {0, 2, 2, 3, 3, 0, 0, - 2, 0, 3, 2, 0, 3, 0, - 2, 3, 0, 3, 0, 0, 2, - 3, 2, 3, 0, 0, 0, 0, - 3, 0, 0, 0, 0, 2, 2, - 0, 3, 0, 0, 2, 0, 3, - 0, 0, 2, 0, 2, 3, 0}; + std::vector W = {0, 2, 2, 3, 3, 0, 0, 2, 0, 3, 2, 0, 3, 0, 2, 3, 0, 3, 0, 0, 2, 3, 2, 3, 0, + 0, 0, 0, 3, 0, 0, 0, 0, 2, 2, 0, 3, 0, 0, 2, 0, 3, 0, 0, 2, 0, 2, 3, 0}; std::vector state0 = {3, 2, 1, 5, 0, 0, 4, 6}; ASSERT_EQ(mxnet::kvstore::IsValid(W, state0, 7, 7, 3), true); @@ -260,7 +247,7 @@ TEST(GpuTopology, TestEwisemult) { std::vector x(8, 1); std::vector y(8, 0); std::iota(y.begin(), y.end(), 0); - int alpha = 5; + int alpha = 5; std::vector correct_y = {0, 5, 10, 15, 20, 25, 30, 35}; mxnet::kvstore::ewisemult(x, alpha, &y); @@ -271,14 +258,9 @@ TEST(GpuTopology, TestEwisemult) { // FindBestMoveTest TEST(GpuTopology, TestFindBestMove) { - std::vector W = {0, 2, 2, 3, 3, 1, 1, 1, - 2, 0, 3, 2, 1, 3, 1, 1, - 2, 3, 0, 3, 1, 1, 2, 1, - 3, 2, 3, 0, 1, 1, 1, 2, - 3, 1, 1, 1, 0, 2, 2, 3, - 1, 3, 1, 1, 2, 0, 3, 2, - 1, 1, 2, 1, 2, 3, 0, 3, - 1, 1, 1, 2, 3, 2, 3, 0}; + std::vector W = {0, 2, 2, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1, + 2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 2, 3, 1, 3, 1, 1, + 2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0}; std::vector P(8, 0); std::iota(P.begin(), P.end(), 1); std::unordered_set used; @@ -311,32 +293,32 @@ TEST(GpuTopology, TestGetRoot) { // Test when roots are non-empty, and matches color std::unordered_set roots1 = {0, 2, 4, 6}; - std::vector color1 = {0, 1, 2, 3}; + std::vector color1 = {0, 1, 2, 3}; for (unsigned i = 0; i < color1.size(); ++i) { - int root1 = mxnet::kvstore::GetRoot(P, color1[i], roots1); - int correct_root1 = 2*i; + int root1 = mxnet::kvstore::GetRoot(P, color1[i], roots1); + int correct_root1 = 2 * i; ASSERT_EQ(root1, correct_root1); } // Test when roots is empty std::unordered_set roots2; - int color2 = 0; + int color2 = 0; int correct_root2 = -1; - int root2 = mxnet::kvstore::GetRoot(P, color2, roots2); + int root2 = mxnet::kvstore::GetRoot(P, color2, roots2); ASSERT_EQ(root2, correct_root2); // Test when roots is non-empty, but no root matches color std::unordered_set roots3 = {0}; - int color3 = 1; - int correct_root3 = -1; - int root3 = mxnet::kvstore::GetRoot(P, color3, roots3); + int color3 = 1; + int correct_root3 = -1; + int root3 = mxnet::kvstore::GetRoot(P, color3, roots3); ASSERT_EQ(root3, correct_root3); - std::vector P2 = {0, 1, 1, 0, 2, 3, 3, 2}; + std::vector P2 = {0, 1, 1, 0, 2, 3, 3, 2}; std::unordered_set roots4 = roots1; - int color4 = 0; - int correct_root4 = 0; - int root4 = mxnet::kvstore::GetRoot(P, color4, roots4); + int color4 = 0; + int correct_root4 = 0; + int root4 = mxnet::kvstore::GetRoot(P, color4, roots4); ASSERT_EQ(root4, correct_root4); } @@ -345,37 +327,32 @@ TEST(GpuTopology, TestGetChild) { std::vector P = {0, 0, 1, 2, 2, 2, 3, 3}; // Test when color is not found - int color1 = 4; - int parent1 = 4; + int color1 = 4; + int parent1 = 4; int correct_child1 = -1; - int child1 = mxnet::kvstore::GetChild(P, color1, parent1); + int child1 = mxnet::kvstore::GetChild(P, color1, parent1); ASSERT_EQ(child1, correct_child1); // Test when color is found, 
but is equal to parent - int color2 = 1; - int parent2 = 2; + int color2 = 1; + int parent2 = 2; int correct_child2 = -1; - int child2 = mxnet::kvstore::GetChild(P, color2, parent2); + int child2 = mxnet::kvstore::GetChild(P, color2, parent2); ASSERT_EQ(child2, correct_child2); // Test when color is found and not equal to parent - int color3 = 3; - int parent3 = 6; + int color3 = 3; + int parent3 = 6; int correct_child3 = 7; - int child3 = mxnet::kvstore::GetChild(P, color3, parent3); + int child3 = mxnet::kvstore::GetChild(P, color3, parent3); ASSERT_EQ(child3, correct_child3); } // FindBestEdgeTest TEST(GpuTopology, TestFindBestEdge) { - std::vector W = {0, 2, 2, 3, 3, 1, 1, 1, - 2, 0, 3, 2, 1, 3, 1, 1, - 2, 3, 0, 3, 1, 1, 2, 1, - 3, 2, 3, 0, 1, 1, 1, 2, - 3, 1, 1, 1, 0, 2, 2, 3, - 1, 3, 1, 1, 2, 0, 3, 2, - 1, 1, 2, 1, 2, 3, 0, 3, - 1, 1, 1, 2, 3, 2, 3, 0}; + std::vector W = {0, 2, 2, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1, + 2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 2, 3, 1, 3, 1, 1, + 2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0}; std::vector P(8, 0); std::unordered_set used; @@ -384,7 +361,7 @@ TEST(GpuTopology, TestFindBestEdge) { std::vector b1; int g1; std::vector correct_b1 = {0, 2}; - int correct_g1 = 3; + int correct_g1 = 3; mxnet::kvstore::FindBestEdge(W, P, parent1, dest1, &b1, &g1); ASSERT_EQ(b1.size(), correct_b1.size()); for (unsigned i = 0; i < b1.size(); ++i) @@ -397,7 +374,7 @@ TEST(GpuTopology, TestFindBestEdge) { std::vector b2; int g2; std::vector correct_b2 = {-1}; - int correct_g2 = 0; + int correct_g2 = 0; mxnet::kvstore::FindBestEdge(W, P, parent2, dest2, &b2, &g2); ASSERT_EQ(b2.size(), correct_b2.size()); for (unsigned i = 0; i < b2.size(); ++i) @@ -407,14 +384,9 @@ TEST(GpuTopology, TestFindBestEdge) { // KLGenerateBinaryTreeTest TEST(GpuTopology, TestKLGenerateBinaryTree1) { - std::vector W = {0, 2, 3, 3, 3, 1, 1, 1, - 2, 0, 3, 2, 1, 3, 1, 1, - 2, 3, 0, 3, 1, 1, 2, 1, - 3, 2, 3, 0, 1, 1, 1, 2, - 3, 1, 1, 1, 0, 2, 3, 3, - 1, 3, 1, 1, 2, 0, 3, 2, - 1, 1, 2, 1, 2, 3, 0, 3, - 1, 1, 1, 2, 3, 2, 3, 0}; + std::vector W = {0, 2, 3, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1, + 2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 3, 3, 1, 3, 1, 1, + 2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0}; std::vector P = {0, 1, 1, 0, 2, 3, 3, 2}; std::vector> cluster_pairs; cluster_pairs.push_back(std::pair(0, -2)); @@ -422,11 +394,10 @@ TEST(GpuTopology, TestKLGenerateBinaryTree1) { cluster_pairs.push_back(std::pair(2, -2)); cluster_pairs.push_back(std::pair(3, -2)); std::unordered_set roots = {0, 2, 4, 6}; - std::vector topo = {0, 2, 4, 6}; + std::vector topo = {0, 2, 4, 6}; std::vector scan(2, 0); std::mt19937 gen(1); - mxnet::kvstore::KLGenerateBinaryTree(W, P, &cluster_pairs, &roots, &topo, - &scan, &gen); + mxnet::kvstore::KLGenerateBinaryTree(W, P, &cluster_pairs, &roots, &topo, &scan, &gen); std::vector correct_topo = {0, 2, 4, 6, 0, 3, 2, 1, 4, 7, 6, 5}; std::vector correct_scan = {0, 0, 4}; ASSERT_EQ(topo.size(), correct_topo.size()); @@ -438,14 +409,9 @@ TEST(GpuTopology, TestKLGenerateBinaryTree1) { } TEST(GpuTopology, TestKLGenerateBinaryTree2) { - std::vector W = {0, 2, 3, 3, 3, 1, 1, 1, - 2, 0, 3, 2, 1, 3, 1, 1, - 2, 3, 0, 3, 1, 1, 2, 1, - 3, 2, 3, 0, 1, 1, 1, 2, - 3, 1, 1, 1, 0, 2, 3, 3, - 1, 3, 1, 1, 2, 0, 3, 2, - 1, 1, 2, 1, 2, 3, 0, 3, - 1, 1, 1, 2, 3, 2, 3, 0}; + std::vector W = {0, 2, 3, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1, + 2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 3, 3, 1, 
3, 1, 1, + 2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0}; std::vector P = {0, 1, 1, 0, 2, 3, 3, 2}; std::vector> cluster_pairs; cluster_pairs.push_back(std::pair(0, -2)); @@ -453,11 +419,10 @@ TEST(GpuTopology, TestKLGenerateBinaryTree2) { cluster_pairs.push_back(std::pair(2, -2)); cluster_pairs.push_back(std::pair(3, -2)); std::unordered_set roots = {0, 2, 4, 6}; - std::vector topo = {0, 6, 4, 2}; + std::vector topo = {0, 6, 4, 2}; std::vector scan(2, 0); std::mt19937 gen(1); - mxnet::kvstore::KLGenerateBinaryTree(W, P, &cluster_pairs, &roots, &topo, - &scan, &gen); + mxnet::kvstore::KLGenerateBinaryTree(W, P, &cluster_pairs, &roots, &topo, &scan, &gen); std::vector correct_topo = {0, 6, 4, 2, 0, 3, 6, 5, 4, 7, 2, 1}; std::vector correct_scan = {0, 0, 4}; ASSERT_EQ(topo.size(), correct_topo.size()); @@ -470,13 +435,11 @@ TEST(GpuTopology, TestKLGenerateBinaryTree2) { // UpdateWeightTest TEST(GpuTopology, TestUpdateWeight) { - std::vector W = {0.f, 1.f, - 1.f, 0.f}; - std::vector topo = {1, 1, 0}; - int num_gpus = 2; - float alpha = 0.7; - std::vector correct_W = {0.f, 0.7f, - 0.7f, 0.f}; + std::vector W = {0.f, 1.f, 1.f, 0.f}; + std::vector topo = {1, 1, 0}; + int num_gpus = 2; + float alpha = 0.7; + std::vector correct_W = {0.f, 0.7f, 0.7f, 0.f}; mxnet::kvstore::UpdateWeight(&W, topo, num_gpus, alpha); ASSERT_EQ(W.size(), correct_W.size()); for (unsigned i = 0; i < W.size(); ++i) { @@ -486,25 +449,19 @@ TEST(GpuTopology, TestUpdateWeight) { // ComputeTreesFromRoot TEST(GpuTopology, TestComputeTreesFromRoot1) { - std::vector W = {0, 2, 2, 3, 3, 1, 1, 1, - 2, 0, 3, 2, 1, 3, 1, 1, - 2, 3, 0, 3, 1, 1, 2, 1, - 3, 2, 3, 0, 1, 1, 1, 2, - 3, 1, 1, 1, 0, 2, 2, 3, - 1, 3, 1, 1, 2, 0, 3, 2, - 1, 1, 2, 1, 2, 3, 0, 3, - 1, 1, 1, 2, 3, 2, 3, 0}; - int num_gpus = 8; - int root = 0; - float alpha = 0.7; - bool backtrack = true; + std::vector W = {0, 2, 2, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1, + 2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 2, 3, 1, 3, 1, 1, + 2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0}; + int num_gpus = 8; + int root = 0; + float alpha = 0.7; + bool backtrack = true; unsigned correct_topo_size = 15; unsigned correct_scan_size = 5; std::vector topo; std::vector scan; - mxnet::kvstore::ComputeTreesFromRoot(&W, num_gpus, root, alpha, backtrack, - &topo, &scan); + mxnet::kvstore::ComputeTreesFromRoot(&W, num_gpus, root, alpha, backtrack, &topo, &scan); ASSERT_EQ(topo.size(), correct_topo_size); ASSERT_EQ(scan.size(), correct_scan_size); @@ -513,11 +470,8 @@ TEST(GpuTopology, TestComputeTreesFromRoot1) { // IsConnected // Test on graph that is "disconnected" by NVLink TEST(GpuTopology, TestIsConnected1) { - std::vector W = {0, 0, 2, 0, - 0, 0, 0, 2, - 2, 0, 0, 0, - 0, 2, 0, 0}; - int num_gpus = 4; + std::vector W = {0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 2, 0, 0}; + int num_gpus = 4; bool connected = mxnet::kvstore::IsConnected(W, num_gpus); @@ -528,11 +482,8 @@ TEST(GpuTopology, TestIsConnected1) { // IsConnected // Test on graph that is "disconnected" by NVLink TEST(GpuTopology, TestIsConnected2) { - std::vector W = {1, 1, 2, 1, - 1, 1, 1, 2, - 2, 1, 1, 1, - 1, 2, 1, 1}; - int num_gpus = 4; + std::vector W = {1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1}; + int num_gpus = 4; bool connected = mxnet::kvstore::IsConnected(W, num_gpus); @@ -543,11 +494,8 @@ TEST(GpuTopology, TestIsConnected2) { // IsConnected // Test on graph that is "disconnected" by NVLink TEST(GpuTopology, TestIsConnected3) { - std::vector W = {1, 1, 2, 2, - 1, 1, 1, 2, - 
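Aside: the IsConnected tests above assert reachability over the link matrix. We have not traced mxnet::kvstore::IsConnected itself, so the edge criterion below (any entry above zero) is our assumption; the property being checked is plain graph connectivity, for which a BFS sketch suffices:

#include <iostream>
#include <queue>
#include <vector>

// Treating W[u][v] > 0 as an edge, every GPU must be reachable from GPU 0.
bool is_connected(const std::vector<float>& W, int n) {
  std::vector<bool> seen(n, false);
  std::queue<int> q;
  q.push(0);
  seen[0] = true;
  int visited = 0;
  while (!q.empty()) {
    const int u = q.front();
    q.pop();
    ++visited;
    for (int v = 0; v < n; ++v) {
      if (W[u * n + v] > 0.f && !seen[v]) {
        seen[v] = true;
        q.push(v);
      }
    }
  }
  return visited == n;
}

int main() {
  // Two disjoint linked pairs, as in the first 4-GPU matrix above.
  const std::vector<float> W = {0, 0, 2, 0,
                                0, 0, 0, 2,
                                2, 0, 0, 0,
                                0, 2, 0, 0};
  std::cout << (is_connected(W, 4) ? "connected" : "disconnected") << "\n";
}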
2, 1, 1, 1, - 2, 2, 1, 1}; - int num_gpus = 4; + std::vector W = {1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1}; + int num_gpus = 4; bool connected = mxnet::kvstore::IsConnected(W, num_gpus); @@ -558,7 +506,7 @@ TEST(GpuTopology, TestIsConnected3) { // ComputeTreesTest with backtracking TEST(GpuTopology, TestComputeTrees1) { std::mt19937 gen(1); - float alpha = 0.7; + float alpha = 0.7; bool backtrack = true; for (int num_gpus = 2; num_gpus <= 8; ++num_gpus) { LOG(INFO) << "Testing " << num_gpus << " x " << num_gpus; @@ -571,7 +519,7 @@ TEST(GpuTopology, TestComputeTrees1) { // ComputeTreesTest with Kernighan-Lin TEST(GpuTopology, TestComputeTrees2) { std::mt19937 gen(1); - float alpha = 0.7; + float alpha = 0.7; bool backtrack = false; // Do 5 randomized tests per GPU count from 2 to 16 for (int num_gpus = 2; num_gpus <= 16; ++num_gpus) { @@ -583,35 +531,25 @@ TEST(GpuTopology, TestComputeTrees2) { } TEST(GpuTopology, TestPermuteMatrix) { - std::vector W = {0, 2, 2, 3, 3, 1, 1, 1, - 2, 0, 3, 2, 1, 3, 1, 1, - 2, 3, 0, 3, 1, 1, 2, 1, - 3, 2, 3, 0, 1, 1, 1, 2, - 3, 1, 1, 1, 0, 2, 2, 3, - 1, 3, 1, 1, 2, 0, 3, 2, - 1, 1, 2, 1, 2, 3, 0, 3, - 1, 1, 1, 2, 3, 2, 3, 0}; + std::vector W = {0, 2, 2, 3, 3, 1, 1, 1, 2, 0, 3, 2, 1, 3, 1, 1, 2, 3, 0, 3, 1, 1, + 2, 1, 3, 2, 3, 0, 1, 1, 1, 2, 3, 1, 1, 1, 0, 2, 2, 3, 1, 3, 1, 1, + 2, 0, 3, 2, 1, 1, 2, 1, 2, 3, 0, 3, 1, 1, 1, 2, 3, 2, 3, 0}; std::vector P1 = {0, 1, 2, 3, 4, 5, 6, 7}; - std::vector A(8*8, 0); + std::vector A(8 * 8, 0); PermuteMatrix(W, P1, &A); - for (unsigned i=0; i < W.size(); ++i) + for (unsigned i = 0; i < W.size(); ++i) ASSERT_EQ(A[i], W[i]); } TEST(GpuTopology, TestKernighanLin1) { - std::vector W = {0, 1, 2, 3, 2, 4, - 1, 0, 1, 4, 2, 1, - 2, 1, 0, 3, 2, 1, - 3, 4, 3, 0, 4, 3, - 2, 2, 2, 4, 0, 2, - 4, 1, 1, 3, 2, 0}; + std::vector W = {0, 1, 2, 3, 2, 4, 1, 0, 1, 4, 2, 1, 2, 1, 0, 3, 2, 1, + 3, 4, 3, 0, 4, 3, 2, 2, 2, 4, 0, 2, 4, 1, 1, 3, 2, 0}; std::vector P(6, 0); std::vector> cluster_pairs; int num_partitions = 1; std::mt19937 gen(1); - bool stop = mxnet::kvstore::KernighanLin(W, &P, &num_partitions, - &cluster_pairs, &gen); + bool stop = mxnet::kvstore::KernighanLin(W, &P, &num_partitions, &cluster_pairs, &gen); std::vector> correct_pairs; correct_pairs.push_back(std::pair(0, 1)); @@ -630,26 +568,19 @@ TEST(GpuTopology, TestKernighanLin1) { error++; } EXPECT_TRUE(error == 0 || error == P.size()) - << "Where real value: " << error - << " not equal neither: " << 0 - << " nor: " << P.size() << "."; + << "Where real value: " << error << " not equal neither: " << 0 << " nor: " << P.size() + << "."; } TEST(GpuTopology, TestKernighanLin2) { - std::vector W = {0, 1, 0, 0, 1, 1, 0, 0, - 1, 0, 0, 0, 1, 1, 0, 0, - 0, 0, 0, 1, 0, 1, 1, 1, - 0, 0, 1, 0, 0, 0, 1, 1, - 1, 1, 0, 0, 0, 1, 0, 0, - 1, 1, 1, 0, 1, 0, 0, 0, - 0, 0, 1, 1, 0, 0, 0, 1, - 0, 0, 1, 1, 0, 0, 1, 0}; + std::vector W = {0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, + 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, + 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0}; std::vector P(8, 0); std::vector> cluster_pairs; int num_partitions = 1; std::mt19937 gen(1); - bool stop = mxnet::kvstore::KernighanLin(W, &P, &num_partitions, - &cluster_pairs, &gen); + bool stop = mxnet::kvstore::KernighanLin(W, &P, &num_partitions, &cluster_pairs, &gen); std::vector> correct_pairs; correct_pairs.push_back(std::pair(0, 1)); @@ -668,9 +599,8 @@ TEST(GpuTopology, TestKernighanLin2) { error++; } EXPECT_TRUE(error == 0 || error == P.size()) - << "Where 
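Aside: the KernighanLin tests above verify that the returned labeling splits the vertices into two equal halves. The quantity a Kernighan-Lin pass tries to minimize is the weight of edges crossing the partition; a standalone sketch of that objective, with made-up weights:

#include <iostream>
#include <vector>

// Total weight of edges whose endpoints carry different partition ids.
float cut_weight(const std::vector<float>& W, const std::vector<int>& P) {
  const int n = static_cast<int>(P.size());
  float cut = 0.f;
  for (int i = 0; i < n; ++i)
    for (int j = i + 1; j < n; ++j)
      if (P[i] != P[j])
        cut += W[i * n + j];
  return cut;
}

int main() {
  const std::vector<float> W = {0, 5, 1, 1,
                                5, 0, 1, 1,
                                1, 1, 0, 5,
                                1, 1, 5, 0};
  std::cout << cut_weight(W, {0, 0, 1, 1}) << "\n";  // 4: heavy edges kept inside
  std::cout << cut_weight(W, {0, 1, 0, 1}) << "\n";  // 12: both heavy edges cut
}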
real value: " << error - << " not equal neither: " << 0 - << " nor: " << P.size() << "."; + << "Where real value: " << error << " not equal neither: " << 0 << " nor: " << P.size() + << "."; } #endif // MXNET_USE_CUDA diff --git a/tests/cpp/misc/base.cc b/tests/cpp/misc/base.cc index b560f02a2a96..430ff693737f 100644 --- a/tests/cpp/misc/base.cc +++ b/tests/cpp/misc/base.cc @@ -27,20 +27,20 @@ using namespace std; * Test that different Context have different hash values */ TEST(ContextHashTest, ContextHashUnique) { - set hashes; - size_t collision_count = 0; - size_t total = 0; - for (size_t dev_type = 0; dev_type < 32; ++dev_type) { - for (size_t dev_id = 0; dev_id < 64; ++dev_id) { - auto ctx = Context::Create(static_cast(dev_type), dev_id); - size_t res = std::hash()(ctx); - auto insert_res = hashes.insert(res); - if (!insert_res.second) - ++collision_count; - ++total; - } + set hashes; + size_t collision_count = 0; + size_t total = 0; + for (size_t dev_type = 0; dev_type < 32; ++dev_type) { + for (size_t dev_id = 0; dev_id < 64; ++dev_id) { + auto ctx = Context::Create(static_cast(dev_type), dev_id); + size_t res = std::hash()(ctx); + auto insert_res = hashes.insert(res); + if (!insert_res.second) + ++collision_count; + ++total; } - double collision = collision_count / static_cast(total); - cout << "mxnet::Context std::hash collision ratio: " << collision << endl; - EXPECT_LE(collision, 0.04); + } + double collision = collision_count / static_cast(total); + cout << "mxnet::Context std::hash collision ratio: " << collision << endl; + EXPECT_LE(collision, 0.04); } diff --git a/tests/cpp/operator/activation_perf.cc b/tests/cpp/operator/activation_perf.cc index 0dfefe55f132..ad5f26f101f0 100644 --- a/tests/cpp/operator/activation_perf.cc +++ b/tests/cpp/operator/activation_perf.cc @@ -32,7 +32,7 @@ using namespace mxnet; typedef std::vector > kwargs_t; -const kwargs_t basic_activation_args = { }; +const kwargs_t basic_activation_args = {}; /*! 
* \brief Generic bidirectional sanity test @@ -41,25 +41,24 @@ TEST(ACTIVATION_PERF, ExecuteBidirectional) { using namespace std; mxnet::TShape shape({5, 5}); vector activations = { - "relu", - "sigmoid", - "log_sigmoid", - "mish", - "tanh", - "softrelu", - "softsign" - }; + "relu", "sigmoid", "log_sigmoid", "mish", "tanh", "softrelu", "softsign"}; for (const string& activation : activations) { kwargs_t activation_args = {{"act_type", activation}}; test::op::CoreOperatorRunner runner; - runner.RunBidirectional(false, { shape }, test::op::CoreOpExecutor::ArgsWithOpName( - activation_args, "Activation", "_backward_Activation"), 1); + runner.RunBidirectional(false, + {shape}, + test::op::CoreOpExecutor::ArgsWithOpName( + activation_args, "Activation", "_backward_Activation"), + 1); } for (const string& activation : activations) { kwargs_t activation_args = {{"act_type", activation}}; test::op::CoreOperatorRunner runner; - runner.RunBidirectional(true, { shape }, test::op::CoreOpExecutor::ArgsWithOpName( - activation_args, "Activation", "_backward_Activation"), 1); + runner.RunBidirectional(true, + {shape}, + test::op::CoreOpExecutor::ArgsWithOpName( + activation_args, "Activation", "_backward_Activation"), + 1); } } @@ -70,29 +69,23 @@ TEST(ACTIVATION_PERF, TimingCPU) { kwargs_t kwargs = basic_activation_args; // Which math function is arbitrary since it will have roughly constant timing among approaches kwargs.push_back({"act_type", "tanh"}); - kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Activation", - "_backward_Activation"); + kwargs = + test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Activation", "_backward_Activation"); mxnet::TShape shape({10, 10, 10, 10}); test::op::CoreOperatorRunner runner; - runner.RunBidirectional(false, { shape }, kwargs, 1); + runner.RunBidirectional(false, {shape}, kwargs, 1); - std::vector shapes; + std::vector shapes; if (test::performance_run) { - shapes = { - {1, 1, 28, 28}, - {1, 3, 28, 28}, - {50, 1, 18, 32}, - {50, 3, 18, 32}, - {20, 3, 128, 128} - }; + shapes = {{1, 1, 28, 28}, {1, 3, 28, 28}, {50, 1, 18, 32}, {50, 3, 18, 32}, {20, 3, 128, 128}}; } else { shapes = { - {1, 1, 28, 28}, - {50, 3, 18, 32}, + {1, 1, 28, 28}, + {50, 3, 18, 32}, }; } - for (const mxnet::TShape &shape : shapes) { - runner.TimingTest("Activation Operator CPU", false, false, kwargs, 2, 10, { shape }); + for (const mxnet::TShape& shape : shapes) { + runner.TimingTest("Activation Operator CPU", false, false, kwargs, 2, 10, {shape}); } } @@ -104,21 +97,15 @@ TEST(ACTIVATION_PERF, TimingGPU) { kwargs_t kwargs = basic_activation_args; // Which math function is arbitrary since it will have roughly constant timing among approaches kwargs.push_back({"act_type", "tanh"}); - kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Activation", - "_backward_Activation"); + kwargs = + test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Activation", "_backward_Activation"); mxnet::TShape shape({10, 10, 10, 10}); test::op::CoreOperatorRunner runner; - runner.RunBidirectional(true, { shape }, kwargs, 1); - std::vector shapes = { - {1, 1, 28, 28}, - {1, 3, 28, 28}, - {50, 1, 18, 32}, - {50, 3, 18, 32}, - {20, 3, 128, 128} - }; - for (const mxnet::TShape &shape : shapes) { - runner.TimingTest("Activation Operator GPU", true, false, kwargs, 2, 10, { shape }); + runner.RunBidirectional(true, {shape}, kwargs, 1); + std::vector shapes = { + {1, 1, 28, 28}, {1, 3, 28, 28}, {50, 1, 18, 32}, {50, 3, 18, 32}, {20, 3, 128, 128}}; + for (const mxnet::TShape& shape : shapes) { + 
runner.TimingTest("Activation Operator GPU", true, false, kwargs, 2, 10, {shape}); } } #endif // MXNET_USE_CUDA == 1 - diff --git a/tests/cpp/operator/batchnorm_test.cc b/tests/cpp/operator/batchnorm_test.cc index e66b0b7696c6..39d039c0b55c 100644 --- a/tests/cpp/operator/batchnorm_test.cc +++ b/tests/cpp/operator/batchnorm_test.cc @@ -22,7 +22,7 @@ * \file batchnorm_test.cc * \brief batchnorm operator unit tests and utility functions * \author Chris Olivier -*/ + */ #include #include @@ -40,23 +40,23 @@ using namespace mxnet; #if !SIMPLE_DIMENSIONS static constexpr int BATCH_SIZE = 5; -static constexpr int CHANNELS = 3; -static constexpr int DEPTH = 2; -static constexpr int DH = 2; -static constexpr int DW = 3; +static constexpr int CHANNELS = 3; +static constexpr int DEPTH = 2; +static constexpr int DH = 2; +static constexpr int DW = 3; #else static constexpr int BATCH_SIZE = 1; -static constexpr int CHANNELS = 1; -static constexpr int DEPTH = 1; -static constexpr int DH = 3; -static constexpr int DW = 2; +static constexpr int CHANNELS = 1; +static constexpr int DEPTH = 1; +static constexpr int DH = 3; +static constexpr int DW = 2; #endif static constexpr int TIMING_BATCH_SIZE = 128; -static constexpr int TIMING_CHANNELS = 3; -static constexpr int TIMING_DEPTH = 2; -static constexpr int TIMING_DH = 28; -static constexpr int TIMING_DW = 28; +static constexpr int TIMING_CHANNELS = 3; +static constexpr int TIMING_DEPTH = 2; +static constexpr int TIMING_DH = 28; +static constexpr int TIMING_DW = 28; #define PRT(__lbl$, __var$) \ test::print(ctx.run_ctx, &(std::cout << (__lbl$) << ": "), (__var$), true) @@ -65,25 +65,35 @@ static constexpr int TIMING_DW = 28; * \brief Forward */ enum ForwardInputs { - /* in_data */ kForInData, kForGamma, kForBeta, - /* aux_states */ kForMovingMean, kForMovingVar + /* in_data */ kForInData, + kForGamma, + kForBeta, + /* aux_states */ kForMovingMean, + kForMovingVar }; enum ForwardOutputs { - /* outputs */ kForOutData , kForOutMean, kForOutVar + /* outputs */ kForOutData, + kForOutMean, + kForOutVar }; /*! 
* \brief Backward */ enum BackwardInputs { - /* out_grad */ bwd_out_grad_Grad, - /* out_data */ bwd_out_data_Mean, bwd_out_data_Var, - /* in_data */ bwd_in_data_Data, bwd_in_data_Gamma, bwd_in_data_Beta, - /* aux_states */ bwd_aux_states_MovingMean, bwd_aux_states_MovingVar + /* out_grad */ bwd_out_grad_Grad, + /* out_data */ bwd_out_data_Mean, + bwd_out_data_Var, + /* in_data */ bwd_in_data_Data, + bwd_in_data_Gamma, + bwd_in_data_Beta, + /* aux_states */ bwd_aux_states_MovingMean, + bwd_aux_states_MovingVar }; enum BackwardOutputs { - /* in_grad */ bwd_in_grad_Data /* Original input data */, - /* weight, bias*/ bwd_in_grad_Gamma, bwd_in_grad_Beta + /* in_grad */ bwd_in_grad_Data /* Original input data */, + /* weight, bias*/ bwd_in_grad_Gamma, + bwd_in_grad_Beta }; /** @@ -104,59 +114,62 @@ class BNOperatorExecutor : public test::op::CoreOpExecutor { public: using Super::ctx; - BNOperatorExecutor(const bool isGPU, const mxnet::TShape& inputShape, + BNOperatorExecutor(const bool isGPU, + const mxnet::TShape& inputShape, const test::op::kwargs_t& kwargs, const bool hasWeightAndBias = false) - : test::op::CoreOpExecutor(isGPU, { inputShape }) - , hasWeightAndBias_(hasWeightAndBias) { + : test::op::CoreOpExecutor(isGPU, {inputShape}), + hasWeightAndBias_(hasWeightAndBias) { param_.Init(kwargs); } - const NDArray *GetForwardInArray(const ForwardInputs idx) const { - const std::vector &arrs = Super::inputs(); + const NDArray* GetForwardInArray(const ForwardInputs idx) const { + const std::vector& arrs = Super::inputs(); CHECK_LT(idx, arrs.size()); return &arrs[idx]; } - const NDArray *GetForwardOutArray(const ForwardOutputs idx) const { - const std::vector &arrs = Super::outputs(); + const NDArray* GetForwardOutArray(const ForwardOutputs idx) const { + const std::vector& arrs = Super::outputs(); CHECK_LT(idx, arrs.size()); return &arrs[idx]; } - const NDArray *GetBackwardInArray(const BackwardInputs idx) { - const std::vector &arrs = Super::bwd_inputs(); + const NDArray* GetBackwardInArray(const BackwardInputs idx) { + const std::vector& arrs = Super::bwd_inputs(); CHECK_LT(idx, arrs.size()); return &arrs[idx]; } - const NDArray *GetBackwardOutArray(const BackwardOutputs idx) const { - const std::vector &arrs = Super::bwd_outputs(); + const NDArray* GetBackwardOutArray(const BackwardOutputs idx) const { + const std::vector& arrs = Super::bwd_outputs(); CHECK_LT(idx, arrs.size()); return &arrs[idx]; } - NDArray *GetArray(const ForwardInputs idx) { - return const_cast(GetForwardInArray(idx)); + NDArray* GetArray(const ForwardInputs idx) { + return const_cast(GetForwardInArray(idx)); } - NDArray *GetArray(const ForwardOutputs idx) { - return const_cast(GetForwardOutArray(idx)); + NDArray* GetArray(const ForwardOutputs idx) { + return const_cast(GetForwardOutArray(idx)); } - NDArray *GetArray(const BackwardOutputs idx) { - return const_cast(GetBackwardOutArray(idx)); + NDArray* GetArray(const BackwardOutputs idx) { + return const_cast(GetBackwardOutArray(idx)); } - NDArray *GetArray(const BackwardInputs idx) { - return const_cast(GetBackwardInArray(idx)); + NDArray* GetArray(const BackwardInputs idx) { + return const_cast(GetBackwardInArray(idx)); } - inline const TBlob& Blob(const NDArray *arr) { return arr->data(); } + inline const TBlob& Blob(const NDArray* arr) { + return arr->data(); + } - template + template const TBlob& GetBlob(const EnumType idx) const { - return const_cast *>(this)->GetArray(idx)->data(); + return const_cast*>(this)->GetArray(idx)->data(); } void resetForward() override { 
@@ -179,30 +192,26 @@ class BNOperatorExecutor : public test::op::CoreOpExecutor { double val = 0; test::patternFill(ctx().run_ctx, &GetBlob(kForInData), [&val]() -> double { return val += 1; }); - MSHADOW_TYPE_SWITCH( - GetBlob(kForGamma).type_flag_, - DTypeX, { - const TBlob& blob = GetBlob(kForGamma); - test::fill(ctx().run_ctx, blob, DTypeX(1)); - if (hasWeightAndBias_) { - if (blob.size(0) > 1) { - blob.dptr()[1] = DTypeX(3); - } + MSHADOW_TYPE_SWITCH(GetBlob(kForGamma).type_flag_, DTypeX, { + const TBlob& blob = GetBlob(kForGamma); + test::fill(ctx().run_ctx, blob, DTypeX(1)); + if (hasWeightAndBias_) { + if (blob.size(0) > 1) { + blob.dptr()[1] = DTypeX(3); } - }); - MSHADOW_TYPE_SWITCH( - GetBlob(kForBeta).type_flag_, - DTypeX, { - const TBlob& blob = GetBlob(kForBeta); - if (!hasWeightAndBias_) { - test::fill(ctx().run_ctx, blob, DTypeX(0)); - } else { // This will cause forward pass check to fail when calculating sum == 0 - test::fill(ctx().run_ctx, blob, DTypeX(1)); - if (blob.size(0) > 0) { - blob.dptr()[0] = DTypeX(3); - } + } + }); + MSHADOW_TYPE_SWITCH(GetBlob(kForBeta).type_flag_, DTypeX, { + const TBlob& blob = GetBlob(kForBeta); + if (!hasWeightAndBias_) { + test::fill(ctx().run_ctx, blob, DTypeX(0)); + } else { // This will cause forward pass check to fail when calculating sum == 0 + test::fill(ctx().run_ctx, blob, DTypeX(1)); + if (blob.size(0) > 0) { + blob.dptr()[0] = DTypeX(3); } - }); + } + }); // Init the moving data (all mean = 0, all var = 1) test::try_fill(ctx().run_ctx, &GetBlob(kForMovingMean), 0); @@ -216,34 +225,29 @@ class BNOperatorExecutor : public test::op::CoreOpExecutor { // Join forward input and in_data array double val = 0; - test::patternFill(ctx().run_ctx, &GetBlob(bwd_in_data_Data), [&val]() -> double { - return val += 1; + test::patternFill( + ctx().run_ctx, &GetBlob(bwd_in_data_Data), [&val]() -> double { return val += 1; }); + + MSHADOW_TYPE_SWITCH(GetBlob(bwd_in_data_Gamma).type_flag_, DTypeX, { + const TBlob& blob = GetBlob(bwd_in_data_Gamma); + test::fill(ctx().run_ctx, blob, DTypeX(1)); + if (hasWeightAndBias_) { + if (blob.size(0) > 1) { + blob.dptr()[1] = DTypeX(3); + } + } }); - - MSHADOW_TYPE_SWITCH( - GetBlob(bwd_in_data_Gamma).type_flag_, - DTypeX, { - const TBlob& blob = GetBlob(bwd_in_data_Gamma); + MSHADOW_TYPE_SWITCH(GetBlob(bwd_in_data_Beta).type_flag_, DTypeX, { + const TBlob& blob = GetBlob(bwd_in_data_Beta); + if (!hasWeightAndBias_) { + test::fill(ctx().run_ctx, blob, DTypeX(0)); + } else { // This will cause forward pass check to fail when calculating sum == 0 test::fill(ctx().run_ctx, blob, DTypeX(1)); - if (hasWeightAndBias_) { - if (blob.size(0) > 1) { - blob.dptr()[1] = DTypeX(3); - } + if (blob.size(0) > 0) { + blob.dptr()[0] = DTypeX(3); } - }); - MSHADOW_TYPE_SWITCH( - GetBlob(bwd_in_data_Beta).type_flag_, - DTypeX, { - const TBlob& blob = GetBlob(bwd_in_data_Beta); - if (!hasWeightAndBias_) { - test::fill(ctx().run_ctx, blob, DTypeX(0)); - } else { // This will cause forward pass check to fail when calculating sum == 0 - test::fill(ctx().run_ctx, blob, DTypeX(1)); - if (blob.size(0) > 0) { - blob.dptr()[0] = DTypeX(3); - } - } - }); + } + }); // Join aux arrays test::try_fill(ctx().run_ctx, &GetBlob(bwd_aux_states_MovingMean), 0); @@ -253,8 +257,8 @@ class BNOperatorExecutor : public test::op::CoreOpExecutor { test::try_fill(ctx().run_ctx, &GetBlob(bwd_out_data_Var), 1.0); val = -.001; - test::patternFill(ctx().run_ctx, &GetBlob(bwd_out_grad_Grad), [&val]() -> double { - return val += 0.01; }); + test::patternFill( 
+ ctx().run_ctx, &GetBlob(bwd_out_grad_Grad), [&val]() -> double { return val += 0.01; }); } const bool hasWeightAndBias_; // This will cause forward pass validation to fail @@ -272,7 +276,7 @@ class BNOperatorExecutor : public test::op::CoreOpExecutor { * */ /*! \brief Validate batch norm test outputs */ -template +template class BatchNormValidator : public test::op::Validator { typedef test::op::Validator Super; @@ -280,13 +284,13 @@ class BatchNormValidator : public test::op::Validator { BatchNormValidator() = delete; // NOLINT /*! \brief Check batch norm output - 1D */ - static void checkBatchNorm1D(const TBlob *blob) { + static void checkBatchNorm1D(const TBlob* blob) { const size_t dim = static_cast(blob->ndim()); CHECK_EQ(dim, 3U); - const size_t num = blob->shape_[0]; // batch size + const size_t num = blob->shape_[0]; // batch size const size_t channels = blob->shape_[1]; - const size_t length = blob->shape_[2]; + const size_t length = blob->shape_[2]; size_t itemCount = 0; @@ -313,15 +317,13 @@ class BatchNormValidator : public test::op::Validator { // expect zero mean EXPECT_NEAR(0, sum, kErrorBound); if (!Super::isNear(AccReal(0), sum, kErrorBound)) { - LOG(WARNING) << "Sum is not close enough to zero: " - << saveSum << " (" << sum << "), " + LOG(WARNING) << "Sum is not close enough to zero: " << saveSum << " (" << sum << "), " << saveVar << " (" << var << ")"; } // expect unit variance EXPECT_NEAR(1, var, kErrorBound); if (!Super::isNear(AccReal(1), var, kErrorBound)) { - LOG(WARNING) << "Variance is not close enough to 1: " - << saveSum << " (" << sum << "), " + LOG(WARNING) << "Variance is not close enough to 1: " << saveSum << " (" << sum << "), " << saveVar << " (" << var << ")"; } } @@ -329,14 +331,14 @@ class BatchNormValidator : public test::op::Validator { } /*! \brief Check batch norm output - 2D */ - static void checkBatchNorm2D(const TBlob *blob) { + static void checkBatchNorm2D(const TBlob* blob) { const size_t dim = static_cast(blob->ndim()); CHECK_EQ(dim, 4U); - const size_t num = blob->shape_[0]; // batch size + const size_t num = blob->shape_[0]; // batch size const size_t channels = blob->shape_[1]; - const size_t height = blob->shape_[2]; - const size_t width = blob->shape_[3]; + const size_t height = blob->shape_[2]; + const size_t width = blob->shape_[3]; size_t itemCount = 0, nonZero = 0; @@ -370,16 +372,14 @@ class BatchNormValidator : public test::op::Validator { // expect zero mean EXPECT_NEAR(0, sum, kErrorBound); if (!Super::isNear(AccReal(0), sum, kErrorBound)) { - LOG(WARNING) << "Sum is not close enough to zero: " - << saveSum << " (" << sum << "), " + LOG(WARNING) << "Sum is not close enough to zero: " << saveSum << " (" << sum << "), " << saveVar << " (" << var << ")"; test::print(RunContext(), &(std::cerr << "Mean problem:" << std::endl), *blob); } // expect unit variance EXPECT_NEAR(1, var, kErrorBound); if (!Super::isNear(AccReal(1), var, kErrorBound)) { - LOG(WARNING) << "Variance is not close enough to 1: " - << saveSum << " (" << sum << "), " + LOG(WARNING) << "Variance is not close enough to 1: " << saveSum << " (" << sum << "), " << saveVar << " (" << var << ")"; test::print(RunContext(), &(std::cerr << "Variance problem:" << std::endl), *blob); } @@ -388,14 +388,14 @@ class BatchNormValidator : public test::op::Validator { } /*! 
\brief Check batch norm output - 3D */ - static void checkBatchNorm3D(const TBlob *blob) { + static void checkBatchNorm3D(const TBlob* blob) { const size_t dim = static_cast(blob->ndim()); CHECK_EQ(dim, 5U); - const size_t num = blob->shape_[0]; // batch size + const size_t num = blob->shape_[0]; // batch size const size_t channels = blob->shape_[1]; - const size_t depth = blob->shape_[2]; - const size_t height = blob->shape_[3]; - const size_t width = blob->shape_[4]; + const size_t depth = blob->shape_[2]; + const size_t height = blob->shape_[3]; + const size_t width = blob->shape_[4]; size_t itemCount = 0; @@ -406,8 +406,8 @@ class BatchNormValidator : public test::op::Validator { for (size_t k = 0; k < height; ++k) { for (size_t l = 0; l < width; ++l) { const AccReal data = test::data_at(blob, {i, j, d, k, l}); - sum = sum + data; - var = var + (data * data); + sum = sum + data; + var = var + (data * data); ++itemCount; } } @@ -425,15 +425,13 @@ class BatchNormValidator : public test::op::Validator { // expect zero mean EXPECT_NEAR(0, sum, kErrorBound); if (!Super::isNear(AccReal(0), sum, kErrorBound)) { - LOG(WARNING) << "Sum is not close enough to zero " - << saveSum << " (" << sum << "), " + LOG(WARNING) << "Sum is not close enough to zero " << saveSum << " (" << sum << "), " << saveVar << " (" << var << ")"; } // expect unit variance EXPECT_NEAR(1, var, kErrorBound); if (!Super::isNear(AccReal(1), var, kErrorBound)) { - LOG(WARNING) << "Variance is not close enough to 1 " - << saveSum << " (" << sum << "), " + LOG(WARNING) << "Variance is not close enough to 1 " << saveSum << " (" << sum << "), " << saveVar << " (" << var << ")"; } } @@ -447,7 +445,7 @@ class BatchNormValidator : public test::op::Validator { const EnumType idx, bool print = false) { test::CAccessAsCPU cpu1(i1.ctx().run_ctx, i1.GetBlob(idx), false), - cpu2(i2.ctx().run_ctx, i2.GetBlob(idx), false); + cpu2(i2.ctx().run_ctx, i2.GetBlob(idx), false); const TBlob& b1 = cpu1(); const TBlob& b2 = cpu2(); if (print && test::debug_output) { @@ -463,9 +461,9 @@ class BatchNormValidator : public test::op::Validator { } /*! \brief Check batch norm output */ - template + template static void validateForward(const RunContext& run_ctx, const BNOperatorProp& data) { - const TBlob &outputBlob = data.GetBlob(ForwardOutputs::kForOutData); + const TBlob& outputBlob = data.GetBlob(ForwardOutputs::kForOutData); if (test::debug_output) { test::print(run_ctx, &(std::cout << "Fwd Output Blob:"), outputBlob, true, true); } @@ -487,20 +485,20 @@ class BatchNormValidator : public test::op::Validator { }); } -#define TEST_ISTRUE(__args$) \ - do { \ - bool _rc; \ +#define TEST_ISTRUE(__args$) \ + do { \ + bool _rc; \ EXPECT_TRUE((_rc = (__args$))); \ - if (!_rc) { \ - rc = false; \ - } \ + if (!_rc) { \ + rc = false; \ + } \ } while (0) /*! 
\brief Compare entire operator data between two test sets */ - template + template static bool compare( - const test::op::OpInfo>& info_1, - const test::op::OpInfo>& info_2) { + const test::op::OpInfo>& info_1, + const test::op::OpInfo>& info_2) { bool rc = true; // Input TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, ForwardInputs::kForInData)); @@ -517,13 +515,10 @@ class BatchNormValidator : public test::op::Validator { #endif if (!info_2.prop_->getParam().use_global_stats) { - TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, - BackwardInputs::bwd_out_data_Mean)); - TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, - BackwardInputs::bwd_out_data_Var)); + TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, BackwardInputs::bwd_out_data_Mean)); + TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, BackwardInputs::bwd_out_data_Var)); // InGrad - TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, - BackwardOutputs::bwd_in_grad_Data)); + TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, BackwardOutputs::bwd_in_grad_Data)); #if 0 TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, BackwardOutputs::bwd_in_grad_Gamma)); @@ -531,8 +526,7 @@ class BatchNormValidator : public test::op::Validator { BackwardOutputs::bwd_in_grad_Beta)); #endif // OutGrad - TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, - BackwardInputs::bwd_out_grad_Grad)); + TEST_ISTRUE(compare(*info_1.executor_, *info_2.executor_, BackwardInputs::bwd_out_grad_Grad)); } return rc; } @@ -549,24 +543,22 @@ class BatchNormValidator : public test::op::Validator { * */ static const test::op::kwargs_t blank_kwargs; -static const test::op::kwargs_t blank_kwargs_nocudnn = { - {"cudnn_off", "True"} }; -static const test::op::kwargs_t nonfixgamma_kwargs = { - {"fix_gamma", "False"} }; -static const test::op::kwargs_t nonfixgamma_kwargs_nocudnn = { - {"fix_gamma", "False"}, {"cudnn_off", "True"} }; -static const test::op::kwargs_t useglobalstats_kwargs = { - {"use_global_stats", "True"} }; -static const test::op::kwargs_t useglobalstats_kwargs_nocudnn = { - {"use_global_stats", "True"}, {"cudnn_off", "True"} }; -static const test::op::kwargs_t nfs_ugs_kwargs = { - {"fix_gamma", "False"}, {"use_global_stats", "True"}}; -static const test::op::kwargs_t nfs_ugs_kwargs_nocudnn = { - {"fix_gamma", "False"}, {"use_global_stats", "True"}, {"cudnn_off", "True"} }; +static const test::op::kwargs_t blank_kwargs_nocudnn = {{"cudnn_off", "True"}}; +static const test::op::kwargs_t nonfixgamma_kwargs = {{"fix_gamma", "False"}}; +static const test::op::kwargs_t nonfixgamma_kwargs_nocudnn = {{"fix_gamma", "False"}, + {"cudnn_off", "True"}}; +static const test::op::kwargs_t useglobalstats_kwargs = {{"use_global_stats", "True"}}; +static const test::op::kwargs_t useglobalstats_kwargs_nocudnn = {{"use_global_stats", "True"}, + {"cudnn_off", "True"}}; +static const test::op::kwargs_t nfs_ugs_kwargs = {{"fix_gamma", "False"}, + {"use_global_stats", "True"}}; +static const test::op::kwargs_t nfs_ugs_kwargs_nocudnn = {{"fix_gamma", "False"}, + {"use_global_stats", "True"}, + {"cudnn_off", "True"}}; #if !DISABLE_VALIDATION static bool isUGS(const test::op::kwargs_t& kwargs) { - for (const auto & kwarg : kwargs) { + for (const auto& kwarg : kwargs) { if (!kwarg.first.compare("use_global_stats")) { return kwarg.second.compare("True") == 0; } @@ -585,9 +577,12 @@ static bool isUGS(const test::op::kwargs_t& kwargs) { * __/ | | | * |___/ |_| */ -template -static StreamType& _DBPRT(const RunContext& 
run_ctx, const char *label, - StreamType *os, const OperatorExecutor& obj, const BlobType type) { +template +static StreamType& _DBPRT(const RunContext& run_ctx, + const char* label, + StreamType* os, + const OperatorExecutor& obj, + const BlobType type) { *os << label << ": "; test::print(RunContext(), os, test::CAccessAsCPU(run_ctx, obj.GetBlob(type), false)()); return *os; @@ -595,10 +590,10 @@ static StreamType& _DBPRT(const RunContext& run_ctx, const char *label, #define DBPRT(__os, __obj, __type$) _DBPRT(run_ctx, #__type$, __os, __obj, __type$) -template -static StreamType& dumpF(StreamType *os, +template +static StreamType& dumpF(StreamType* os, const test::op::OpInfo& prop, - const size_t x = 0, + const size_t x = 0, const bool force = test::debug_output) { if (force) { *os << std::endl; @@ -622,10 +617,10 @@ static StreamType& dumpF(StreamType *os, return *os; } -template -static StreamType& dumpB(StreamType *os, +template +static StreamType& dumpB(StreamType* os, const test::op::OpInfo& prop, - const size_t x = 0, + const size_t x = 0, const bool force = test::debug_output) { if (force) { *os << std::endl; @@ -659,24 +654,26 @@ static StreamType& dumpB(StreamType *os, * */ /*! \brief Test batch norm operator forward pass */ -template +template static test::op::OpInfo TestBatchNormOperatorForward( - bool isGPU, - const mxnet::TShape& inputShape, - const std::vector >& kwargs, - const size_t count = 1) { + bool isGPU, + const mxnet::TShape& inputShape, + const std::vector>& kwargs, + const size_t count = 1) { #if MXNET_USE_CUDA if (isGPU && !test::unitTestsWithCuda) { LOG(INFO) << "GPU not found, running test as non-GPU"; } #else - isGPU = false; + isGPU = false; #endif - test::op::OpInfo info = test::op::createOpAndInfoF< - OperatorProp, OperatorExecutor>( - OperatorExecutor::ArgsWithOpName(kwargs, "BatchNorm", "_backward_BatchNorm"), - isGPU, inputShape, kwargs); + test::op::OpInfo info = + test::op::createOpAndInfoF( + OperatorExecutor::ArgsWithOpName(kwargs, "BatchNorm", "_backward_BatchNorm"), + isGPU, + inputShape, + kwargs); info.executor_->initForward(*info.prop_, &info.in_type_); @@ -685,8 +682,10 @@ static test::op::OpInfo TestBatchNormOperatorFor #if !DISABLE_VALIDATION if (!isUGS(kwargs)) { BatchNormValidator::validateForward( - info.executor_->ctx().run_ctx, *info.executor_); + typename OperatorExecutor::AccRealType>::validateForward(info.executor_ + ->ctx() + .run_ctx, + *info.executor_); } #endif @@ -694,10 +693,10 @@ static test::op::OpInfo TestBatchNormOperatorFor } /*! 
\brief Test batch norm operator backward pass */ -template +template static test::op::OpInfo runOperatorBackward( - test::op::OpInfo *info, - const size_t count = 1) { + test::op::OpInfo* info, + const size_t count = 1) { info->executor_->initBackward(*info->prop_, &info->in_type_); info->executor_->backward(count); @@ -706,25 +705,25 @@ static test::op::OpInfo runOperatorBackward( static constexpr size_t CYCLE_COUNT = 3; -template +template static test::op::OpInfoPair testForwardAndBackward( const bool isGPU1, const bool isGPU2, - const mxnet::TShape &inputShape, + const mxnet::TShape& inputShape, const test::op::kwargs_t& kwargs, - const size_t count = 1, + const size_t count = 1, const size_t cycleCount = CYCLE_COUNT) { test::op::OpInfo info_1 = - TestBatchNormOperatorForward(isGPU1, inputShape, - kwargs, count); + TestBatchNormOperatorForward( + isGPU1, inputShape, kwargs, count); test::op::OpInfo info_2 = - TestBatchNormOperatorForward(isGPU2, inputShape, - kwargs, count); + TestBatchNormOperatorForward( + isGPU2, inputShape, kwargs, count); size_t thisCount = 0; - using DType = typename OperatorExecutor::DataType; + using DType = typename OperatorExecutor::DataType; using AccReal = typename OperatorExecutor::AccRealType; do { @@ -743,9 +742,8 @@ static test::op::OpInfoPair test // Check that everything is the same after the forward pass const bool b1 = BatchNormValidator::compare(info_1, info_2); - const bool b2 = BatchNormValidator::compare(*info_1.executor_, - *info_2.executor_, - kForInData, false); + const bool b2 = BatchNormValidator::compare( + *info_1.executor_, *info_2.executor_, kForInData, false); if (!b1 || !b2) { dumpF(&std::cout, info_1, 1, true); dumpF(&std::cout, info_2, 2, true); @@ -774,23 +772,17 @@ static test::op::OpInfoPair test } } while (++thisCount < cycleCount); - return { info_1, info_2 }; + return {info_1, info_2}; } -template -static test::op::OpInfoPair -testForwardAndBackward(const bool isGPU, - const mxnet::TShape &inputShape, - const test::op::kwargs_t kwargs, - const size_t count = 1, - const size_t cycleCount = CYCLE_COUNT -) { +template +static test::op::OpInfoPair testForwardAndBackward( + const bool isGPU, + const mxnet::TShape& inputShape, + const test::op::kwargs_t kwargs, + const size_t count = 1, + const size_t cycleCount = CYCLE_COUNT) { return testForwardAndBackward( - isGPU, - isGPU, - inputShape, - kwargs, - count, - cycleCount); + isGPU, isGPU, inputShape, kwargs, count, cycleCount); } /** @@ -811,28 +803,30 @@ struct BatchNormCoreOpProp : public mxnet::test::op::CoreOpProp { params_.Init(kwargs, dmlc::parameter::kAllowUnknown); } - const mxnet::op::BatchNormParam& getParam() const { return params_; } + const mxnet::op::BatchNormParam& getParam() const { + return params_; + } mxnet::op::BatchNormParam params_; }; -template +template static test::op::OpInfoPair testBNForwardAndBackward2D(const bool isGPU, - const mxnet::TShape &inputShape, + const mxnet::TShape& inputShape, const test::op::kwargs_t& kwargs) { CHECK_EQ(inputShape.ndim(), 4); // V1 can only handle 2D return testForwardAndBackward( - isGPU, isGPU, inputShape, kwargs); + isGPU, isGPU, inputShape, kwargs); } -template +template static test::op::OpInfoPair testBNForwardAndBackward(const bool isGPU, - const mxnet::TShape &inputShape, + const mxnet::TShape& inputShape, const test::op::kwargs_t& kwargs) { return testForwardAndBackward( - isGPU, isGPU, inputShape, kwargs); + isGPU, isGPU, inputShape, kwargs); } /** @@ -846,11 +840,9 @@ testBNForwardAndBackward(const bool isGPU, * 
|___/ */ TEST(BATCH_NORM, TestSanityForwaredAndBackward) { - MSHADOW_REAL_TYPE_SWITCH_EX( - mshadow::kFloat32, - DType, AccReal, { + MSHADOW_REAL_TYPE_SWITCH_EX(mshadow::kFloat32, DType, AccReal, { testBNForwardAndBackward2D>( - false, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs); + false, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs); }); } @@ -864,46 +856,44 @@ TEST(BATCH_NORM, TestSanityForwaredAndBackward) { * * */ -static const std::vector v2_types = { - mshadow::kFloat32, - mshadow::kFloat64, - mshadow::kFloat16 -}; +static const std::vector v2_types = {mshadow::kFloat32, + mshadow::kFloat64, + mshadow::kFloat16}; TEST(BATCH_NORM, Test1DForward) { - for (const mshadow::TypeFlag type : v2_types) { + for (const mshadow::TypeFlag type : v2_types) { MSHADOW_REAL_TYPE_SWITCH_EX(type, DType, AccReal, { testBNForwardAndBackward>( - false, {BATCH_SIZE, CHANNELS, DW}, blank_kwargs); + false, {BATCH_SIZE, CHANNELS, DW}, blank_kwargs); }); } } TEST(BATCH_NORM, Test2DForward) { - for (int type : v2_types) { + for (int type : v2_types) { MSHADOW_REAL_TYPE_SWITCH_EX(type, DType, AccReal, { testBNForwardAndBackward>( - false, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs); + false, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs); }); } } TEST(BATCH_NORM, Test3DForward) { - for (const mshadow::TypeFlag type : v2_types) { + for (const mshadow::TypeFlag type : v2_types) { MSHADOW_REAL_TYPE_SWITCH_EX(type, DType, AccReal, { testBNForwardAndBackward>( - false, {BATCH_SIZE, CHANNELS, DEPTH, DH, DW}, blank_kwargs); + false, {BATCH_SIZE, CHANNELS, DEPTH, DH, DW}, blank_kwargs); }); } } -template +template static void timingTest(const std::string& label, const bool isGPU, const bool stochastic, const test::op::kwargs_t& kwargs, const int dim = 0, - size_t count = 1) { + size_t count = 1) { std::cout << std::endl << std::flush; #ifdef NDEBUG @@ -930,10 +920,10 @@ static void timingTest(const std::string& label, do { batchSize = stochastic ? test::rangedRand(1U, BATCH_SIZE * 2U) : TIMING_BATCH_SIZE; - channels = stochastic ? test::rangedRand(1U, CHANNELS * 2U) : TIMING_CHANNELS; - depth = stochastic ? test::rangedRand(1U, DEPTH * 2U) : TIMING_DEPTH; - height = stochastic ? test::rangedRand(1U, DH * 2U) : TIMING_DH; - width = stochastic ? test::rangedRand(1U, DW * 2U) : TIMING_DW; + channels = stochastic ? test::rangedRand(1U, CHANNELS * 2U) : TIMING_CHANNELS; + depth = stochastic ? test::rangedRand(1U, DEPTH * 2U) : TIMING_DEPTH; + height = stochastic ? test::rangedRand(1U, DH * 2U) : TIMING_DH; + width = stochastic ? test::rangedRand(1U, DW * 2U) : TIMING_DW; } while (stochastic && (height * width) == 1U); const size_t D = dim ? 
dim - 1U : test::rangedRand(0U, 2U); @@ -942,21 +932,15 @@ static void timingTest(const std::string& label, switch (D) { case 0: info = TestBatchNormOperatorForward( - isGPU, - {batchSize, channels, width}, - kwargs, count); + isGPU, {batchSize, channels, width}, kwargs, count); break; case 1: info = TestBatchNormOperatorForward( - isGPU, - {batchSize, channels, height, width}, - kwargs, count); + isGPU, {batchSize, channels, height, width}, kwargs, count); break; case 2: info = TestBatchNormOperatorForward( - isGPU, - {batchSize, channels, depth, height, width}, - kwargs, count); + isGPU, {batchSize, channels, depth, height, width}, kwargs, count); break; default: CHECK(false) << "rangedRand() returned unexpected value"; @@ -965,36 +949,38 @@ static void timingTest(const std::string& label, runOperatorBackward(&info, count); timing += info.executor_->GetTiming(); } - } while (false); + } + while (false) + ; timing.print(&std::cout, label); std::cout << std::endl << std::flush; } #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 -#define GPU_TEST_DIMENSIONS 2 /* Only support 2D */ +#define GPU_TEST_DIMENSIONS 2 /* Only support 2D */ #else -#define GPU_TEST_DIMENSIONS 0 /* Allow stochastic */ -#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 +#define GPU_TEST_DIMENSIONS 0 /* Allow stochastic */ +#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 /*! \brief Stress-test random batch size/channels/dimension(s) */ TEST(BATCH_NORM, DISABLED_TestStochasticTiming_2D) { // Test is disabled due to suspected flakiness // https://github.com/apache/incubator-mxnet/issues/14411 - MSHADOW_REAL_TYPE_SWITCH_EX( - mshadow::kFloat32, DType, AccReal, - { - timingTest>( - "RANDOM: BatchNormCoreOpProp", false, true, - blank_kwargs_nocudnn, GPU_TEST_DIMENSIONS); }); + MSHADOW_REAL_TYPE_SWITCH_EX(mshadow::kFloat32, DType, AccReal, { + timingTest>( + "RANDOM: BatchNormCoreOpProp", false, true, blank_kwargs_nocudnn, GPU_TEST_DIMENSIONS); + }); #if MXNET_USE_CUDA if (test::unitTestsWithCuda) { - MSHADOW_REAL_TYPE_SWITCH_EX( - mshadow::kFloat32, DType, AccReal, - { - timingTest>( - "RANDOM: BatchNormCoreOpProp", true, true, - blank_kwargs_nocudnn, GPU_TEST_DIMENSIONS); }); + MSHADOW_REAL_TYPE_SWITCH_EX(mshadow::kFloat32, DType, AccReal, { + timingTest>( + "RANDOM: BatchNormCoreOpProp", + true, + true, + blank_kwargs_nocudnn, + GPU_TEST_DIMENSIONS); + }); } #endif } @@ -1010,50 +996,37 @@ TEST(BATCH_NORM, TestTiming_2D) { if (mxnet::test::quick_test) { THISCOUNT = 1; } -MSHADOW_REAL_TYPE_SWITCH_EX( - mshadow::kFloat32, DType, AccReal, { + MSHADOW_REAL_TYPE_SWITCH_EX(mshadow::kFloat32, DType, AccReal, { #if MXNET_USE_ONEDNN == 1 - // MKL - timingTest>( - "MKL BatchNormProp 2D", - false, false, - blank_kwargs_nocudnn, - 2, THISCOUNT); + // MKL + timingTest>( + "MKL BatchNormProp 2D", false, false, blank_kwargs_nocudnn, 2, THISCOUNT); #endif // MXNET_USE_ONEDNN == 1 - // CPU - test::ScopeSet disableMKL(&mxnet::op::batchnorm::disable_mkl, true); - timingTest>( - "BatchNormProp 2D", - false, false, - blank_kwargs_nocudnn, - 2, THISCOUNT); -#if MXNET_USE_CUDA - if (test::unitTestsWithCuda) { - // CUDA + // CPU + test::ScopeSet disableMKL(&mxnet::op::batchnorm::disable_mkl, true); timingTest>( - "BatchNormProp 2D", - true, false, - blank_kwargs_nocudnn, - 2, THISCOUNT); + "BatchNormProp 2D", false, false, blank_kwargs_nocudnn, 2, THISCOUNT); +#if MXNET_USE_CUDA + if (test::unitTestsWithCuda) { + // CUDA + timingTest>( + "BatchNormProp 2D", true, false, blank_kwargs_nocudnn, 2, THISCOUNT); #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR 
>= 5 - // CUDA-CUDNN - timingTest>( - "CUDNN BatchNormProp 2D", - true, false, - blank_kwargs, - 2, THISCOUNT); + // CUDA-CUDNN + timingTest>( + "CUDNN BatchNormProp 2D", true, false, blank_kwargs, 2, THISCOUNT); #endif - } + } #endif -}); + }); } #endif // _WIN32 -inline std::ostream& operator << (std::ostream& os, const test::op::kwargs_t& kwargs) { +inline std::ostream& operator<<(std::ostream& os, const test::op::kwargs_t& kwargs) { if (!kwargs.empty()) { os << "["; size_t count = 0; - for (const auto &item : kwargs) { + for (const auto& item : kwargs) { if (count++) { os << ", "; } @@ -1119,20 +1092,18 @@ TEST(BATCH_NORM, TestIterAll) { #ifndef _WIN32 TEST(BATCH_NORM, TestBackward3D) { - MSHADOW_REAL_TYPE_SWITCH_EX( - mshadow::kFloat32, DType, AccReal, - { - const mxnet::TShape inputShape({2, 3, 2, 3, 5}); - test::op::OpInfo> info = + MSHADOW_REAL_TYPE_SWITCH_EX(mshadow::kFloat32, DType, AccReal, { + const mxnet::TShape inputShape({2, 3, 2, 3, 5}); + test::op::OpInfo> info = TestBatchNormOperatorForward>( - false, inputShape, blank_kwargs); - info.executor_->initBackward(*info.prop_, &info.in_type_); - runOperatorBackward(&info); - }); + false, inputShape, blank_kwargs); + info.executor_->initBackward(*info.prop_, &info.in_type_); + runOperatorBackward(&info); + }); } #endif // _WIN32 -template +template class ChannelAxisTestData { protected: enum Mode { LOAD, SAVE }; @@ -1140,7 +1111,7 @@ class ChannelAxisTestData { void loadOrSave(const RunContext& run_ctx, const TBlob& blob, int channel_axis, const Mode mode) { test::CAccessAsCPU cpu_blob(run_ctx, blob, true); mxnet::op::batchnorm::BNTensor3 tensor3(cpu_blob(), channel_axis); - const mxnet::TShape &shape = blob.shape_; + const mxnet::TShape& shape = blob.shape_; CHECK_GT(shape.ndim(), 0); if (channel_axis < 0) { channel_axis = shape.ndim() + channel_axis; @@ -1149,8 +1120,8 @@ class ChannelAxisTestData { const size_t channel_count = shape[channel_axis]; std::vector indexes(channel_count, 0); for (size_t outer = 0, outerCount = tensor3.OuterSize(); outer < outerCount; ++outer) { - for (size_t channel = 0, channelCount = tensor3.ChannelCount(); - channel < channelCount; ++channel) { + for (size_t channel = 0, channelCount = tensor3.ChannelCount(); channel < channelCount; + ++channel) { CHECK_LT(channel, channel_data_.size()); for (size_t inner = 0, innerCount = tensor3.InnerSize(); inner < innerCount; ++inner) { CHECK_LT(indexes[channel], channel_data_[channel].size()); @@ -1165,7 +1136,7 @@ class ChannelAxisTestData { } public: - std::vector> channel_data_; + std::vector> channel_data_; static void print(const std::string& label, const std::vector>& m) { if (test::debug_output) { @@ -1173,15 +1144,14 @@ class ChannelAxisTestData { std::cout << label << ": "; } for (size_t i = 0, n = m.size(); i < n; ++i) { - const std::vector &vec = m[i]; + const std::vector& vec = m[i]; for (size_t j = 0, jn = vec.size(); j < jn; ++j) { if (j) { std::cout << ", "; } const DType val = vec[j]; std::cout << std::fixed << std::setw(7) - << std::setprecision(mxnet::test::MPRINT_PRECISION) - << std::right << val; + << std::setprecision(mxnet::test::MPRINT_PRECISION) << std::right << val; } std::cout << std::endl; } @@ -1217,16 +1187,16 @@ class ChannelAxisTestData { } }; -template +template static void compare(const RunContext& run_ctx, const TBlob& blob, const std::vector& vals) { CHECK_EQ(blob.Size(), vals.size()); test::CAccessAsCPU cpu_blob(run_ctx, blob, false); - const DType *v = cpu_blob().dptr(); + const DType* v = cpu_blob().dptr(); for (size_t 
i = 0, n = vals.size(); i < n; ++i) { const DType vBlob = v[i]; const DType vVect = vals[i]; - const bool near = BatchNormValidator::isNear( - vBlob, vVect, BatchNormValidator::ErrorBound(&cpu_blob())); + const bool near = BatchNormValidator::isNear( + vBlob, vVect, BatchNormValidator::ErrorBound(&cpu_blob())); ASSERT_TRUE(near); if (!near) { LOG(WARNING) << vBlob << " is not near enough to " << vVect << std::endl; @@ -1235,19 +1205,19 @@ static void compare(const RunContext& run_ctx, const TBlob& blob, const std::vec } #ifndef _WIN32 -template +template static void compare(const std::vector>& d1, const std::vector>& d2) { CHECK_EQ(d1.size(), d2.size()); for (size_t x = 0, xn = d1.size(); x < xn; ++x) { - const std::vector &vec1 = d1[x]; - const std::vector &vec2 = d2[x]; + const std::vector& vec1 = d1[x]; + const std::vector& vec2 = d2[x]; CHECK_EQ(vec1.size(), vec2.size()); for (size_t i = 0, n = vec1.size(); i < n; ++i) { - const DType v1 = vec1[i]; - const DType v2 = vec2[i]; + const DType v1 = vec1[i]; + const DType v2 = vec2[i]; const bool near = BatchNormValidator::isNear( - v1, v2, BatchNormValidator::ERROR_BOUND()); + v1, v2, BatchNormValidator::ERROR_BOUND()); if (!near) { LOG(WARNING) << v1 << " is not near enough to " << v2 << std::endl; ASSERT_TRUE(near); @@ -1256,7 +1226,7 @@ static void compare(const std::vector>& d1, } } -template +template static void testSaveAndLoad(const std::vector& dims, const int channelAxis, const std::vector>& inputChannelData, @@ -1271,10 +1241,10 @@ static void testSaveAndLoad(const std::vector& dims, RunContext cpu_run_ctx; cpu_run_ctx.ctx.dev_type = Context::kCPU; - cpu_run_ctx.ctx.dev_id = 0; - cpu_run_ctx.stream = nullptr; - std::unique_ptr blob(new test::StandaloneBlob( - shape, false, mshadow::DataType::kFlag)); + cpu_run_ctx.ctx.dev_id = 0; + cpu_run_ctx.stream = nullptr; + std::unique_ptr blob( + new test::StandaloneBlob(shape, false, mshadow::DataType::kFlag)); data.save(cpu_run_ctx, *blob, channelAxis); ChannelAxisTestData::print(cpu_run_ctx, "saved to blob", *blob); @@ -1287,34 +1257,35 @@ static void testSaveAndLoad(const std::vector& dims, TEST(BATCH_NORM, TestChannelAxisSaveAndLoad) { std::cout << std::endl << std::flush; - using DType = float; + using DType = float; using AccReal = float; - const std::vector> myData = - { { 1.0f, 1.0f, 1.0f, 1.0f }, - { 2.0f, 2.0f, 2.0f, 2.0f }, - { 3.0f, 3.0f, 3.0f, 3.0f } }; - - testSaveAndLoad({ 1, 3, 2, 2 }, 1, myData, - { 1.0f, 1.0f, 1.0f, 1.0f, - 2.0f, 2.0f, 2.0f, 2.0f, - 3.0f, 3.0f, 3.0f, 3.0f}); - - testSaveAndLoad({ 1, 2, 2, 3 }, 3, myData, - { 1.0f, 2.0f, 3.0f, - 1.0f, 2.0f, 3.0f, - 1.0f, 2.0f, 3.0f, - 1.0f, 2.0f, 3.0f}); - - testSaveAndLoad({ 1, 2, 3, 2 }, 2, myData, - { 1.0f, 1.0f, 2.0f, 2.0f, 3.0f, 3.0f, - 1.0f, 1.0f, 2.0f, 2.0f, 3.0f, 3.0f}); + const std::vector> myData = { + {1.0f, 1.0f, 1.0f, 1.0f}, {2.0f, 2.0f, 2.0f, 2.0f}, {3.0f, 3.0f, 3.0f, 3.0f}}; + + testSaveAndLoad( + {1, 3, 2, 2}, + 1, + myData, + {1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 3.0f, 3.0f, 3.0f, 3.0f}); + + testSaveAndLoad( + {1, 2, 2, 3}, + 3, + myData, + {1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f}); + + testSaveAndLoad( + {1, 2, 3, 2}, + 2, + myData, + {1.0f, 1.0f, 2.0f, 2.0f, 3.0f, 3.0f, 1.0f, 1.0f, 2.0f, 2.0f, 3.0f, 3.0f}); } /*! 
\brief Insert the channel field `channelCount` into the shape at `channelAxis` position */ static mxnet::TShape MakeShape(const std::vector& shape, - signed int channelAxis, - const size_t channelCount) { + signed int channelAxis, + const size_t channelCount) { if (channelAxis < 0) { channelAxis += shape.size() + 1; } @@ -1333,19 +1304,18 @@ static mxnet::TShape MakeShape(const std::vector& shape, /*! \brief Create and arrange equivalent data with different channel axes, then compare * normalized results */ -static void runChannelAxisTest( - const bool isGPU1, - const bool isGPU2, - const test::op::kwargs_t& base_kwargs, - const std::vector shape, - const signed int channelAxis1, - const signed int channelAxis2, - const size_t channelCount, - const bool simpleData, - const size_t numberOfPasses = 5 +static void runChannelAxisTest(const bool isGPU1, + const bool isGPU2, + const test::op::kwargs_t& base_kwargs, + const std::vector shape, + const signed int channelAxis1, + const signed int channelAxis2, + const size_t channelCount, + const bool simpleData, + const size_t numberOfPasses = 5 ) { - using DType = float; + using DType = float; using AccReal = float; size_t spatialSize = 1; @@ -1392,17 +1362,23 @@ static void runChannelAxisTest( // Create operator 1 with ChannelAxis2 (normally the experimental one) kwargs.push_back({"axis", std::to_string(channelAxis1)}); test::op::OpInfo> info_c1 = - test::op::createOpAndInfoF>( - BNOperatorExecutor::ArgsWithOpName( - kwargs, "BatchNorm", "_backward_BatchNorm"), isGPU1, shape_c1, kwargs); + test::op::createOpAndInfoF>( + BNOperatorExecutor::ArgsWithOpName( + kwargs, "BatchNorm", "_backward_BatchNorm"), + isGPU1, + shape_c1, + kwargs); kwargs.pop_back(); // Create operator 2 with ChannelAxis2 (normally the control one) kwargs.push_back({"axis", std::to_string(channelAxis2)}); test::op::OpInfo> info_c2 = - test::op::createOpAndInfoF>( - BNOperatorExecutor::ArgsWithOpName( - kwargs, "BatchNorm", "_backward_BatchNorm"), isGPU2, shape_c2, kwargs); + test::op::createOpAndInfoF>( + BNOperatorExecutor::ArgsWithOpName( + kwargs, "BatchNorm", "_backward_BatchNorm"), + isGPU2, + shape_c2, + kwargs); kwargs.pop_back(); // Init operators @@ -1413,28 +1389,32 @@ static void runChannelAxisTest( // Save input data to blob with new shape 1 data_c1.save(info_c1.executor_->ctx().run_ctx, - info_c1.executor_->GetBlob(ForwardInputs::kForInData), channelAxis1); + info_c1.executor_->GetBlob(ForwardInputs::kForInData), + channelAxis1); ChannelAxisTestData::print(info_c1.executor_->ctx().run_ctx, "blob 1 input", info_c1.executor_->GetBlob(ForwardInputs::kForInData)); // Save input data to blob with new shape 2 data_c2.save(info_c2.executor_->ctx().run_ctx, - info_c2.executor_->GetBlob(ForwardInputs::kForInData), channelAxis2); + info_c2.executor_->GetBlob(ForwardInputs::kForInData), + channelAxis2); ChannelAxisTestData::print(info_c2.executor_->ctx().run_ctx, "blob 2 input", info_c2.executor_->GetBlob(ForwardInputs::kForInData)); // Save output grad to blob with new shape 1 grad_c1.save(info_c1.executor_->ctx().run_ctx, - info_c1.executor_->GetBlob(BackwardInputs::bwd_out_grad_Grad), channelAxis1); + info_c1.executor_->GetBlob(BackwardInputs::bwd_out_grad_Grad), + channelAxis1); ChannelAxisTestData::print(info_c1.executor_->ctx().run_ctx, "blob 1 output grad", info_c1.executor_->GetBlob(BackwardInputs::bwd_out_grad_Grad)); // Save output grad to blob with new shape 2 grad_c2.save(info_c2.executor_->ctx().run_ctx, - 
info_c2.executor_->GetBlob(BackwardInputs::bwd_out_grad_Grad), channelAxis2); + info_c2.executor_->GetBlob(BackwardInputs::bwd_out_grad_Grad), + channelAxis2); ChannelAxisTestData::print(info_c2.executor_->ctx().run_ctx, "blob 2 output grad", info_c2.executor_->GetBlob(BackwardInputs::bwd_out_grad_Grad)); @@ -1453,12 +1433,14 @@ static void runChannelAxisTest( // // Transform operator 1's blob output to a normalized shape data_c1.load(info_c1.executor_->ctx().run_ctx, - info_c1.executor_->GetBlob(ForwardOutputs::kForOutData), channelAxis1); + info_c1.executor_->GetBlob(ForwardOutputs::kForOutData), + channelAxis1); ChannelAxisTestData::print("channel data 1", data_c1.channel_data_); // Transform operator 2's blob output to a normalized shape data_c2.load(info_c2.executor_->ctx().run_ctx, - info_c2.executor_->GetBlob(ForwardOutputs::kForOutData), channelAxis2); + info_c2.executor_->GetBlob(ForwardOutputs::kForOutData), + channelAxis2); ChannelAxisTestData::print("channel data 2", data_c2.channel_data_); // Compare the operators' output data while they're in a normalized shape @@ -1469,12 +1451,14 @@ static void runChannelAxisTest( // // Transform operator 1's input-grad blob to a normalized shape grad_c1.load(info_c1.executor_->ctx().run_ctx, - info_c1.executor_->GetBlob(BackwardOutputs::bwd_in_grad_Data), channelAxis1); + info_c1.executor_->GetBlob(BackwardOutputs::bwd_in_grad_Data), + channelAxis1); ChannelAxisTestData::print("input grad 1", grad_c1.channel_data_); // Transform operator 2's input-grad blob to a normalized shape grad_c2.load(info_c2.executor_->ctx().run_ctx, - info_c2.executor_->GetBlob(BackwardOutputs::bwd_in_grad_Data), channelAxis2); + info_c2.executor_->GetBlob(BackwardOutputs::bwd_in_grad_Data), + channelAxis2); ChannelAxisTestData::print("input grad 2", grad_c2.channel_data_); // Compare the operators' input grad data while they're in a normalized shape @@ -1483,13 +1467,14 @@ static void runChannelAxisTest( TEST(BATCH_NORM, TestChannelAxisSimple) { std::cout << std::endl << std::flush; - const size_t CHANNEL_COUNT = 4; - const int DEFAULT_AXIS = 1; - const int NEW_AXIS = -2; - const bool useSimpleData = true; // change to true sometimes for troubleshooting + const size_t CHANNEL_COUNT = 4; + const int DEFAULT_AXIS = 1; + const int NEW_AXIS = -2; + const bool useSimpleData = true; // change to true sometimes for troubleshooting const std::vector shape = {1, 2, 3}; // Check against base-case of channel axis position 1 - runChannelAxisTest(false, false, + runChannelAxisTest(false, + false, useglobalstats_kwargs_nocudnn, shape, DEFAULT_AXIS, @@ -1554,123 +1539,116 @@ TEST(BATCH_NORM, TestChannelAxis) { #if MXNET_USE_CUDA TEST(BATCH_NORM, Test2DForward2D_gpu) { - for (int type : v2_types) { - MSHADOW_REAL_TYPE_SWITCH_EX( - type, DType, AccReal, - { - TestBatchNormOperatorForward>( - true, - {BATCH_SIZE, CHANNELS, DH, DW}, - blank_kwargs); - TestBatchNormOperatorForward>( - true, - {BATCH_SIZE, CHANNELS, DH, DW}, - blank_kwargs_nocudnn); - }); + for (int type : v2_types) { + MSHADOW_REAL_TYPE_SWITCH_EX(type, DType, AccReal, { + TestBatchNormOperatorForward>( + true, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs); + TestBatchNormOperatorForward>( + true, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs_nocudnn); + }); } } TEST(BATCH_NORM, Test2DBackwardMixed_gpu_cpu) { - for (int type : v2_types) { - MSHADOW_REAL_TYPE_SWITCH_EX( - type, DType, AccReal, - { - const mxnet::TShape inputShape({1, 1, 2, 1}); - testForwardAndBackward>( + for (int type : v2_types) { + 
MSHADOW_REAL_TYPE_SWITCH_EX(type, DType, AccReal, { + const mxnet::TShape inputShape({1, 1, 2, 1}); + testForwardAndBackward>( false, true, inputShape, blank_kwargs); - testForwardAndBackward>( + testForwardAndBackward>( false, true, inputShape, blank_kwargs_nocudnn); - }); + }); } } TEST(BATCH_NORM, Test2DBackwardMixedComplex_gpu_cpu) { - for (int type : v2_types) { - MSHADOW_REAL_TYPE_SWITCH_EX( - type, DType, AccReal, - { - const mxnet::TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); - testForwardAndBackward>( + for (int type : v2_types) { + MSHADOW_REAL_TYPE_SWITCH_EX(type, DType, AccReal, { + const mxnet::TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); + testForwardAndBackward>( false, true, inputShape, blank_kwargs); - testForwardAndBackward>( + testForwardAndBackward>( false, true, inputShape, blank_kwargs_nocudnn); - }); + }); } } // nonfixgamma_kwargs TEST(BATCH_NORM, Test2DBackwardMixed_gpu_cpu_nfg) { - for (int type : v2_types) { - MSHADOW_REAL_TYPE_SWITCH_EX( - type, DType, AccReal, - { - const mxnet::TShape inputShape({1, 1, 2, 1}); - testForwardAndBackward>( + for (int type : v2_types) { + MSHADOW_REAL_TYPE_SWITCH_EX(type, DType, AccReal, { + const mxnet::TShape inputShape({1, 1, 2, 1}); + testForwardAndBackward>( false, true, inputShape, nonfixgamma_kwargs); - testForwardAndBackward>( + testForwardAndBackward>( false, true, inputShape, nonfixgamma_kwargs_nocudnn); - }); + }); } } TEST(BATCH_NORM, Test2DBackwardMixedComplex_gpu_cpu_nfg) { - for (int type : v2_types) { - MSHADOW_REAL_TYPE_SWITCH_EX( - type, DType, AccReal, - { - const mxnet::TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); - testForwardAndBackward>( + for (int type : v2_types) { + MSHADOW_REAL_TYPE_SWITCH_EX(type, DType, AccReal, { + const mxnet::TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); + testForwardAndBackward>( false, true, inputShape, nonfixgamma_kwargs); - testForwardAndBackward>( + testForwardAndBackward>( false, true, inputShape, nonfixgamma_kwargs_nocudnn); - }); + }); } } // useglobalstats_kwargs TEST(BATCH_NORM, Test2DBackwardMixed_gpu_cpu_ugs) { - for (int type : v2_types) { - MSHADOW_REAL_TYPE_SWITCH_EX( - type, DType, AccReal, - { - const mxnet::TShape inputShape({2, 3, 2, 2}); - testForwardAndBackward>( + for (int type : v2_types) { + MSHADOW_REAL_TYPE_SWITCH_EX(type, DType, AccReal, { + const mxnet::TShape inputShape({2, 3, 2, 2}); + testForwardAndBackward>( false, true, inputShape, useglobalstats_kwargs_nocudnn); - testForwardAndBackward>( + testForwardAndBackward>( false, true, inputShape, useglobalstats_kwargs); - }); + }); } } TEST(BATCH_NORM, Test2DBackwardMixedComplex_gpu_cpu_ugs) { - for (int type : v2_types) { - MSHADOW_REAL_TYPE_SWITCH_EX( - type, DType, AccReal, - { - const mxnet::TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); - testForwardAndBackward>( + for (int type : v2_types) { + MSHADOW_REAL_TYPE_SWITCH_EX(type, DType, AccReal, { + const mxnet::TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); + testForwardAndBackward>( false, true, inputShape, useglobalstats_kwargs); - testForwardAndBackward>( + testForwardAndBackward>( false, true, inputShape, useglobalstats_kwargs_nocudnn); - }); + }); } } #endif // MXNET_USE_CUDA #endif - diff --git a/tests/cpp/operator/coreop_perf.cc b/tests/cpp/operator/coreop_perf.cc index 14ef625e6915..42ddd926d497 100644 --- a/tests/cpp/operator/coreop_perf.cc +++ b/tests/cpp/operator/coreop_perf.cc @@ -33,13 +33,13 @@ using namespace mxnet; using kwargs_t = test::op::kwargs_t; -template +template static void 
RunCoreOpBidirectional(const bool isGPU, const kwargs_t& op_kwargs, - const char *op_name, - const char *backward_op_name = "") { + const char* op_name, + const char* backward_op_name = "") { const mxnet::TShape shape({5, 5}); - test::op::CoreOpExecutor op(isGPU, { shape }); + test::op::CoreOpExecutor op(isGPU, {shape}); op.set_verbose(false); op.Init(op.ArgsWithOpName(op_kwargs, op_name, backward_op_name)); @@ -56,38 +56,32 @@ static void RunCoreOpBidirectional(const bool isGPU, } } -template +template static void RunCoreOpTimingTest(const bool isGPU, const kwargs_t& op_kwargs, - const char *op_name, - const char *backward_op_name = "") { - const kwargs_t kwargs = test::op::CoreOpExecutor::ArgsWithOpName( - op_kwargs, op_name, backward_op_name); + const char* op_name, + const char* backward_op_name = "") { + const kwargs_t kwargs = + test::op::CoreOpExecutor::ArgsWithOpName(op_kwargs, op_name, backward_op_name); // prime code and cache before the performance runs test::op::CoreOperatorRunner runner; - runner.RunBidirectional(false, { {20, 3, 128, 128} }, kwargs, 1); + runner.RunBidirectional(false, {{20, 3, 128, 128}}, kwargs, 1); // Do the performance runs - std::vector shapes; + std::vector shapes; if (test::performance_run) { - shapes = { - {1, 1, 28, 28}, - {1, 3, 28, 28}, - {50, 1, 18, 32}, - {50, 3, 18, 32}, - {20, 3, 128, 128} - }; + shapes = {{1, 1, 28, 28}, {1, 3, 28, 28}, {50, 1, 18, 32}, {50, 3, 18, 32}, {20, 3, 128, 128}}; } else { shapes = { - {1, 1, 28, 28}, - {50, 3, 18, 32}, + {1, 1, 28, 28}, + {50, 3, 18, 32}, }; } - const char *pu = isGPU ? "GPU" : "CPU"; - for (const mxnet::TShape &shape : shapes) { - runner.TimingTest(std::string(op_name) + " Operator " + pu, isGPU, false, kwargs, - 2, 10, { shape }); + const char* pu = isGPU ? 
"GPU" : "CPU"; + for (const mxnet::TShape& shape : shapes) { + runner.TimingTest( + std::string(op_name) + " Operator " + pu, isGPU, false, kwargs, 2, 10, {shape}); } } @@ -96,11 +90,13 @@ static void RunCoreOpTimingTest(const bool isGPU, */ TEST(COREOP_PERF, ExecuteBidirectional) { std::cout << "NEGATIVE CLIP GRADIENT" << std::endl; - RunCoreOpBidirectional(false, { {"lr", "0.01" }, { "clip_gradient", "-1" } }, + RunCoreOpBidirectional(false, + {{"lr", "0.01"}, {"clip_gradient", "-1"}}, "sgd_mom_update", COREOP_BWD_OP_NAME_VALUE_NONE); std::cout << "POSITIVE CLIP GRADIENT" << std::endl; - RunCoreOpBidirectional(false, { {"lr", "0.01" }, { "clip_gradient", "1" } }, + RunCoreOpBidirectional(false, + {{"lr", "0.01"}, {"clip_gradient", "1"}}, "sgd_mom_update", COREOP_BWD_OP_NAME_VALUE_NONE); } @@ -110,11 +106,13 @@ TEST(COREOP_PERF, ExecuteBidirectional) { */ TEST(COREOP_PERF, TimingCPU) { std::cout << "NEGATIVE CLIP GRADIENT" << std::endl; - RunCoreOpTimingTest(false, { {"lr", "0.01" }, { "clip_gradient", "-1" } }, + RunCoreOpTimingTest(false, + {{"lr", "0.01"}, {"clip_gradient", "-1"}}, "sgd_mom_update", COREOP_BWD_OP_NAME_VALUE_NONE); std::cout << "POSITIVE CLIP GRADIENT" << std::endl; - RunCoreOpTimingTest(false, { {"lr", "0.01" }, { "clip_gradient", "1" } }, + RunCoreOpTimingTest(false, + {{"lr", "0.01"}, {"clip_gradient", "1"}}, "sgd_mom_update", COREOP_BWD_OP_NAME_VALUE_NONE); } @@ -125,13 +123,14 @@ TEST(COREOP_PERF, TimingCPU) { */ TEST(COREOP_PERF, TimingGPU) { std::cout << "NEGATIVE CLIP GRADIENT" << std::endl; - RunCoreOpTimingTest(true, { {"lr", "0.01" }, { "clip_gradient", "-1" } }, + RunCoreOpTimingTest(true, + {{"lr", "0.01"}, {"clip_gradient", "-1"}}, "sgd_mom_update", COREOP_BWD_OP_NAME_VALUE_NONE); std::cout << "POSITIVE CLIP GRADIENT" << std::endl; - RunCoreOpTimingTest(true, { {"lr", "0.01" }, { "clip_gradient", "1" } }, + RunCoreOpTimingTest(true, + {{"lr", "0.01"}, {"clip_gradient", "1"}}, "sgd_mom_update", COREOP_BWD_OP_NAME_VALUE_NONE); } #endif // MXNET_USE_CUDA == 1 - diff --git a/tests/cpp/operator/dnnl_operator_test.cc b/tests/cpp/operator/dnnl_operator_test.cc index 7e2233c9b449..e66fc56bab2c 100644 --- a/tests/cpp/operator/dnnl_operator_test.cc +++ b/tests/cpp/operator/dnnl_operator_test.cc @@ -559,8 +559,8 @@ void TestConcatOp(const OpAttrs& attrs, VerifyFunc verify_fn, bool backwards = f int dim = std::stoi(str_dim); if (dim >= in_arr.arr.shape().ndim()) continue; - float scale = backwards ? 1 / static_cast(attrs.num_outputs) - : static_cast(attrs.num_inputs); + float scale = backwards ? 1 / static_cast(attrs.num_outputs) : + static_cast(attrs.num_inputs); std::vector scale_vector(in_arr.arr.shape().ndim()); for (int i = 0; i < in_arr.arr.shape().ndim(); i++) diff --git a/tests/cpp/operator/dropout_perf.cc b/tests/cpp/operator/dropout_perf.cc index 2a1754e2606f..71aad4395caa 100644 --- a/tests/cpp/operator/dropout_perf.cc +++ b/tests/cpp/operator/dropout_perf.cc @@ -32,7 +32,7 @@ using namespace mxnet; typedef std::vector > kwargs_t; -const kwargs_t basic_dropout_args = { }; +const kwargs_t basic_dropout_args = {}; /*! 
 * \brief Generic bidirectional sanity test
@@ -42,10 +42,9 @@ TEST(DROPOUT_PERF, ExecuteBidirectional) {
   kwargs_t kwargs = basic_dropout_args;
   kwargs.push_back({"mode", "always"});
   test::op::CoreOperatorRunner<float> runner;
-  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
-                                                           "_backward_Dropout");
+  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout", "_backward_Dropout");
   runner.set_verbose(true);
-  runner.RunBidirectional(false, { shape }, kwargs, 1);
+  runner.RunBidirectional(false, {shape}, kwargs, 1);
 }

 /*!
@@ -53,32 +52,25 @@ TEST(DROPOUT_PERF, ExecuteBidirectional) {
  */
 TEST(DROPOUT_PERF, TimingCPU) {
   kwargs_t kwargs = basic_dropout_args;
-// Which math function is arbitrary since it will have roughly constant timing among approaches
+  // Which math function is arbitrary since it will have roughly constant timing among approaches
   kwargs.push_back({"mode", "always"});
   mxnet::TShape shape({10, 10, 10, 10});
   test::op::CoreOperatorRunner<float> runner;
-  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
-                                                           "_backward_Dropout");
-  runner.RunBidirectional(false, { shape }, kwargs, 1);
-  std::vector<mxnet::TShape> shapes;
+  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout", "_backward_Dropout");
+  runner.RunBidirectional(false, {shape}, kwargs, 1);
+  std::vector<mxnet::TShape> shapes;
   if (test::performance_run) {
-    shapes = {
-        {1, 1, 28, 28},
-        {1, 3, 28, 28},
-        {50, 1, 18, 32},
-        {50, 3, 18, 32},
-        {20, 3, 128, 128}
-    };
+    shapes = {{1, 1, 28, 28}, {1, 3, 28, 28}, {50, 1, 18, 32}, {50, 3, 18, 32}, {20, 3, 128, 128}};
   } else {
     shapes = {
-        {1, 1, 28, 28},
-        {50, 3, 18, 32},
+        {1, 1, 28, 28},
+        {50, 3, 18, 32},
     };
   }
-  for (const mxnet::TShape &shape : shapes) {
-    kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
-                                                             "_backward_Dropout");
-    runner.TimingTest("Dropout Operator CPU", false, false, kwargs, 2, 10, { shape }, false);
+  for (const mxnet::TShape& shape : shapes) {
+    kwargs =
+        test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout", "_backward_Dropout");
+    runner.TimingTest("Dropout Operator CPU", false, false, kwargs, 2, 10, {shape}, false);
   }
 }

@@ -92,20 +84,14 @@ TEST(DROPOUT_PERF, TimingGPU) {
   kwargs.push_back({"mode", "always"});
   mxnet::TShape shape({10, 10, 10, 10});
   test::op::CoreOperatorRunner<float> runner;
-  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
-                                                           "_backward_Dropout");
-  runner.RunBidirectional(false, { shape }, kwargs, 1);
-  std::vector<mxnet::TShape> shapes = {
-      {1, 1, 28, 28},
-      {1, 3, 28, 28},
-      {50, 1, 18, 32},
-      {50, 3, 18, 32},
-      {20, 3, 128, 128}
-  };
-  for (const mxnet::TShape &shape : shapes) {
-    kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
-                                                             "_backward_Dropout");
-    runner.TimingTest("Dropout Operator GPU", true, false, kwargs, 2, 10, { shape }, false);
+  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout", "_backward_Dropout");
+  runner.RunBidirectional(false, {shape}, kwargs, 1);
+  std::vector<mxnet::TShape> shapes = {
+      {1, 1, 28, 28}, {1, 3, 28, 28}, {50, 1, 18, 32}, {50, 3, 18, 32}, {20, 3, 128, 128}};
+  for (const mxnet::TShape& shape : shapes) {
+    kwargs =
+        test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout", "_backward_Dropout");
+    runner.TimingTest("Dropout Operator GPU", true, false, kwargs, 2, 10, {shape}, false);
   }
 }
 #endif  // MXNET_USE_CUDA == 1
diff --git a/tests/cpp/operator/fully_conn_perf.cc b/tests/cpp/operator/fully_conn_perf.cc
index 9fd70261dc93..b7bcde3f8c0e 100644
--- a/tests/cpp/operator/fully_conn_perf.cc
+++
b/tests/cpp/operator/fully_conn_perf.cc @@ -34,7 +34,7 @@ using namespace mxnet; typedef std::vector<std::pair<std::string, std::string> > kwargs_t; -const kwargs_t basic_fullyconn_args = { {"num_hidden", "250"}, {"no_bias", "true"} }; +const kwargs_t basic_fullyconn_args = {{"num_hidden", "250"}, {"no_bias", "true"}}; /*! * \brief Generic bidirectional sanity test */ @@ -44,9 +44,9 @@ TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) { kwargs_t kwargs = basic_fullyconn_args; test::op::CoreOperatorRunner<float> runner; runner.set_verbose(true); - kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected", - "_backward_FullyConnected"); - runner.RunBidirectional(false, { shape1, shape2 }, kwargs, 1); + kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName( + kwargs, "FullyConnected", "_backward_FullyConnected"); + runner.RunBidirectional(false, {shape1, shape2}, kwargs, 1); } /*! @@ -57,30 +57,23 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) { mxnet::TShape shape1({10, 10, 10, 10}); mxnet::TShape shape2({250, 1000}); test::op::CoreOperatorRunner<float> runner; - kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected", - "_backward_FullyConnected"); - runner.RunBidirectional(false, { shape1, shape2 }, kwargs, 1); - std::vector<mxnet::TShape> shapes; + kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName( + kwargs, "FullyConnected", "_backward_FullyConnected"); + runner.RunBidirectional(false, {shape1, shape2}, kwargs, 1); + std::vector<mxnet::TShape> shapes; if (test::performance_run) { - shapes = { - {1, 1, 28, 28}, - {1, 3, 28, 28}, - {50, 1, 18, 32}, - {50, 3, 18, 32}, - {20, 3, 128, 128} - }; + shapes = {{1, 1, 28, 28}, {1, 3, 28, 28}, {50, 1, 18, 32}, {50, 3, 18, 32}, {20, 3, 128, 128}}; } else { shapes = { - {1, 1, 28, 28}, - {50, 3, 18, 32}, + {1, 1, 28, 28}, + {50, 3, 18, 32}, }; } for (const mxnet::TShape& shape : shapes) { mxnet::TShape shape2({250, static_cast<nnvm::dim_t>(shape.ProdShape(1, shape.ndim()))}); - kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected", - "_backward_FullyConnected"); - runner.TimingTest("Fully connected CPU", false, false, kwargs, 2, 10, - { shape, shape2 }, false); + kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName( + kwargs, "FullyConnected", "_backward_FullyConnected"); + runner.TimingTest("Fully connected CPU", false, false, kwargs, 2, 10, {shape, shape2}, false); } } @@ -93,30 +86,23 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingGPU) { mxnet::TShape shape1({10, 10, 10, 10}); mxnet::TShape shape2({250, 1000}); test::op::CoreOperatorRunner<float> runner; - kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected", - "_backward_FullyConnected"); - runner.RunBidirectional(false, { shape1, shape2 }, kwargs, 1); - std::vector<mxnet::TShape> shapes; + kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName( + kwargs, "FullyConnected", "_backward_FullyConnected"); + runner.RunBidirectional(false, {shape1, shape2}, kwargs, 1); + std::vector<mxnet::TShape> shapes; if (test::performance_run) { - shapes = { - {1, 1, 28, 28}, - {1, 3, 28, 28}, - {50, 1, 18, 32}, - {50, 3, 18, 32}, - {20, 3, 128, 128} - }; + shapes = {{1, 1, 28, 28}, {1, 3, 28, 28}, {50, 1, 18, 32}, {50, 3, 18, 32}, {20, 3, 128, 128}}; } else { shapes = { - {1, 1, 28, 28}, - {50, 3, 18, 32}, + {1, 1, 28, 28}, + {50, 3, 18, 32}, }; } for (const mxnet::TShape& shape : shapes) { mxnet::TShape shape2({250, static_cast<nnvm::dim_t>(shape.ProdShape(1, shape.ndim()))}); - kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected", - "_backward_FullyConnected"); - runner.TimingTest("Fully connected GPU", true, false, kwargs, 2, 10, - { shape, shape2 },
false); + kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName( + kwargs, "FullyConnected", "_backward_FullyConnected"); + runner.TimingTest("Fully connected GPU", true, false, kwargs, 2, 10, {shape, shape2}, false); } } #endif // MXNET_USE_CUDA == 1 diff --git a/tests/cpp/operator/krprod_test.cc b/tests/cpp/operator/krprod_test.cc index 66ddddd771f8..df0812d59f32 100644 --- a/tests/cpp/operator/krprod_test.cc +++ b/tests/cpp/operator/krprod_test.cc @@ -36,16 +36,16 @@ using namespace mshadow; using namespace mshadow::expr; using DType = double; -#define EXPECT_DOUBLE_EQ_MATRIX(expected, actual) \ -{ \ - for (int i = 0; i < static_cast<int>(actual.size(0)); ++i) \ - for (int j = 0; j < static_cast<int>(actual.size(1)); ++j) \ - EXPECT_LE(std::abs(actual[i][j] - expected[i][j]), 1e-10); \ -} \ +#define EXPECT_DOUBLE_EQ_MATRIX(expected, actual) \ + { \ + for (int i = 0; i < static_cast<int>(actual.size(0)); ++i) \ + for (int j = 0; j < static_cast<int>(actual.size(1)); ++j) \ + EXPECT_LE(std::abs(actual[i][j] - expected[i][j]), 1e-10); \ + } TEST(row_wise_kronecker, OneInputMatrix) { // Input matrices of shape (2, 4) which is also the expected result - DType mat[8] {1, 2, 3, 4, 5, 6, 7, 8}; + DType mat[8]{1, 2, 3, 4, 5, 6, 7, 8}; // Make input tensors std::vector<Tensor<cpu, 2, DType> > ts_arr; @@ -64,12 +64,12 @@ TEST(row_wise_kronecker, OneInputMatrix) { TEST(row_wise_kronecker, TwoInputMatrices) { // Input matrices of shape (2, 3) and (2, 4) - DType mat1[6] {1, 2, 3, 4, 5, 6}; - DType mat2[8] {1, 2, 3, 4, 5, 6, 7, 8}; + DType mat1[6]{1, 2, 3, 4, 5, 6}; + DType mat2[8]{1, 2, 3, 4, 5, 6, 7, 8}; // Expect result of shape (2, 12) - DType expected[24] {1, 2, 3, 4, 2, 4, 6, 8, 3, 6, 9, 12, - 20, 24, 28, 32, 25, 30, 35, 40, 30, 36, 42, 48}; + DType expected[24]{1, 2, 3, 4, 2, 4, 6, 8, 3, 6, 9, 12, + 20, 24, 28, 32, 25, 30, 35, 40, 30, 36, 42, 48}; // Make input tensors std::vector<Tensor<cpu, 2, DType> > ts_arr; @@ -90,11 +90,11 @@ TEST(row_wise_kronecker, TwoInputMatrices) { TEST(row_wise_kronecker, TwoInputMatrices2) { // Input matrices of shape (2, 3) and (2, 1) - DType mat1[6] {1, 2, 3, 4, 5, 6}; - DType mat2[2] {1, 2}; + DType mat1[6]{1, 2, 3, 4, 5, 6}; + DType mat2[2]{1, 2}; // Expect result of shape (2, 3) - DType expected[6] {1, 2, 3, 8, 10, 12}; + DType expected[6]{1, 2, 3, 8, 10, 12}; // Make input tensors std::vector<Tensor<cpu, 2, DType> > ts_arr; @@ -117,9 +117,8 @@ TEST(row_wise_kronecker, ThreeInputMatrices) { std::default_random_engine generator; std::uniform_int_distribution<int> distribution(1, 6); - Tensor<cpu, 2, DType> in1(Shape2(3, 4)), in2(Shape2(3, 2)), - in3(Shape2(3, 3)), kr12(Shape2(3, 8)), kr13(Shape2(3, 24)), - result(Shape2(3, 24)); + Tensor<cpu, 2, DType> in1(Shape2(3, 4)), in2(Shape2(3, 2)), in3(Shape2(3, 3)), kr12(Shape2(3, 8)), + kr13(Shape2(3, 24)), result(Shape2(3, 24)); AllocSpace(&in1); AllocSpace(&in2); AllocSpace(&in3); @@ -127,8 +126,8 @@ TEST(row_wise_kronecker, ThreeInputMatrices) { AllocSpace(&kr13); AllocSpace(&result); - std::vector<Tensor<cpu, 2, DType> > ts_arr {in1, in2, in3}; - for (auto & in : ts_arr) { + std::vector<Tensor<cpu, 2, DType> > ts_arr{in1, in2, in3}; + for (auto& in : ts_arr) { for (int i = 0; i < static_cast<int>(in.size(0)); ++i) for (int j = 0; j < static_cast<int>(in.size(1)); ++j) in[i][j] = distribution(generator); @@ -139,7 +138,7 @@ TEST(row_wise_kronecker, ThreeInputMatrices) { row_wise_kronecker(result, ts_arr); EXPECT_DOUBLE_EQ_MATRIX(kr13, result); - for (auto & in : ts_arr) + for (auto& in : ts_arr) FreeSpace(&in); FreeSpace(&kr12); FreeSpace(&kr13); @@ -150,9 +149,8 @@ TEST(row_wise_kronecker, ThreeInputMatrices2) { std::default_random_engine generator; std::uniform_int_distribution<int>
distribution(1, 6); - Tensor<cpu, 2, DType> in1(Shape2(3, 4)), in2(Shape2(3, 1)), - in3(Shape2(3, 3)), kr12(Shape2(3, 4)), kr13(Shape2(3, 12)), - result(Shape2(3, 12)); + Tensor<cpu, 2, DType> in1(Shape2(3, 4)), in2(Shape2(3, 1)), in3(Shape2(3, 3)), kr12(Shape2(3, 4)), + kr13(Shape2(3, 12)), result(Shape2(3, 12)); AllocSpace(&in1); AllocSpace(&in2); AllocSpace(&in3); @@ -160,8 +158,8 @@ TEST(row_wise_kronecker, ThreeInputMatrices2) { AllocSpace(&kr13); AllocSpace(&result); - std::vector<Tensor<cpu, 2, DType> > ts_arr {in1, in2, in3}; - for (auto & in : ts_arr) { + std::vector<Tensor<cpu, 2, DType> > ts_arr{in1, in2, in3}; + for (auto& in : ts_arr) { for (int i = 0; i < static_cast<int>(in.size(0)); ++i) for (int j = 0; j < static_cast<int>(in.size(1)); ++j) in[i][j] = distribution(generator); @@ -172,7 +170,7 @@ TEST(row_wise_kronecker, ThreeInputMatrices2) { row_wise_kronecker(result, ts_arr); EXPECT_DOUBLE_EQ_MATRIX(kr13, result); - for (auto & in : ts_arr) + for (auto& in : ts_arr) FreeSpace(&in); FreeSpace(&kr12); FreeSpace(&kr13); @@ -183,9 +181,8 @@ TEST(row_wise_kronecker, ThreeInputMatrices3) { std::default_random_engine generator; std::uniform_int_distribution<int> distribution(1, 6); - Tensor<cpu, 2, DType> in1(Shape2(3, 1)), in2(Shape2(3, 4)), - in3(Shape2(3, 3)), kr12(Shape2(3, 4)), kr13(Shape2(3, 12)), - result(Shape2(3, 12)); + Tensor<cpu, 2, DType> in1(Shape2(3, 1)), in2(Shape2(3, 4)), in3(Shape2(3, 3)), kr12(Shape2(3, 4)), + kr13(Shape2(3, 12)), result(Shape2(3, 12)); AllocSpace(&in1); AllocSpace(&in2); AllocSpace(&in3); @@ -193,8 +190,8 @@ TEST(row_wise_kronecker, ThreeInputMatrices3) { AllocSpace(&kr13); AllocSpace(&result); - std::vector<Tensor<cpu, 2, DType> > ts_arr {in1, in2, in3}; - for (auto & in : ts_arr) { + std::vector<Tensor<cpu, 2, DType> > ts_arr{in1, in2, in3}; + for (auto& in : ts_arr) { for (int i = 0; i < static_cast<int>(in.size(0)); ++i) for (int j = 0; j < static_cast<int>(in.size(1)); ++j) in[i][j] = distribution(generator); @@ -205,7 +202,7 @@ TEST(row_wise_kronecker, ThreeInputMatrices3) { row_wise_kronecker(result, ts_arr); EXPECT_DOUBLE_EQ_MATRIX(kr13, result); - for (auto & in : ts_arr) + for (auto& in : ts_arr) FreeSpace(&in); FreeSpace(&kr12); FreeSpace(&kr13); @@ -216,10 +213,9 @@ TEST(row_wise_kronecker, FourInputMatrices) { std::default_random_engine generator; std::uniform_int_distribution<int> distribution(1, 6); - Tensor<cpu, 2, DType> in1(Shape2(3, 47)), in2(Shape2(3, 1)), - in3(Shape2(3, 5)), in4(Shape2(3, 2173)), kr12(Shape2(3, 47)), - kr13(Shape2(3, 47 * 5)), kr14(Shape2(3, 47 * 5 * 2173)), - result(Shape2(3, 47 * 5 * 2173)); + Tensor<cpu, 2, DType> in1(Shape2(3, 47)), in2(Shape2(3, 1)), in3(Shape2(3, 5)), + in4(Shape2(3, 2173)), kr12(Shape2(3, 47)), kr13(Shape2(3, 47 * 5)), + kr14(Shape2(3, 47 * 5 * 2173)), result(Shape2(3, 47 * 5 * 2173)); AllocSpace(&in1); AllocSpace(&in2); AllocSpace(&in3); @@ -229,8 +225,8 @@ TEST(row_wise_kronecker, FourInputMatrices) { AllocSpace(&kr14); AllocSpace(&result); - std::vector<Tensor<cpu, 2, DType> > ts_arr {in1, in2, in3, in4}; - for (auto & in : ts_arr) { + std::vector<Tensor<cpu, 2, DType> > ts_arr{in1, in2, in3, in4}; + for (auto& in : ts_arr) { for (int i = 0; i < static_cast<int>(in.size(0)); ++i) for (int j = 0; j < static_cast<int>(in.size(1)); ++j) in[i][j] = distribution(generator); @@ -242,7 +238,7 @@ TEST(row_wise_kronecker, FourInputMatrices) { row_wise_kronecker(result, ts_arr); EXPECT_DOUBLE_EQ_MATRIX(kr14, result); - for (auto & in : ts_arr) + for (auto& in : ts_arr) FreeSpace(&in); FreeSpace(&kr12); FreeSpace(&kr13); @@ -250,11 +246,10 @@ TEST(row_wise_kronecker, FourInputMatrices) { FreeSpace(&result); } - #if MXNET_USE_LAPACK == 1 TEST(khatri_rao, OneInputMatrix) { // Input matrices of shape (2, 4) which is also the expected result -
DType mat[8] {1, 2, 3, 4, 5, 6, 7, 8}; + DType mat[8]{1, 2, 3, 4, 5, 6, 7, 8}; // Make input tensors std::vector<Tensor<cpu, 2, DType> > ts_arr; @@ -273,12 +268,12 @@ TEST(khatri_rao, OneInputMatrix) { TEST(khatri_rao, TwoInputMatrices) { // Input matrices of shape (3, 2) and (4, 2) - DType mat1[6] {1, 4, 2, 5, 3, 6}; - DType mat2[8] {1, 5, 2, 6, 3, 7, 4, 8}; + DType mat1[6]{1, 4, 2, 5, 3, 6}; + DType mat2[8]{1, 5, 2, 6, 3, 7, 4, 8}; // Expect result of shape (12, 2) - DType expected[24] {1, 20, 2, 24, 3, 28, 4, 32, 2, 25, 4, 30, - 6, 35, 8, 40, 3, 30, 6, 36, 9, 42, 12, 48}; + DType expected[24]{1, 20, 2, 24, 3, 28, 4, 32, 2, 25, 4, 30, + 6, 35, 8, 40, 3, 30, 6, 36, 9, 42, 12, 48}; // Make input tensors std::vector<Tensor<cpu, 2, DType> > ts_arr; @@ -301,9 +296,8 @@ TEST(khatri_rao, ThreeInputMatrices) { std::default_random_engine generator; std::uniform_int_distribution<int> distribution(1, 6); - Tensor<cpu, 2, DType> in1(Shape2(4, 3)), in2(Shape2(2, 3)), - in3(Shape2(3, 3)), kr12(Shape2(8, 3)), kr13(Shape2(24, 3)), - result(Shape2(24, 3)); + Tensor<cpu, 2, DType> in1(Shape2(4, 3)), in2(Shape2(2, 3)), in3(Shape2(3, 3)), kr12(Shape2(8, 3)), + kr13(Shape2(24, 3)), result(Shape2(24, 3)); AllocSpace(&in1); AllocSpace(&in2); AllocSpace(&in3); @@ -311,8 +305,8 @@ TEST(khatri_rao, ThreeInputMatrices) { AllocSpace(&kr13); AllocSpace(&result); - std::vector<Tensor<cpu, 2, DType> > ts_arr {in1, in2, in3}; - for (auto & in : ts_arr) { + std::vector<Tensor<cpu, 2, DType> > ts_arr{in1, in2, in3}; + for (auto& in : ts_arr) { for (int i = 0; i < static_cast<int>(in.size(0)); ++i) for (int j = 0; j < static_cast<int>(in.size(1)); ++j) in[i][j] = distribution(generator); @@ -323,7 +317,7 @@ TEST(khatri_rao, ThreeInputMatrices) { khatri_rao(result, ts_arr); EXPECT_DOUBLE_EQ_MATRIX(kr13, result); - for (auto & in : ts_arr) + for (auto& in : ts_arr) FreeSpace(&in); FreeSpace(&kr12); FreeSpace(&kr13); @@ -331,7 +325,7 @@ TEST(khatri_rao, ThreeInputMatrices) { } TEST(inv_khatri_rao, OneInputMatrixTransposed) { - DType mat[8] {1, 2, 3, 4, 5, 6, 7, 8}; + DType mat[8]{1, 2, 3, 4, 5, 6, 7, 8}; // Make input tensors std::vector<Tensor<cpu, 2, DType> > ts_arr; @@ -354,8 +348,8 @@ TEST(inv_khatri_rao, OneInputMatrixTransposed) { TEST(inv_khatri_rao, TwoInputMatrices) { // Input matrices of shape (3, 2) and (4, 2) - DType mat1[6] {1, 4, 2, 5, 3, 6}; - DType mat2[8] {1, 5, 2, 6, 3, 7, 4, 8}; + DType mat1[6]{1, 4, 2, 5, 3, 6}; + DType mat2[8]{1, 5, 2, 6, 3, 7, 4, 8}; // Make input tensors std::vector<Tensor<cpu, 2, DType> > ts_arr; @@ -382,8 +376,8 @@ TEST(inv_khatri_rao, TwoInputMatrices) { TEST(inv_khatri_rao, TwoInputMatricesTransposed) { // Transposed input matrices of shape (2, 3) and (2, 4) - DType mat1[6] {1, 2, 3, 4, 5, 6}; - DType mat2[8] {1, 2, 3, 4, 5, 6, 7, 8}; + DType mat1[6]{1, 2, 3, 4, 5, 6}; + DType mat2[8]{1, 2, 3, 4, 5, 6, 7, 8}; // Make input tensors std::vector<Tensor<cpu, 2, DType> > ts_arr; @@ -413,14 +407,13 @@ TEST(inv_khatri_rao, ThreeInputMatricesTranposed) { std::default_random_engine generator; std::uniform_int_distribution<int> distribution(1, 6); - Tensor<cpu, 2, DType> in1(Shape2(3, 4)), in2(Shape2(3, 2)), - in3(Shape2(3, 3)); + Tensor<cpu, 2, DType> in1(Shape2(3, 4)), in2(Shape2(3, 2)), in3(Shape2(3, 3)); AllocSpace(&in1); AllocSpace(&in2); AllocSpace(&in3); - std::vector<Tensor<cpu, 2, DType> > ts_arr {in1, in2, in3}; - for (auto & in : ts_arr) { + std::vector<Tensor<cpu, 2, DType> > ts_arr{in1, in2, in3}; + for (auto& in : ts_arr) { for (int i = 0; i < static_cast<int>(in.size(0)); ++i) for (int j = 0; j < static_cast<int>(in.size(1)); ++j) in[i][j] = distribution(generator); @@ -440,7 +433,7 @@ TEST(inv_khatri_rao, ThreeInputMatricesTranposed) { actual_dot = implicit_dot(implicit_dot(inv_kr, kr_t.T()), inv_kr); EXPECT_DOUBLE_EQ_MATRIX(inv_kr, actual_dot); - for (auto
& in : ts_arr) + for (auto& in : ts_arr) FreeSpace(&in); FreeSpace(&inv_kr); FreeSpace(&kr_t); diff --git a/tests/cpp/operator/runner/core_op_runner_test.cc b/tests/cpp/operator/runner/core_op_runner_test.cc index 6e6cb91096fe..733d933c811d 100644 --- a/tests/cpp/operator/runner/core_op_runner_test.cc +++ b/tests/cpp/operator/runner/core_op_runner_test.cc @@ -39,19 +39,17 @@ using kwargs_t = test::op::kwargs_t; static const kwargs_t basic_args = {}; static const std::vector<std::pair<std::string, std::string>> test_unary_operators = { - { "relu", "" }, // Code can figure out what the backward op is for some - { "sigmoid", "" }, - { "sqrt", "" } -}; + {"relu", ""}, // Code can figure out what the backward op is for some + {"sigmoid", ""}, + {"sqrt", ""}}; static const std::vector<std::pair<std::string, std::string>> test_binary_operators = { - { "elemwise_add", "_backward_add" }, - { "elemwise_mul", "_backward_mul" } -}; + {"elemwise_add", "_backward_add"}, + {"elemwise_mul", "_backward_mul"}}; -template<typename TT> +template <typename TT> inline std::vector<TT> AsVect(const TT& t) { - return std::move(std::vector<TT>({ t })); + return std::move(std::vector<TT>({t})); } /*! @@ -62,8 +60,8 @@ TEST(CORE_OP_RUNNER, ExecuteBidirectionalSimpleUnaryList) { kwargs_t kwargs = basic_args; for (const std::pair<std::string, std::string>& i : test_unary_operators) { - const char *op_name = i.first.c_str(); - const char *backward_op_name = i.second.c_str(); + const char* op_name = i.first.c_str(); + const char* backward_op_name = i.second.c_str(); test::op::CoreOpExecutor<DType> op(false, AsVect(shape)); op.set_verbose(false); @@ -87,8 +85,8 @@ TEST(CORE_OP_RUNNER, ExecuteBidirectionalSimpleUnaryList) { */ TEST(CORE_OP_RUNNER, ExecuteBidirectionalList) { for (const std::pair<std::string, std::string>& i : test_binary_operators) { - const char *op_name = i.first.c_str(); - const char *backward_op_name = i.second.c_str(); + const char* op_name = i.first.c_str(); + const char* backward_op_name = i.second.c_str(); mxnet::TShape shape({5, 5}); kwargs_t kwargs = basic_args; @@ -114,12 +112,12 @@ TEST(CORE_OP_RUNNER, ExecuteBidirectionalList) { * \brief Execute bidirectional dot product, which has different shaped inputs and outputs */ TEST(CORE_OP_RUNNER, ExecuteBidirectionalDotProduct) { - const char *op_name = "dot"; - const char *backward_op_name = "_backward_dot"; + const char* op_name = "dot"; + const char* backward_op_name = "_backward_dot"; kwargs_t kwargs = basic_args; - test::op::CoreOpExecutor<float> op(false, { mxnet::TShape({ 2, 3 }), mxnet::TShape({ 3, 2 }) }); + test::op::CoreOpExecutor<float> op(false, {mxnet::TShape({2, 3}), mxnet::TShape({3, 2})}); op.set_verbose(false); op.Init(op.ArgsWithOpName(kwargs, op_name, backward_op_name)); @@ -139,11 +137,14 @@ TEST(CORE_OP_RUNNER, ExecuteBidirectionalRunnerSimpleUnary) { typedef float DType; mxnet::TShape shape({5, 5}); for (const std::pair<std::string, std::string>& i : test_unary_operators) { - const char *op_name = i.first.c_str(); - const char *backward_op_name = i.second.c_str(); + const char* op_name = i.first.c_str(); + const char* backward_op_name = i.second.c_str(); test::op::CoreOperatorRunner<DType> runner; - runner.RunBidirectional(false, { shape }, test::op::CoreOpExecutor<DType>::ArgsWithOpName( - basic_args, op_name, backward_op_name), 1); + runner.RunBidirectional( + false, + {shape}, + test::op::CoreOpExecutor<DType>::ArgsWithOpName(basic_args, op_name, backward_op_name), + 1); } } @@ -151,11 +152,14 @@ TEST(CORE_OP_RUNNER, ExecuteBidirectionalRunner) { using DType = float; mxnet::TShape shape({5, 5}); for (const std::pair<std::string, std::string>& i : test_binary_operators) { - const char *op_name = i.first.c_str(); - const char *backward_op_name = i.second.c_str(); + const char*
op_name = i.first.c_str(); + const char* backward_op_name = i.second.c_str(); test::op::CoreOperatorRunner<DType> runner; - runner.RunBidirectional(false, { shape }, test::op::CoreOpExecutor<DType>::ArgsWithOpName( - basic_args, op_name, backward_op_name), 1); + runner.RunBidirectional( + false, + {shape}, + test::op::CoreOpExecutor<DType>::ArgsWithOpName(basic_args, op_name, backward_op_name), + 1); } } @@ -163,16 +167,15 @@ TEST(CORE_OP_RUNNER, ExecuteBidirectionalRunner) { * \brief Test RunBidirectional dot product, which has different shaped inputs and outputs */ TEST(CORE_OP_RUNNER, ExecuteBidirectionalRunnerDotProduct) { - using DType = float; - const char *op_name = "dot"; - const char *backward_op_name = "_backward_dot"; + using DType = float; + const char* op_name = "dot"; + const char* backward_op_name = "_backward_dot"; test::op::CoreOperatorRunner<DType> runner; - runner.RunBidirectional(false, - { mxnet::TShape({ 2, 3 }), mxnet::TShape({ 3, 2 }) }, - test::op::CoreOpExecutor<DType>::ArgsWithOpName(basic_args, - op_name, - backward_op_name), - 1); + runner.RunBidirectional( + false, + {mxnet::TShape({2, 3}), mxnet::TShape({3, 2})}, + test::op::CoreOpExecutor<DType>::ArgsWithOpName(basic_args, op_name, backward_op_name), + 1); } /*! @@ -181,64 +184,50 @@ TEST(CORE_OP_RUNNER, ExecuteBidirectionalRunnerDotProduct) { TEST(CORE_OP_RUNNER, TimingCPUSimpleUnary) { using DType = float; - const char *op_name = "relu"; + const char* op_name = "relu"; const kwargs_t kwargs = test::op::CoreOpExecutor<DType>::ArgsWithOpName(basic_args, op_name); test::op::CoreOperatorRunner<DType> runner; - runner.RunBidirectional(false, { mxnet::TShape({10, 10, 10, 10}) }, kwargs, 1); + runner.RunBidirectional(false, {mxnet::TShape({10, 10, 10, 10})}, kwargs, 1); - std::vector<mxnet::TShape> shapes; + std::vector<mxnet::TShape> shapes; if (test::performance_run) { - shapes = { - {1, 1, 28, 28}, - {1, 3, 28, 28}, - {50, 1, 18, 32}, - {50, 3, 18, 32}, - {20, 3, 128, 128} - }; + shapes = {{1, 1, 28, 28}, {1, 3, 28, 28}, {50, 1, 18, 32}, {50, 3, 18, 32}, {20, 3, 128, 128}}; } else { shapes = { - {1, 1, 28, 28}, - {50, 3, 18, 32}, + {1, 1, 28, 28}, + {50, 3, 18, 32}, }; } - for (const mxnet::TShape &shape : shapes) { - runner.TimingTest(std::string(op_name) + "Operator CPU", - false, false, kwargs, 2, 10, { shape }); + for (const mxnet::TShape& shape : shapes) { + runner.TimingTest(std::string(op_name) + "Operator CPU", false, false, kwargs, 2, 10, {shape}); } } TEST(CORE_OP_RUNNER, TimingCPUBinary) { using DType = float; - const char *op_name = "elemwise_add"; - const char *backward_op_name = "_backward_add"; + const char* op_name = "elemwise_add"; + const char* backward_op_name = "_backward_add"; - const kwargs_t kwargs = test::op::CoreOpExecutor<DType>::ArgsWithOpName( - basic_args, op_name, backward_op_name); + const kwargs_t kwargs = + test::op::CoreOpExecutor<DType>::ArgsWithOpName(basic_args, op_name, backward_op_name); test::op::CoreOperatorRunner<DType> runner; - runner.RunBidirectional(false, { mxnet::TShape({10, 10, 10, 10}) }, kwargs, 1); + runner.RunBidirectional(false, {mxnet::TShape({10, 10, 10, 10})}, kwargs, 1); - std::vector<mxnet::TShape> shapes; + std::vector<mxnet::TShape> shapes; if (test::performance_run) { - shapes = { - {1, 1, 28, 28}, - {1, 3, 28, 28}, - {50, 1, 18, 32}, - {50, 3, 18, 32}, - {20, 3, 128, 128} - }; + shapes = {{1, 1, 28, 28}, {1, 3, 28, 28}, {50, 1, 18, 32}, {50, 3, 18, 32}, {20, 3, 128, 128}}; } else { shapes = { - {1, 1, 28, 28}, - {50, 3, 18, 32}, + {1, 1, 28, 28}, + {50, 3, 18, 32}, }; } - for (const mxnet::TShape &shape : shapes) { - runner.TimingTest(std::string(op_name) + "Operator CPU", false, -
false, kwargs, 2, 10, { shape }); + for (const mxnet::TShape& shape : shapes) { + runner.TimingTest(std::string(op_name) + "Operator CPU", false, false, kwargs, 2, 10, {shape}); } } @@ -248,94 +237,83 @@ TEST(CORE_OP_RUNNER, TimingCPUBinary) { TEST(CORE_OP_RUNNER, TimingCPUBinaryDotProduct) { using DType = float; - const char *op_name = "dot"; - const char *backward_op_name = "_backward_dot"; + const char* op_name = "dot"; + const char* backward_op_name = "_backward_dot"; - const kwargs_t kwargs = test::op::CoreOpExecutor<DType>::ArgsWithOpName( - basic_args, op_name, backward_op_name); + const kwargs_t kwargs = + test::op::CoreOpExecutor<DType>::ArgsWithOpName(basic_args, op_name, backward_op_name); test::op::CoreOperatorRunner<DType> runner; - runner.RunBidirectional(false, { {2, 3}, {3, 2} }, kwargs, 1); // prime code and cache + runner.RunBidirectional(false, {{2, 3}, {3, 2}}, kwargs, 1); // prime code and cache - std::vector<mxnet::TShape> shapes; + std::vector<mxnet::TShape> shapes; if (test::performance_run) { - shapes = { {28, 28}, {18, 32}, {128, 24}, {128, 256} }; + shapes = {{28, 28}, {18, 32}, {128, 24}, {128, 256}}; } else { - shapes = { {28, 28}, {128, 24} }; + shapes = {{28, 28}, {128, 24}}; } mxnet::ShapeVector input_shapes(2); - for (const mxnet::TShape &shape : shapes) { + for (const mxnet::TShape& shape : shapes) { input_shapes[0] = shape; input_shapes[1] = mxnet::TShape({shape[1], shape[0]}); - runner.TimingTest(std::string(op_name) + " Operator CPU", false, - false, kwargs, 2, 10, input_shapes); + runner.TimingTest( + std::string(op_name) + " Operator CPU", false, false, kwargs, 2, 10, input_shapes); } } #if MXNET_USE_CUDA == 1 TEST(CORE_OP_RUNNER, TimingGPUSimpleUnary) { typedef float DType; - const char *op_name = "relu"; + const char* op_name = "relu"; const kwargs_t kwargs = test::op::CoreOpExecutor<DType>::ArgsWithOpName(basic_args, op_name); test::op::CoreOperatorRunner<DType> runner; runner.RunBidirectional(false, - { mxnet::TShape({10, 10, 10, 10}) }, + {mxnet::TShape({10, 10, 10, 10})}, kwargs, 1); // prime code and cache - std::vector<mxnet::TShape> shapes; + std::vector<mxnet::TShape> shapes; if (test::performance_run) { - shapes = { - {1, 1, 28, 28}, - {1, 3, 28, 28}, - {50, 1, 18, 32}, - {50, 3, 18, 32}, - {20, 3, 128, 128} - }; + shapes = {{1, 1, 28, 28}, {1, 3, 28, 28}, {50, 1, 18, 32}, {50, 3, 18, 32}, {20, 3, 128, 128}}; } else { shapes = { - {1, 1, 28, 28}, - {50, 3, 18, 32}, + {1, 1, 28, 28}, + {50, 3, 18, 32}, }; } - for (const mxnet::TShape &shape : shapes) { - runner.TimingTest(std::string(op_name) + "Operator GPU", true, false, kwargs, 2, 10, { shape }); - }} + for (const mxnet::TShape& shape : shapes) { + runner.TimingTest(std::string(op_name) + "Operator GPU", true, false, kwargs, 2, 10, {shape}); + } +} TEST(CORE_OP_RUNNER, TimingGPUBinary) { typedef float DType; - const char *op_name = "elemwise_add"; - const char *backward_op_name = "_backward_add"; + const char* op_name = "elemwise_add"; + const char* backward_op_name = "_backward_add"; - const kwargs_t kwargs = test::op::CoreOpExecutor<DType>::ArgsWithOpName( - basic_args, op_name, backward_op_name); + const kwargs_t kwargs = + test::op::CoreOpExecutor<DType>::ArgsWithOpName(basic_args, op_name, backward_op_name); test::op::CoreOperatorRunner<DType> runner; runner.RunBidirectional(true, - { mxnet::TShape({10, 10, 10, 10}) }, + {mxnet::TShape({10, 10, 10, 10})}, kwargs, 1); // prime code and cache - std::vector<mxnet::TShape> shapes; + std::vector<mxnet::TShape> shapes; if (test::performance_run) { - shapes = { - {1, 1, 28, 28}, - {1, 3, 28, 28}, - {50, 1, 18, 32}, - {50, 3, 18, 32}, - {20, 3, 128, 128} - }; + shapes = {{1, 1, 28,
28}, {1, 3, 28, 28}, {50, 1, 18, 32}, {50, 3, 18, 32}, {20, 3, 128, 128}}; } else { shapes = { - {1, 1, 28, 28}, - {50, 3, 18, 32}, + {1, 1, 28, 28}, + {50, 3, 18, 32}, }; } - for (const mxnet::TShape &shape : shapes) { - runner.TimingTest(std::string(op_name) + "Operator GPU", true, false, kwargs, 2, 10, { shape }); + for (const mxnet::TShape& shape : shapes) { + runner.TimingTest(std::string(op_name) + "Operator GPU", true, false, kwargs, 2, 10, {shape}); } } diff --git a/tests/cpp/operator/slice_channel_perf.cc b/tests/cpp/operator/slice_channel_perf.cc index 638613ea1ec9..6a3e622eb5f4 100644 --- a/tests/cpp/operator/slice_channel_perf.cc +++ b/tests/cpp/operator/slice_channel_perf.cc @@ -31,8 +31,8 @@ using namespace mxnet; -typedef std::vector<std::pair<std::string, std::string> > kwargs_t; -const kwargs_t basic_activation_args = { }; +typedef std::vector<std::pair<std::string, std::string>> kwargs_t; +const kwargs_t basic_activation_args = {}; /*! * \brief Generic bidirectional sanity test */ @@ -42,7 +42,7 @@ TEST(SLICE_CHANNEL_PERF, ExecuteBidirectional) { kwargs_t kwargs = basic_activation_args; kwargs.push_back({"num_outputs", "160"}); test::op::LegacyOpRunner<mxnet::op::SliceChannelProp, float, float> runner; - runner.RunBidirectional(false, { shape }, kwargs, 1); + runner.RunBidirectional(false, {shape}, kwargs, 1); } /*! @@ -53,26 +53,16 @@ TEST(SLICE_CHANNEL_PERF, TimingCPU) { // Which math function is arbitrary since it will have roughly constant timing among approaches kwargs.push_back({"num_outputs", "160"}); test::op::LegacyOpRunner<mxnet::op::SliceChannelProp, float, float> runner; - runner.RunBidirectional(false, - { mxnet::TShape({1, 160, 200}) }, - kwargs, 1); // prime code and cache - std::vector<mxnet::TShape> shapes; + runner.RunBidirectional( + false, {mxnet::TShape({1, 160, 200})}, kwargs, 1); // prime code and cache + std::vector<mxnet::TShape> shapes; if (test::performance_run) { - shapes = { - {1, 160, 200}, - {10, 160, 200}, - {100, 160, 200}, - {10, 160, 500}, - {100, 160, 500} - }; + shapes = {{1, 160, 200}, {10, 160, 200}, {100, 160, 200}, {10, 160, 500}, {100, 160, 500}}; } else { - shapes = { - {1, 160, 200}, - {1, 160, 200} - }; + shapes = {{1, 160, 200}, {1, 160, 200}}; } - for (const mxnet::TShape &shape : shapes) { - runner.TimingTest("SliceChannel Operator CPU", false, false, kwargs, 2, 10, { shape }); + for (const mxnet::TShape& shape : shapes) { + runner.TimingTest("SliceChannel Operator CPU", false, false, kwargs, 2, 10, {shape}); } } @@ -84,21 +74,13 @@ TEST(SLICE_CHANNEL_PERF, TimingGPU) { kwargs_t kwargs = basic_activation_args; // Which math function is arbitrary since it will have roughly constant timing among approaches kwargs.push_back({"num_outputs", "160"}); - test::OperatorRunner<mxnet::op::SliceChannelProp, test::op::LegacyOperatorExecutor<float, float>> runner; - runner.RunBidirectional(true, - { mxnet::TShape({1, 160, 200}) }, - kwargs, 1); // prime code and cache - std::vector<mxnet::TShape> shapes = { - {1, 160, 200}, - {1, 160, 200}, - {1, 160, 200}, - {1, 160, 200}, - {1, 160, 200} - }; - for (const mxnet::TShape &shape : shapes) { - runner.TimingTest("SliceChannel Operator GPU", true, false, kwargs, 2, 10, { shape }); + test::OperatorRunner<mxnet::op::SliceChannelProp, test::op::LegacyOperatorExecutor<float, float>> + runner; + runner.RunBidirectional(true, {mxnet::TShape({1, 160, 200})}, kwargs, 1); // prime code and cache + std::vector<mxnet::TShape> shapes = { + {1, 160, 200}, {1, 160, 200}, {1, 160, 200}, {1, 160, 200}, {1, 160, 200}}; + for (const mxnet::TShape& shape : shapes) { + runner.TimingTest("SliceChannel Operator GPU", true, false, kwargs, 2, 10, {shape}); } } #endif // MXNET_USE_CUDA == 1 - diff --git a/tests/cpp/operator/tune/operator_tune_test.cc b/tests/cpp/operator/tune/operator_tune_test.cc index 00a062698b17..7b78b0a6cd2a 100644 ---
a/tests/cpp/operator/tune/operator_tune_test.cc +++ b/tests/cpp/operator/tune/operator_tune_test.cc @@ -33,7 +33,7 @@ using namespace mxnet; */ TEST(OMP_TUNING, ShowAllTunedOps) { const std::unordered_set<std::string>& op_names = - mxnet::op::OperatorTune<float>::TunedOperatorNames(); + mxnet::op::OperatorTune<float>::TunedOperatorNames(); for (auto iter = op_names.begin(), e_iter = op_names.end(); iter != e_iter; ++iter) { std::cout << *iter << std::endl; } @@ -45,21 +45,19 @@ static std::vector<mxnet::ShapeVector> tuning_shapes() { std::vector<mxnet::ShapeVector> shapes; if (test::performance_run || test::csv) { shapes = { - {{1, 1, 28, 28}}, - {{1, 3, 28, 28}}, - {{50, 1, 18, 32}}, - {{25, 3, 64, 64}}, - {{10, 3, 128, 128}}, - {{20, 3, 128, 128}}, - {{30, 3, 128, 128}}, - {{30, 3, 256, 128}}, + {{1, 1, 28, 28}}, + {{1, 3, 28, 28}}, + {{50, 1, 18, 32}}, + {{25, 3, 64, 64}}, + {{10, 3, 128, 128}}, + {{20, 3, 128, 128}}, + {{30, 3, 128, 128}}, + {{30, 3, 256, 128}}, }; } else { - shapes = { - // Non-performance dataset acts as a sanity test - {{1, 1, 28, 28}}, - {{50, 3, 18, 32}} - }; + shapes = {// Non-performance dataset acts as a sanity test + {{1, 1, 28, 28}}, + {{50, 3, 18, 32}}}; } return shapes; } @@ -68,8 +66,8 @@ static std::vector<mxnet::ShapeVector> tuning_shapes() { * \brief Generic bidirectional sanity test */ TEST(OMP_TUNING, ExecuteBidirectional) { - test::op::BasicRunCoreOpBidirectional(false, true, {}, {tuning_shapes()[0]}, - "elemwise_add", "_backward_add"); + test::op::BasicRunCoreOpBidirectional( + false, true, {}, {tuning_shapes()[0]}, "elemwise_add", "_backward_add"); } /* Some test results: @@ -93,26 +91,20 @@ TEST(OMP_TUNING, ExecuteBidirectional) { * \brief Rune a tuning evaluation * \tparam DType Data type for which to evaluate tuning */ -template<typename DType> +template <typename DType> static float EvaluateTune(const bool verbose = true) { std::vector<std::pair<std::string, std::string>> binary_operators; if (test::csv) { - binary_operators = { - {"elemwise_add", COREOP_BWD_OP_NAME_VALUE_NONE} - }; + binary_operators = {{"elemwise_add", COREOP_BWD_OP_NAME_VALUE_NONE}}; } else if (test::performance_run) { - binary_operators = { - {"relu", ""}, // Code can figure out what the backward op is for some - {"sigmoid", ""}, - {"sqrt", ""}, - {"elemwise_add", "_backward_add"}, - {"elemwise_mul", "_backward_mul"}, - {"elemwise_div", "_backward_div"} - }; + binary_operators = {{"relu", ""}, // Code can figure out what the backward op is for some + {"sigmoid", ""}, + {"sqrt", ""}, + {"elemwise_add", "_backward_add"}, + {"elemwise_mul", "_backward_mul"}, + {"elemwise_div", "_backward_div"}}; } else { - binary_operators = { - {"elemwise_add", "_backward_add"} - }; + binary_operators = {{"elemwise_add", "_backward_add"}}; } std::vector<float> rates; for (size_t i = 0, n = binary_operators.size(); i < n; ++i) { @@ -120,18 +112,15 @@ static float EvaluateTune(const bool verbose = true) { tuningTester.set_calls_per_iteration(10); tuningTester.set_total_iterations(5); std::cout << "******************************" << std::endl; - std::cout << "Operators: " << binary_operators[i].first - << ", " << binary_operators[i].second - << " for type: " << test::type_name<DType>() - << std::endl; + std::cout << "Operators: " << binary_operators[i].first << ", " << binary_operators[i].second + << " for type: " << test::type_name<DType>() << std::endl; std::cout << "******************************" << std::endl; // Do the performance runs std::vector<mxnet::ShapeVector> shapes = tuning_shapes(); - tuningTester.TestTunedOperator({}, verbose, shapes, - binary_operators[i].first.c_str(), - binary_operators[i].second.c_str()); + tuningTester.TestTunedOperator( + {}, verbose, shapes,
binary_operators[i].first.c_str(), binary_operators[i].second.c_str()); rates.push_back(tuningTester.CalculateSuccessRate()); } return std::accumulate(rates.begin(), rates.end(), 0.0f) / rates.size(); @@ -175,4 +164,3 @@ TEST(OMP_TUNING, EvaluateTuneTestInt64) { } #endif // MXNET_USE_OPERATOR_TUNING - diff --git a/tests/cpp/storage/storage_test.cc b/tests/cpp/storage/storage_test.cc index 8cd7fd2e8569..ae33d9664ddd 100644 --- a/tests/cpp/storage/storage_test.cc +++ b/tests/cpp/storage/storage_test.cc @@ -20,7 +20,7 @@ * Copyright (c) 2017 by Contributors * \file storage_test.cc * \brief cpu/gpu storage tests -*/ + */ #include #include #include @@ -30,7 +30,7 @@ TEST(Storage, Basic_CPU) { constexpr size_t kSize = 1024; - auto&& storage = mxnet::Storage::Get(); + auto&& storage = mxnet::Storage::Get(); mxnet::Context context_cpu{}; auto&& handle = storage->Alloc(kSize, context_cpu); EXPECT_EQ(handle.ctx, context_cpu); @@ -48,7 +48,7 @@ TEST(Storage, Basic_CPU) { } TEST(Storage, CPU_MemAlign) { - #if MXNET_USE_ONEDNN == 1 +#if MXNET_USE_ONEDNN == 1 // DNNL requires special alignment. 64 is used by the DNNL library in // memory allocation. static constexpr size_t alignment_ = mxnet::kDNNLAlign; @@ -56,12 +56,12 @@ TEST(Storage, CPU_MemAlign) { static constexpr size_t alignment_ = 16; #endif - auto&& storage = mxnet::Storage::Get(); + auto&& storage = mxnet::Storage::Get(); mxnet::Context context_cpu = mxnet::Context::CPU(0); for (int i = 0; i < 5; ++i) { const size_t kSize = (std::rand() % 1024) + 1; - auto&& handle = storage->Alloc(kSize, context_cpu); + auto&& handle = storage->Alloc(kSize, context_cpu); EXPECT_EQ(handle.ctx, context_cpu); EXPECT_EQ(handle.size, kSize); EXPECT_EQ(reinterpret_cast(handle.dptr) % alignment_, 0); @@ -69,22 +69,21 @@ TEST(Storage, CPU_MemAlign) { } } - #if MXNET_USE_CUDA TEST(Storage_GPU, Basic_GPU) { if (mxnet::test::unitTestsWithCuda) { setenv("MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF", "20", 1); setenv("MXNET_GPU_MEM_POOL_TYPE", "Round", 1); - auto &&storage = mxnet::Storage::Get(); + auto&& storage = mxnet::Storage::Get(); mxnet::Context context_gpu = mxnet::Context::GPU(0); - auto &&handle = storage->Alloc(32, context_gpu); - auto &&handle2 = storage->Alloc(2097153, context_gpu); + auto&& handle = storage->Alloc(32, context_gpu); + auto&& handle2 = storage->Alloc(2097153, context_gpu); EXPECT_EQ(handle.ctx, context_gpu); EXPECT_EQ(handle.size, 32); EXPECT_EQ(handle2.ctx, context_gpu); EXPECT_EQ(handle2.size, 2097153); - auto ptr = handle.dptr; + auto ptr = handle.dptr; auto ptr2 = handle2.dptr; storage->Free(handle); storage->Free(handle2); @@ -109,10 +108,10 @@ TEST(Storage_GPU, Basic_GPU) { unsetenv("MXNET_GPU_MEM_POOL_TYPE"); } if (mxnet::test::unitTestsWithCuda) { - constexpr size_t kSize = 1024; + constexpr size_t kSize = 1024; mxnet::Context context_gpu = mxnet::Context::GPU(0); - auto &&storage = mxnet::Storage::Get(); - auto &&handle = storage->Alloc(kSize, context_gpu); + auto&& storage = mxnet::Storage::Get(); + auto&& handle = storage->Alloc(kSize, context_gpu); assert(handle.ctx == context_gpu); assert(handle.size == kSize); auto ptr = handle.dptr; @@ -129,4 +128,3 @@ TEST(Storage_GPU, Basic_GPU) { } } #endif // MXNET_USE_CUDA - diff --git a/tests/cpp/test_main.cc b/tests/cpp/test_main.cc index 4f91a4f67c09..69029ca3824d 100644 --- a/tests/cpp/test_main.cc +++ b/tests/cpp/test_main.cc @@ -22,7 +22,7 @@ * \file test_main.cc * \brief operator unit test utility functions * \author Chris Olivier -*/ + */ #include #include "mxnet/base.h" @@ -30,7 +30,8 
@@ #include static bool dumpCallback(const google_breakpad::MinidumpDescriptor& descriptor, - void* context, bool succeeded) { + void* context, + bool succeeded) { printf("Dump path: %s\n", descriptor.path()); return succeeded; } @@ -44,9 +45,9 @@ bool debug_output = false; #else bool debug_output = false; #endif -bool quick_test = false; -bool performance_run = false; -bool csv = false; +bool quick_test = false; +bool performance_run = false; +bool csv = false; bool thread_safety_force_cpu = false; } // namespace test } // namespace mxnet @@ -60,8 +61,8 @@ static bool checkForWorkingCuda() { for (int device = 0; device < device_count; ++device) { cudaDeviceProp prop; if (cudaSuccess == cudaGetDeviceProperties(&prop, device)) { - std::cout << "Found CUDA Device #: " << device << " properties: " << prop.major - << "." << prop.minor << std::endl; + std::cout << "Found CUDA Device #: " << device << " properties: " << prop.major << "." + << prop.minor << std::endl; workingCuda = true; } } @@ -80,7 +81,7 @@ void backtrace_test() { CHECK(false) << "backtrace()"; } -int main(int argc, char ** argv) { +int main(int argc, char** argv) { #ifdef USE_BREAKPAD google_breakpad::MinidumpDescriptor descriptor("/tmp"); google_breakpad::ExceptionHandler eh(descriptor, NULL, dumpCallback, NULL, true, -1); @@ -92,7 +93,7 @@ int main(int argc, char ** argv) { mxnet::test::unitTestsWithCuda = checkForWorkingCuda(); // auto-determine for (int x = 1; x < argc; ++x) { - const char *arg = argv[x]; + const char* arg = argv[x]; // force checks with CUDA if (!strcmp(arg, "--with-cuda")) { // override (ie force attempt CUDA) @@ -108,8 +109,8 @@ int main(int argc, char ** argv) { } else if (!strcmp(arg, "--thread-safety-with-cpu")) { mxnet::test::thread_safety_force_cpu = true; } else if (!strcmp(arg, "--backtrace")) { - backtrace_test(); - return 0; + backtrace_test(); + return 0; } } From 6ac1280ad0c74e9f78d76c79253205393f267fc2 Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Thu, 4 Nov 2021 09:01:11 +0100 Subject: [PATCH 03/10] [INCLUDE] Re-format .cc .h files --- include/mxnet/base.h | 100 +- include/mxnet/c_api.h | 1340 ++++++++++++------------ include/mxnet/c_api_error.h | 42 +- include/mxnet/c_api_test.h | 21 +- include/mxnet/engine.h | 53 +- include/mxnet/executor.h | 94 +- include/mxnet/expr_operator.h | 11 +- include/mxnet/imperative.h | 112 +- include/mxnet/io.h | 46 +- include/mxnet/ir/expr.h | 2 +- include/mxnet/kvstore.h | 50 +- include/mxnet/lib_api.h | 1286 ++++++++++++++--------- include/mxnet/libinfo.h | 9 +- include/mxnet/node/container.h | 66 +- include/mxnet/node/node.h | 10 +- include/mxnet/op_attr_types.h | 112 +- include/mxnet/operator.h | 121 +-- include/mxnet/operator_util.h | 105 +- include/mxnet/random_generator.h | 77 +- include/mxnet/resource.h | 74 +- include/mxnet/rtc.h | 18 +- include/mxnet/runtime/c_runtime_api.h | 27 +- include/mxnet/runtime/container.h | 43 +- include/mxnet/runtime/container_ext.h | 289 +++-- include/mxnet/runtime/data_type.h | 22 +- include/mxnet/runtime/ffi_helper.h | 40 +- include/mxnet/runtime/memory.h | 52 +- include/mxnet/runtime/ndarray.h | 2 +- include/mxnet/runtime/ndarray_handle.h | 4 +- include/mxnet/runtime/object.h | 193 ++-- include/mxnet/runtime/packed_func.h | 345 +++--- include/mxnet/runtime/py_arg.h | 3 +- include/mxnet/runtime/registry.h | 47 +- include/mxnet/storage.h | 20 +- include/mxnet/tensor_blob.h | 239 +++-- include/mxnet/tuple.h | 241 +++-- 36 files changed, 2827 insertions(+), 2489 deletions(-) mode change 100755 => 100644 
include/mxnet/tensor_blob.h diff --git a/include/mxnet/base.h b/include/mxnet/base.h index dc428da8e484..0934250fec80 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -36,7 +36,6 @@ #include "libinfo.h" #include "tuple.h" - /*! * \brief define dllexport for Visual Studio */ @@ -64,7 +63,7 @@ /*! \brief patch version */ #define MXNET_PATCH 0 /*! \brief mxnet version */ -#define MXNET_VERSION (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH) +#define MXNET_VERSION (MXNET_MAJOR * 10000 + MXNET_MINOR * 100 + MXNET_PATCH) /*! \brief helper for making version number */ #define MXNET_MAKE_VERSION(major, minor, patch) ((major)*10000 + (minor)*100 + patch) /*! @@ -91,8 +90,8 @@ using Op = nnvm::Op; struct Context { /*! \brief Type of device */ enum DeviceType { - kCPU = cpu::kDevMask, - kGPU = gpu::kDevMask, + kCPU = cpu::kDevMask, + kGPU = gpu::kDevMask, kCPUPinned = 3, kCPUShared = 5, }; @@ -107,14 +106,16 @@ struct Context { * \return cpu::kDevMask or gpu::kDevMask */ inline DeviceType dev_mask() const { - if (dev_type == kCPUPinned || dev_type == kCPUShared) return kCPU; + if (dev_type == kCPUPinned || dev_type == kCPUShared) + return kCPU; return dev_type; } /*! * \brief Returns dev_id for kGPU and kCPUPinned, 0 otherwise */ inline int real_dev_id() const { - if (dev_type == kCPUPinned || dev_type == kGPU) return dev_id; + if (dev_type == kCPUPinned || dev_type == kGPU) + return dev_id; return 0; } /*! @@ -122,13 +123,13 @@ struct Context { * \param b another context to compare * \return compared result */ - inline bool operator<(const Context &b) const; + inline bool operator<(const Context& b) const; /*! * \brief check if current context equals another one * \param b another context to compare * \return whether dev mask and id are same */ - inline bool operator==(const Context &b) const { + inline bool operator==(const Context& b) const { return dev_type == b.dev_type && dev_id == b.dev_id; } /*! @@ -136,14 +137,14 @@ struct Context { * \param b another context to compare * \return whether they are not the same */ - inline bool operator!=(const Context &b) const { + inline bool operator!=(const Context& b) const { return !(*this == b); } /*! * \brief save the content into binary stream * \param strm the output stream */ - inline void Save(dmlc::Stream *strm) const { + inline void Save(dmlc::Stream* strm) const { strm->Write(&dev_type, sizeof(dev_type)); strm->Write(&dev_id, sizeof(dev_id)); } @@ -152,9 +153,11 @@ struct Context { * \param strm the output stream * \return whether the load is successful */ - inline bool Load(dmlc::Stream *strm) { - if (strm->Read(&dev_type, sizeof(dev_type)) != sizeof(dev_type)) return false; - if (strm->Read(&dev_id, sizeof(int32_t)) != sizeof(int32_t)) return false; + inline bool Load(dmlc::Stream* strm) { + if (strm->Read(&dev_type, sizeof(dev_type)) != sizeof(dev_type)) + return false; + if (strm->Read(&dev_id, sizeof(int32_t)) != sizeof(int32_t)) + return false; return true; } /*! \brief the maximal device type */ @@ -197,7 +200,7 @@ struct Context { * \param total_mem pointer to the uint64_t holding total GPU memory * \return No return value */ - inline static void GetGPUMemoryInformation(int dev, uint64_t *free, uint64_t *total); + inline static void GetGPUMemoryInformation(int dev, uint64_t* free, uint64_t* total); /*! * Create a pinned CPU context. * \param dev_id the device id for corresponding GPU. 
@@ -219,10 +222,10 @@ struct Context { private: #if MXNET_USE_CUDA - static void CudaLibChecks(); + static void CudaLibChecks(); #endif #if MXNET_USE_CUDNN - static void CuDNNLibChecks(); + static void CuDNNLibChecks(); #endif }; @@ -234,19 +237,18 @@ class GPUAuxStream { * \brief constructor. * \param primary_stream gpu stream that is synced with the created auxiliary stream. */ - explicit GPUAuxStream(mshadow::Stream<gpu> *primary_stream) : - primary_stream_(primary_stream), - aux_stream_(primary_stream), - gpu_stream_sync_event_(nullptr) { + explicit GPUAuxStream(mshadow::Stream<gpu>* primary_stream) + : primary_stream_(primary_stream), + aux_stream_(primary_stream), + gpu_stream_sync_event_(nullptr) { if (Context::GetGPUStreamsPerWorker() >= 2) { // Create auxiliary stream on the same device with the same properties as the primary stream bool primary_has_blas_handle = primary_stream->blas_handle_ownership_ == mshadow::Stream<gpu>::OwnHandle; bool primary_has_dnn_handle = primary_stream->dnn_handle_ownership_ == mshadow::Stream<gpu>::OwnHandle; - aux_stream_ = mshadow::NewStream<gpu>(primary_has_blas_handle, - primary_has_dnn_handle, - primary_stream->dev_id); + aux_stream_ = mshadow::NewStream<gpu>( + primary_has_blas_handle, primary_has_dnn_handle, primary_stream->dev_id); MSHADOW_CUDA_CALL(cudaEventCreateWithFlags(&gpu_stream_sync_event_, cudaEventDisableTiming)); } } @@ -275,21 +277,23 @@ class GPUAuxStream { StreamSync(aux_stream_, primary_stream_, gpu_stream_sync_event_); } /*! \brief Getter for created auxiliary stream. */ - mshadow::Stream<gpu> *GetStream() { return aux_stream_; } + mshadow::Stream<gpu>* GetStream() { + return aux_stream_; + } /*! * \brief Make future work enqueued to `s2` wait on completion of current work enqueued to `s1`. * \param s1 stream with work that must be completed before future s2 work can begin. * \param s2 stream whose future work is made to wait on the completion of existing s1 work. * \param event used to pass s1 state to s2. */ - static void StreamSync(mshadow::Stream<gpu> *s1, mshadow::Stream<gpu> *s2, cudaEvent_t event) { + static void StreamSync(mshadow::Stream<gpu>* s1, mshadow::Stream<gpu>* s2, cudaEvent_t event) { MSHADOW_CUDA_CALL(cudaEventRecord(event, s1->stream_)); MSHADOW_CUDA_CALL(cudaStreamWaitEvent(s2->stream_, event, 0)); } private: - mshadow::Stream<gpu> *primary_stream_; - mshadow::Stream<gpu> *aux_stream_; + mshadow::Stream<gpu>* primary_stream_; + mshadow::Stream<gpu>* aux_stream_; cudaEvent_t gpu_stream_sync_event_; }; @@ -307,7 +311,7 @@ class SyncedGPUAuxStream { * \brief constructor. * \param gpu_aux_stream auxilary gpu stream that is managed by this RAII object. */ - explicit SyncedGPUAuxStream(GPUAuxStream *gpu_aux_stream) : gpu_aux_stream_(gpu_aux_stream) { + explicit SyncedGPUAuxStream(GPUAuxStream* gpu_aux_stream) : gpu_aux_stream_(gpu_aux_stream) { gpu_aux_stream_->PreAuxStreamUseSync(); } /*! \brief destructor */ @@ -328,7 +332,7 @@ class SyncedGPUAuxStream { } private: - GPUAuxStream *gpu_aux_stream_; + GPUAuxStream* gpu_aux_stream_; }; #endif // MXNET_USE_CUDA @@ -342,11 +346,11 @@ struct RunContext { /*! * \brief the stream of the device, can be nullptr or Stream* in GPU mode */ - void *stream; + void* stream; /*! * \brief the auxiliary stream of the device, can be nullptr or Stream* in GPU mode */ - void *aux_stream; + void* aux_stream; /*!
* \brief pointer to the cuda event pool used by the dependency engine */ @@ -356,7 +360,7 @@ struct RunContext { * \return the mshadow stream * \tparam xpu the device type of the stream */ - template<typename xpu> + template <typename xpu> inline mshadow::Stream<xpu>* get_stream() const { return static_cast<mshadow::Stream<xpu>*>(stream); } @@ -379,7 +383,7 @@ struct RunContext { //! \cond Doxygen_Suppress namespace mxnet { // implementing Context -inline bool Context::operator<(const Context &b) const { +inline bool Context::operator<(const Context& b) const { if (dev_type == b.dev_type) { return dev_id < b.dev_id; } else { @@ -389,7 +393,7 @@ inline bool Context::operator<(const Context &b) const { inline Context Context::Create(DeviceType dev_type, int32_t dev_id) { Context ctx; ctx.dev_type = dev_type; - ctx.dev_id = dev_id < 0 ? 0 : dev_id; + ctx.dev_id = dev_id < 0 ? 0 : dev_id; if (dev_type & kGPU) { #if MXNET_USE_CUDA CudaLibChecks(); @@ -461,8 +465,7 @@ inline int32_t Context::GetGPUStreamsPerWorker() { return num_streams; } -inline void Context::GetGPUMemoryInformation(int dev, uint64_t *free_mem, - uint64_t *total_mem) { +inline void Context::GetGPUMemoryInformation(int dev, uint64_t* free_mem, uint64_t* total_mem) { #if MXNET_USE_CUDA size_t memF, memT; @@ -481,12 +484,11 @@ inline void Context::GetGPUMemoryInformation(int dev, uint64_t *free_mem, e = cudaSetDevice(curDevice); CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e); - *free_mem = static_cast<uint64_t>(memF); + *free_mem = static_cast<uint64_t>(memF); *total_mem = static_cast<uint64_t>(memT); #else - LOG(FATAL) - << "This call is only supported for MXNet built with CUDA support."; + LOG(FATAL) << "This call is only supported for MXNet built with CUDA support."; #endif } @@ -496,10 +498,10 @@ inline Context Context::FromString(const std::string& str) { const std::string::size_type l = str.find('('); CHECK_NE(l, std::string::npos); const std::string::size_type r = str.find(')'); - CHECK_EQ(r, str.length()-1); + CHECK_EQ(r, str.length() - 1); const std::string type = str.substr(0, l); - int id = std::stoi(str.substr(l+1, r-l-1)); + int id = std::stoi(str.substr(l + 1, r - l - 1)); if (type == "cpu") { ret = CPU(id); } else if (type == "gpu") { @@ -517,7 +519,7 @@ inline Context Context::FromString(const std::string& str) { return ret; } -inline std::ostream& operator<<(std::ostream &out, const Context &ctx) { +inline std::ostream& operator<<(std::ostream& out, const Context& ctx) { if (ctx.dev_type == Context::kCPU) { out << "cpu("; } else if (ctx.dev_type == Context::kGPU) { @@ -535,10 +537,9 @@ inline std::ostream& operator<<(std::ostream &out, const Context &ctx) { // describe op registration point #define STRINGIZE_DETAIL(x) #x -#define STRINGIZE(x) STRINGIZE_DETAIL(x) +#define STRINGIZE(x) STRINGIZE_DETAIL(x) #define MXNET_DESCRIBE(...)
describe(__VA_ARGS__ "\n\nFrom:" __FILE__ ":" STRINGIZE(__LINE__)) -#define ADD_FILELINE "\n\nDefined in " __FILE__ ":L" STRINGIZE(__LINE__) - +#define ADD_FILELINE "\n\nDefined in " __FILE__ ":L" STRINGIZE(__LINE__) #if MXNET_USE_ONEDNN == 1 || MXNET_USE_INTGEMM == 1 constexpr size_t kDNNLAlign = 64; @@ -547,17 +548,18 @@ constexpr size_t kDNNLAlign = 64; } // namespace mxnet namespace std { -template<> struct hash<mxnet::Context> { +template <> +struct hash<mxnet::Context> { size_t operator()(const mxnet::Context& ctx) const { size_t res = 0; - res = dmlc::HashCombine(res, static_cast<size_t>(ctx.dev_type)); - res = dmlc::HashCombine(res, static_cast<size_t>(ctx.dev_id)); + res = dmlc::HashCombine(res, static_cast<size_t>(ctx.dev_type)); + res = dmlc::HashCombine(res, static_cast<size_t>(ctx.dev_id)); return res; } }; #if __cplusplus < 201402L && !defined(_MSC_VER) -template<typename T, typename... Args> +template <typename T, typename... Args> inline std::unique_ptr<T> make_unique(Args&&... args) { return std::unique_ptr<T>(new T(std::forward<Args>(args)...)); } diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 0aff74772c47..7611236e50e7 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -63,51 +63,51 @@ typedef int64_t dim_t; // will be casted internally to specific pointers types // these typedefs are mainly used for readablity reasons /*! \brief handle to NDArray */ -typedef void *NDArrayHandle; +typedef void* NDArrayHandle; /*! \brief handle to a mxnet narray function that changes NDArray */ -typedef const void *FunctionHandle; +typedef const void* FunctionHandle; /*! \brief handle to a function that takes param and creates symbol */ -typedef void *AtomicSymbolCreator; +typedef void* AtomicSymbolCreator; /*! \brief handle to cached operator */ -typedef void *CachedOpHandle; +typedef void* CachedOpHandle; /*! \brief handle to a symbol that can be bind as operator */ -typedef void *SymbolHandle; +typedef void* SymbolHandle; /*! \brief handle to a AtomicSymbol */ -typedef void *AtomicSymbolHandle; +typedef void* AtomicSymbolHandle; /*! \brief handle to an Executor */ -typedef void *ExecutorHandle; +typedef void* ExecutorHandle; /*! \brief handle a dataiter creator */ -typedef void *DataIterCreator; +typedef void* DataIterCreator; /*! \brief handle to a DataIterator */ -typedef void *DataIterHandle; +typedef void* DataIterHandle; /*! \brief handle a dataset creator */ -typedef void *DatasetCreator; +typedef void* DatasetCreator; /*! \brief handle to a Dataset */ -typedef void *DatasetHandle; +typedef void* DatasetHandle; /*! \brief handle to a BatchifyFunction creator*/ -typedef void *BatchifyFunctionCreator; +typedef void* BatchifyFunctionCreator; /*! \brief handle to a BatchifyFunction */ -typedef void *BatchifyFunctionHandle; +typedef void* BatchifyFunctionHandle; /*! \brief handle to KVStore */ -typedef void *KVStoreHandle; +typedef void* KVStoreHandle; /*! \brief handle to RecordIO */ -typedef void *RecordIOHandle; +typedef void* RecordIOHandle; /*! \brief handle to MXRtc*/ -typedef void *RtcHandle; +typedef void* RtcHandle; /*! \brief handle to rtc cuda module*/ -typedef void *CudaModuleHandle; +typedef void* CudaModuleHandle; /*! \brief handle to rtc cuda kernel*/ -typedef void *CudaKernelHandle; +typedef void* CudaKernelHandle; /*! \brief handle to a Profile object (domain, duration, counter, etc.) */ -typedef void *ProfileHandle; +typedef void* ProfileHandle; /*! \brief handle to DLManagedTensor*/ -typedef void *DLManagedTensorHandle; +typedef void* DLManagedTensorHandle; /*! \brief handle to Context */ -typedef const void *ContextHandle; +typedef const void* ContextHandle; /*!
\brief handle to Engine FnProperty */ -typedef const void *EngineFnPropertyHandle; +typedef const void* EngineFnPropertyHandle; /*! \brief handle to Engine VarHandle */ -typedef void *EngineVarHandle; +typedef void* EngineVarHandle; /*! \brief Engine asynchronous operation */ typedef void (*EngineAsyncFunc)(void*, void*, void*, void*); @@ -116,10 +116,7 @@ typedef void (*EngineSyncFunc)(void*, void*); /*! \brief Callback to free the param for EngineAsyncFunc/EngineSyncFunc */ typedef void (*EngineFuncParamDeleter)(void*); /*! \brief Monitor callback called at operator level for cached op */ -typedef void (*CachedOpMonitorCallback)(const char*, - const char*, - NDArrayHandle); - +typedef void (*CachedOpMonitorCallback)(const char*, const char*, NDArrayHandle); struct NativeOpInfo { void (*forward)(int, float**, int*, unsigned**, int*, void*); @@ -141,8 +138,7 @@ struct NDArrayOpInfo { bool (*infer_shape)(int, int*, unsigned**, void*); bool (*list_outputs)(char***, void*); bool (*list_arguments)(char***, void*); - bool (*declare_backward_dependency)(const int*, const int*, const int*, - int*, int**, void*); + bool (*declare_backward_dependency)(const int*, const int*, const int*, int*, int**, void*); // all functions also pass a payload void* pointer void* p_forward; void* p_backward; @@ -157,7 +153,7 @@ typedef int (*MXGenericCallback)(void); struct MXCallbackList { int num_callbacks; int (**callbacks)(void); - void **contexts; + void** contexts; }; struct LibFeature { @@ -165,11 +161,7 @@ struct LibFeature { bool enabled; }; -enum CustomOpCallbacks { - kCustomOpDelete, - kCustomOpForward, - kCustomOpBackward -}; +enum CustomOpCallbacks { kCustomOpDelete, kCustomOpForward, kCustomOpBackward }; enum CustomOpPropCallbacks { kCustomOpPropDelete, @@ -184,39 +176,50 @@ enum CustomOpPropCallbacks { kCustomOpPropBackwardInferStorageType }; - -typedef int (*CustomOpFBFunc)(int /*size*/, void** /*ptrs*/, int* /*tags*/, - const int* /*reqs*/, const int /*is_train*/, +typedef int (*CustomOpFBFunc)(int /*size*/, + void** /*ptrs*/, + int* /*tags*/, + const int* /*reqs*/, + const int /*is_train*/, void* /*state*/); typedef int (*CustomOpDelFunc)(void* /*state*/); typedef int (*CustomOpListFunc)(char*** /*args*/, void* /*state*/); -typedef int (*CustomOpInferShapeFunc)(int /*num_input*/, int* /*ndims*/, - int** /*shapes*/, void* /*state*/); +typedef int (*CustomOpInferShapeFunc)(int /*num_input*/, + int* /*ndims*/, + int** /*shapes*/, + void* /*state*/); typedef int (*CustomOpInferStorageTypeFunc)(int /*num_input*/, int* /*stypes*/, void* /*state*/); typedef int (*CustomOpBackwardInferStorageTypeFunc)(int /*num_input*/, - int * /*stypes*/, - int * /*tags*/, - void * /*state*/); + int* /*stypes*/, + int* /*tags*/, + void* /*state*/); typedef int (*CustomOpInferTypeFunc)(int /*num_input*/, int* /*types*/, void* /*state*/); -typedef int (*CustomOpBwdDepFunc)(const int* /*out_grad*/, const int* /*in_data*/, - const int* /*out_data*/, int* /*num_deps*/, - int** /*rdeps*/, void* /*state*/); -typedef int (*CustomOpCreateFunc)(const char* /*ctx*/, int /*num_inputs*/, - unsigned** /*shapes*/, const int* /*ndims*/, - const int* /*dtypes*/, struct MXCallbackList* /*ret*/, +typedef int (*CustomOpBwdDepFunc)(const int* /*out_grad*/, + const int* /*in_data*/, + const int* /*out_data*/, + int* /*num_deps*/, + int** /*rdeps*/, void* /*state*/); -typedef int (*CustomOpPropCreator)(const char* /*op_type*/, const int /*num_kwargs*/, - const char** /*keys*/, const char** /*values*/, +typedef int (*CustomOpCreateFunc)(const 
char* /*ctx*/, + int /*num_inputs*/, + unsigned** /*shapes*/, + const int* /*ndims*/, + const int* /*dtypes*/, + struct MXCallbackList* /*ret*/, + void* /*state*/); +typedef int (*CustomOpPropCreator)(const char* /*op_type*/, + const int /*num_kwargs*/, + const char** /*keys*/, + const char** /*values*/, struct MXCallbackList* /*ret*/); +enum CustomFunctionCallbacks { kCustomFunctionBackward, kCustomFunctionDelete }; -enum CustomFunctionCallbacks { - kCustomFunctionBackward, - kCustomFunctionDelete -}; - -typedef int (*CustomFunctionBwdFunc)(int /*num_ograds*/, int /*num_igrads*/, void** /*ptrs*/, - const int* /*reqs*/, const int /*is_train*/, +typedef int (*CustomFunctionBwdFunc)(int /*num_ograds*/, + int /*num_igrads*/, + void** /*ptrs*/, + const int* /*reqs*/, + const int /*is_train*/, void* /*state*/); typedef int (*CustomFunctionDelFunc)(void* /*state*/); @@ -229,7 +232,7 @@ typedef int (*CustomFunctionDelFunc)(void* /*state*/); * this function is threadsafe and can be called by different thread * \return error info */ -MXNET_DLL const char *MXGetLastError(); +MXNET_DLL const char* MXGetLastError(); //------------------------------------- // Part 0: Global State setups @@ -241,7 +244,7 @@ MXNET_DLL const char *MXGetLastError(); * \param 0 for quiet, 1 for verbose * \return 0 when success, -1 when failure happens. */ -MXNET_DLL int MXLoadLib(const char *path, unsigned verbose, void** lib); +MXNET_DLL int MXLoadLib(const char* path, unsigned verbose, void** lib); /*! * \brief Get list of features supported on the runtime @@ -249,7 +252,7 @@ MXNET_DLL int MXLoadLib(const char *path, unsigned verbose, void** lib); * \param size of the array * \return 0 when success, -1 when failure happens. */ -MXNET_DLL int MXLibInfoFeatures(const struct LibFeature **libFeature, size_t *size); +MXNET_DLL int MXLibInfoFeatures(const struct LibFeature** libFeature, size_t* size); /*! * \brief return whether the mxnet library is compiled with cxx11 abi @@ -299,7 +302,8 @@ MXNET_DLL int MXNotifyShutdown(); * \param kvstoreHandle handle to kvstore * \return 0 when success, -1 when failure happens. */ -MXNET_DLL int MXSetProcessProfilerConfig(int num_params, const char* const* keys, +MXNET_DLL int MXSetProcessProfilerConfig(int num_params, + const char* const* keys, const char* const* vals, KVStoreHandle kvstoreHandle); @@ -323,7 +327,8 @@ MXNET_DLL int MXSetProfilerConfig(int num_params, const char* const* keys, const * \param kvstoreHandle handle to kvstore, needed for server process profiling * \return 0 when success, -1 when failure happens. */ -MXNET_DLL int MXSetProcessProfilerState(int state, int profile_process, +MXNET_DLL int MXSetProcessProfilerState(int state, + int profile_process, KVStoreHandle kvStoreHandle); /*! @@ -353,7 +358,6 @@ MXNET_DLL int MXSetProfilerScope(const char* scope); */ MXNET_DLL int MXDumpProcessProfile(int finished, int profile_process, KVStoreHandle kvStoreHandle); - /*! * \brief Save profile and stop profiler for worker/current process * \param finished true if stat output should stop after this point @@ -372,8 +376,11 @@ MXNET_DLL int MXDumpProfile(int finished); * \return 0 when success, -1 when failure happens. * \note */ -MXNET_DLL int MXAggregateProfileStatsPrint(const char **out_str, int reset, int format, - int sort_by, int ascending); +MXNET_DLL int MXAggregateProfileStatsPrint(const char** out_str, + int reset, + int format, + int sort_by, + int ascending); /*! 
* \brief Pause profiler tuning collection @@ -399,7 +406,7 @@ MXNET_DLL int MXProfilePause(int paused); * \param out Return domain object * \return 0 when success, -1 when failure happens. */ -MXNET_DLL int MXProfileCreateDomain(const char *domain, ProfileHandle *out); +MXNET_DLL int MXProfileCreateDomain(const char* domain, ProfileHandle* out); /*! * \brief Create profile task @@ -408,9 +415,7 @@ MXNET_DLL int MXProfileCreateDomain(const char *domain, ProfileHandle *out); * \param out Output handle * \return 0 when success, -1 when failure happens. */ -MXNET_DLL int MXProfileCreateTask(ProfileHandle domain, - const char *task_name, - ProfileHandle *out); +MXNET_DLL int MXProfileCreateTask(ProfileHandle domain, const char* task_name, ProfileHandle* out); /*! * \brief Create profile frame @@ -420,8 +425,8 @@ MXNET_DLL int MXProfileCreateTask(ProfileHandle domain, * \return 0 when success, -1 when failure happens. */ MXNET_DLL int MXProfileCreateFrame(ProfileHandle domain, - const char *frame_name, - ProfileHandle *out); + const char* frame_name, + ProfileHandle* out); /*! * \brief Create profile event @@ -429,7 +434,7 @@ MXNET_DLL int MXProfileCreateFrame(ProfileHandle domain, * \param out Output handle * \return 0 when success, -1 when failure happens. */ -MXNET_DLL int MXProfileCreateEvent(const char *event_name, ProfileHandle *out); +MXNET_DLL int MXProfileCreateEvent(const char* event_name, ProfileHandle* out); /*! * \brief Create profile counter @@ -439,8 +444,8 @@ MXNET_DLL int MXProfileCreateEvent(const char *event_name, ProfileHandle *out); * \return 0 when success, -1 when failure happens. */ MXNET_DLL int MXProfileCreateCounter(ProfileHandle domain, - const char *counter_name, - ProfileHandle *out); + const char* counter_name, + ProfileHandle* out); /*! * \brief Destroy a frame @@ -487,8 +492,8 @@ MXNET_DLL int MXProfileAdjustCounter(ProfileHandle counter_handle, int64_t value * \return 0 when success, -1 when failure happens. */ MXNET_DLL int MXProfileSetMarker(ProfileHandle domain, - const char *instant_marker_name, - const char *scope); + const char* instant_marker_name, + const char* scope); /*! * \brief Set the number of OMP threads to use @@ -519,7 +524,7 @@ MXNET_DLL int MXGetGPUCount(int* out); * \param total_mem pointer to the integer holding total GPU memory * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXGetGPUMemoryInformation(int dev, int *free_mem, int *total_mem); +MXNET_DLL int MXGetGPUMemoryInformation(int dev, int* free_mem, int* total_mem); /*! * \brief get the free and total available memory on a GPU @@ -528,14 +533,14 @@ MXNET_DLL int MXGetGPUMemoryInformation(int dev, int *free_mem, int *total_mem); * \param total_mem pointer to the uint64_t holding total GPU memory * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXGetGPUMemoryInformation64(int dev, uint64_t *free_mem, uint64_t *total_mem); +MXNET_DLL int MXGetGPUMemoryInformation64(int dev, uint64_t* free_mem, uint64_t* total_mem); /*! * \brief get the MXNet library version as an integer * \param pointer to the integer holding the version number * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXGetVersion(int *out); +MXNET_DLL int MXGetVersion(int* out); /*! 
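
The query functions above all follow the same out-parameter convention; a short sketch (device 0 is assumed to exist once MXGetGPUCount reports at least one GPU):

#include <mxnet/c_api.h>
#include <inttypes.h>
#include <stdio.h>

static void print_runtime_info(void) {
  int version = 0, ngpu = 0;
  if (MXGetVersion(&version) == 0)
    printf("libmxnet version %d\n", version);
  if (MXGetGPUCount(&ngpu) == 0 && ngpu > 0) {
    uint64_t free_mem = 0, total_mem = 0;
    if (MXGetGPUMemoryInformation64(0, &free_mem, &total_mem) == 0)  /* device 0 */
      printf("gpu0: %" PRIu64 " of %" PRIu64 " bytes free\n", free_mem, total_mem);
  }
}

/*!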
* \brief Load TVM operator from the binary library @@ -543,7 +548,7 @@ MXNET_DLL int MXGetVersion(int *out); * \return 0 when success, -1 when failure happens */ #if MXNET_USE_TVM_OP -MXNET_DLL int MXLoadTVMOp(const char *libpath); +MXNET_DLL int MXLoadTVMOp(const char* libpath); struct OtherOptionEntity { int val; @@ -572,7 +577,6 @@ typedef struct ConfigSpaces { MXNET_DLL int MXLoadTVMConfig(ConfigSpaces config); #endif // MXNET_USE_TVM_OP - //------------------------------------- // Part 1: NDArray creation and deletion //------------------------------------- @@ -583,7 +587,7 @@ MXNET_DLL int MXLoadTVMConfig(ConfigSpaces config); * \param out the returning handle * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayCreateNone(NDArrayHandle *out); +MXNET_DLL int MXNDArrayCreateNone(NDArrayHandle* out); /*! * \brief create a NDArray with specified shape and data type @@ -599,13 +603,13 @@ MXNET_DLL int MXNDArrayCreateNone(NDArrayHandle *out); * \param out the returning handle * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayCreate(const uint32_t *shape, +MXNET_DLL int MXNDArrayCreate(const uint32_t* shape, uint32_t ndim, int dev_type, int dev_id, int delay_alloc, int dtype, - NDArrayHandle *out); + NDArrayHandle* out); #define MXNDArrayCreateEx MXNDArrayCreate // backward compatibility for external deps /*! @@ -622,13 +626,13 @@ MXNET_DLL int MXNDArrayCreate(const uint32_t *shape, * \param out the returning handle * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayCreate64(const int64_t *shape, +MXNET_DLL int MXNDArrayCreate64(const int64_t* shape, int ndim, int dev_type, int dev_id, int delay_alloc, int dtype, - NDArrayHandle *out); + NDArrayHandle* out); /*! * \brief create an empty sparse NDArray with specified shape and data type @@ -650,17 +654,17 @@ MXNET_DLL int MXNDArrayCreate64(const int64_t *shape, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXNDArrayCreateSparseEx(int storage_type, - const uint32_t *shape, + const uint32_t* shape, uint32_t ndim, int dev_type, int dev_id, int delay_alloc, int dtype, uint32_t num_aux, - int *aux_type, - uint32_t *aux_ndims, - const uint32_t *aux_shape, - NDArrayHandle *out); + int* aux_type, + uint32_t* aux_ndims, + const uint32_t* aux_shape, + NDArrayHandle* out); /*! * \brief create an empty sparse NDArray with specified shape and data type @@ -682,17 +686,17 @@ MXNET_DLL int MXNDArrayCreateSparseEx(int storage_type, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXNDArrayCreateSparseEx64(int storage_type, - const int64_t *shape, + const int64_t* shape, int ndim, int dev_type, int dev_id, int delay_alloc, int dtype, uint32_t num_aux, - int *aux_type, - int *aux_ndims, - const int64_t *aux_shape, - NDArrayHandle *out); + int* aux_type, + int* aux_ndims, + const int64_t* aux_shape, + NDArrayHandle* out); /*! * \brief create a NDArray handle that is loaded from raw bytes. @@ -701,9 +705,7 @@ MXNET_DLL int MXNDArrayCreateSparseEx64(int storage_type, * \param out the returning handle * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayLoadFromRawBytes(const void *buf, - size_t size, - NDArrayHandle *out); +MXNET_DLL int MXNDArrayLoadFromRawBytes(const void* buf, size_t size, NDArrayHandle* out); /*! * \brief save the NDArray into raw bytes. * \param handle the NDArray handle @@ -711,9 +713,7 @@ MXNET_DLL int MXNDArrayLoadFromRawBytes(const void *buf, * \param out_buf the head of returning memory bytes. 
 * \return 0 when success, -1 when failure happens
 */
-MXNET_DLL int MXNDArraySaveRawBytes(NDArrayHandle handle,
-                                    size_t *out_size,
-                                    const char **out_buf);
+MXNET_DLL int MXNDArraySaveRawBytes(NDArrayHandle handle, size_t* out_size, const char** out_buf);
 /*!
  * \brief Save a list of NDArrays into the file.
  * \param fname name of the file.
@@ -748,9 +748,9 @@ MXNET_DLL int MXNDArraySave(const char* fname,
  * \return 0 when success, -1 when failure happens
  */
 MXNET_DLL int MXNDArrayLoad(const char* fname,
-                            uint32_t *out_size,
+                            uint32_t* out_size,
                             NDArrayHandle** out_arr,
-                            uint32_t *out_name_size,
+                            uint32_t* out_name_size,
                             const char*** out_names);
 
 /*!
@@ -767,11 +767,11 @@ MXNET_DLL int MXNDArrayLoad(const char* fname,
  * \param out_names the names of returning NDArrays, can be NULL
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXNDArrayLoadFromBuffer(const void *ndarray_buffer,
+MXNET_DLL int MXNDArrayLoadFromBuffer(const void* ndarray_buffer,
                                       size_t size,
-                                      uint32_t *out_size,
+                                      uint32_t* out_size,
                                       NDArrayHandle** out_arr,
-                                      uint32_t *out_name_size,
+                                      uint32_t* out_name_size,
                                       const char*** out_names);
 
 /*!
@@ -785,9 +785,7 @@ MXNET_DLL int MXNDArrayLoadFromBuffer(const void *ndarray_buffer,
  * \param data the data source to copy from.
  * \param size the memory size we want to copy from.
  */
-MXNET_DLL int MXNDArraySyncCopyFromCPU(NDArrayHandle handle,
-                                       const void *data,
-                                       size_t size);
+MXNET_DLL int MXNDArraySyncCopyFromCPU(NDArrayHandle handle, const void* data, size_t size);
 /*!
  * \brief Perform a synchronous copy to a contiguous CPU memory region.
  *
@@ -799,9 +797,7 @@ MXNET_DLL int MXNDArraySyncCopyFromCPU(NDArrayHandle handle,
  * \param data the data source to copy into.
  * \param size the memory size we want to copy into.
  */
-MXNET_DLL int MXNDArraySyncCopyToCPU(NDArrayHandle handle,
-                                     void *data,
-                                     size_t size);
+MXNET_DLL int MXNDArraySyncCopyToCPU(NDArrayHandle handle, void* data, size_t size);
 /*!
  * \brief Copy src.data() to dst.data() if i = -1, else dst.aux_data(i) if i >= 0
@@ -864,7 +860,7 @@ MXNET_DLL int MXNDArrayFree(NDArrayHandle handle);
 MXNET_DLL int MXNDArraySlice(NDArrayHandle handle,
                              uint32_t slice_begin,
                              uint32_t slice_end,
-                             NDArrayHandle *out);
+                             NDArrayHandle* out);
 
 /*!
  * \brief Slice the NDArray along axis 0.
@@ -879,7 +875,7 @@ MXNET_DLL int MXNDArraySlice(NDArrayHandle handle,
 MXNET_DLL int MXNDArraySlice64(NDArrayHandle handle,
                                int64_t slice_begin,
                                int64_t slice_end,
-                               NDArrayHandle *out);
+                               NDArrayHandle* out);
 
 /*!
  * \brief Index the NDArray along axis 0.
@@ -890,9 +886,7 @@ MXNET_DLL int MXNDArraySlice64(NDArrayHandle handle,
  * \param out The NDArrayHandle of output NDArray
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXNDArrayAt(NDArrayHandle handle,
-                          uint32_t idx,
-                          NDArrayHandle *out);
+MXNET_DLL int MXNDArrayAt(NDArrayHandle handle, uint32_t idx, NDArrayHandle* out);
 
 /*!
  * \brief Index the NDArray along axis 0.
@@ -903,15 +897,12 @@ MXNET_DLL int MXNDArrayAt(NDArrayHandle handle,
  * \param out The NDArrayHandle of output NDArray
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXNDArrayAt64(NDArrayHandle handle,
-                            int64_t idx,
-                            NDArrayHandle *out);
+MXNET_DLL int MXNDArrayAt64(NDArrayHandle handle, int64_t idx, NDArrayHandle* out);
 
 /*!
  * \brief get the storage type of the array
  */
-MXNET_DLL int MXNDArrayGetStorageType(NDArrayHandle handle,
-                                      int *out_storage_type);
+MXNET_DLL int MXNDArrayGetStorageType(NDArrayHandle handle, int* out_storage_type);
 
 /*!
  * \brief Reshape the NDArray.
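
Putting the creation and copy calls above together, a minimal round trip through a CPU NDArray; dev_type 1 and dtype 0 are assumed here to select cpu/float32, and size is taken as the element count:

#include <mxnet/c_api.h>
#include <stdio.h>

static int roundtrip(void) {
  const uint32_t shape[2] = {2, 3};
  float src[6] = {1, 2, 3, 4, 5, 6}, dst[6] = {0};
  NDArrayHandle arr = NULL;
  /* dev_type 1 / dtype 0 assumed to mean cpu / float32; delay_alloc = 0 */
  if (MXNDArrayCreate(shape, 2, 1, 0, 0, 0, &arr) != 0 ||
      MXNDArraySyncCopyFromCPU(arr, src, 6) != 0 ||   /* 6 elements in */
      MXNDArraySyncCopyToCPU(arr, dst, 6) != 0) {     /* 6 elements out */
    fprintf(stderr, "%s\n", MXGetLastError());
    return -1;
  }
  return MXNDArrayFree(arr);
}
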
@@ -921,10 +912,7 @@ MXNET_DLL int MXNDArrayGetStorageType(NDArrayHandle handle, * \param out the NDArrayHandle of reshaped NDArray * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayReshape(NDArrayHandle handle, - int ndim, - int *dims, - NDArrayHandle *out); +MXNET_DLL int MXNDArrayReshape(NDArrayHandle handle, int ndim, int* dims, NDArrayHandle* out); /*! * \brief Reshape the NDArray. @@ -936,9 +924,9 @@ MXNET_DLL int MXNDArrayReshape(NDArrayHandle handle, */ MXNET_DLL int MXNDArrayReshape64(NDArrayHandle handle, int ndim, - dim_t *dims, + dim_t* dims, bool reverse, - NDArrayHandle *out); + NDArrayHandle* out); /*! * \brief get the shape of the array @@ -949,9 +937,7 @@ MXNET_DLL int MXNDArrayReshape64(NDArrayHandle handle, * \param out_pdata pointer holder to get data pointer of the shape * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayGetShape(NDArrayHandle handle, - int *out_dim, - const int **out_pdata); +MXNET_DLL int MXNDArrayGetShape(NDArrayHandle handle, int* out_dim, const int** out_pdata); /*! * \brief get the shape of the array @@ -962,9 +948,7 @@ MXNET_DLL int MXNDArrayGetShape(NDArrayHandle handle, * \param out_pdata pointer holder to get data pointer of the shape * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayGetShape64(NDArrayHandle handle, - int *out_dim, - const int64_t **out_pdata); +MXNET_DLL int MXNDArrayGetShape64(NDArrayHandle handle, int* out_dim, const int64_t** out_pdata); /*! * \brief get the content of the data in NDArray @@ -972,37 +956,35 @@ MXNET_DLL int MXNDArrayGetShape64(NDArrayHandle handle, * \param out_pdata pointer holder to get pointer of data * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayGetData(NDArrayHandle handle, - void **out_pdata); -/*! -* \brief Create a reference view of NDArray that -* represents as DLManagedTensor -* Notice: MXNet uses asynchronous execution. Please call MXNDArrayWaitToRead or -* MXNDArrayWaitToWrite before calling MXNDArrayToDLPack. -* \param handle the handle to the ndarray -* \param out_dlpack pointer holder to get pointer of DLManagedTensor -* \return 0 when success, -1 when failure happens -*/ -MXNET_DLL int MXNDArrayToDLPack(NDArrayHandle handle, - DLManagedTensorHandle *out_dlpack); - -/*! -* \brief Create a NDArray backed by a dlpack tensor. -* -* This allows us to create a NDArray using the memory -* allocated by an external deep learning framework -* that is DLPack compatible. -* -* The memory is retained until the NDArray went out of scope. -* -* \param dlpack the pointer of the input DLManagedTensor -* \param transient_handle whether the handle will be destructed before calling the deleter -* \param out_handle pointer holder to get pointer of NDArray -* \return 0 when success, -1 when failure happens -*/ +MXNET_DLL int MXNDArrayGetData(NDArrayHandle handle, void** out_pdata); +/*! + * \brief Create a reference view of NDArray that + * represents as DLManagedTensor + * Notice: MXNet uses asynchronous execution. Please call MXNDArrayWaitToRead or + * MXNDArrayWaitToWrite before calling MXNDArrayToDLPack. + * \param handle the handle to the ndarray + * \param out_dlpack pointer holder to get pointer of DLManagedTensor + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXNDArrayToDLPack(NDArrayHandle handle, DLManagedTensorHandle* out_dlpack); + +/*! + * \brief Create a NDArray backed by a dlpack tensor. 
+ * + * This allows us to create a NDArray using the memory + * allocated by an external deep learning framework + * that is DLPack compatible. + * + * The memory is retained until the NDArray went out of scope. + * + * \param dlpack the pointer of the input DLManagedTensor + * \param transient_handle whether the handle will be destructed before calling the deleter + * \param out_handle pointer holder to get pointer of NDArray + * \return 0 when success, -1 when failure happens + */ MXNET_DLL int MXNDArrayFromDLPack(DLManagedTensorHandle dlpack, const bool transient_handle, - NDArrayHandle *out_handle); + NDArrayHandle* out_handle); /*! * \brief Delete a dlpack tensor @@ -1017,8 +999,7 @@ MXNET_DLL int MXNDArrayCallDLPackDeleter(DLManagedTensorHandle dlpack); * \param out_dtype pointer holder to get type of data * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayGetDType(NDArrayHandle handle, - int *out_dtype); +MXNET_DLL int MXNDArrayGetDType(NDArrayHandle handle, int* out_dtype); /*! * \brief get the type of the ith aux data in NDArray @@ -1029,9 +1010,7 @@ MXNET_DLL int MXNDArrayGetDType(NDArrayHandle handle, * \param out_type pointer holder to get type of aux data * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayGetAuxType(NDArrayHandle handle, - uint32_t i, - int *out_type); +MXNET_DLL int MXNDArrayGetAuxType(NDArrayHandle handle, uint32_t i, int* out_type); /*! * \brief get the type of the ith aux data in NDArray @@ -1042,9 +1021,7 @@ MXNET_DLL int MXNDArrayGetAuxType(NDArrayHandle handle, * \param out_type pointer holder to get type of aux data * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayGetAuxType64(NDArrayHandle handle, - int64_t i, - int *out_type); +MXNET_DLL int MXNDArrayGetAuxType64(NDArrayHandle handle, int64_t i, int* out_type); /*! * \brief Get a deep copy of the ith aux data blob @@ -1053,9 +1030,7 @@ MXNET_DLL int MXNDArrayGetAuxType64(NDArrayHandle handle, * in the form of an NDArray of default storage type. * This function blocks. Do not use it in performance critical code. */ -MXNET_DLL int MXNDArrayGetAuxNDArray(NDArrayHandle handle, - uint32_t i, - NDArrayHandle *out); +MXNET_DLL int MXNDArrayGetAuxNDArray(NDArrayHandle handle, uint32_t i, NDArrayHandle* out); /*! * \brief Get a deep copy of the ith aux data blob @@ -1064,17 +1039,14 @@ MXNET_DLL int MXNDArrayGetAuxNDArray(NDArrayHandle handle, * in the form of an NDArray of default storage type. * This function blocks. Do not use it in performance critical code. */ -MXNET_DLL int MXNDArrayGetAuxNDArray64(NDArrayHandle handle, - int64_t i, - NDArrayHandle *out); +MXNET_DLL int MXNDArrayGetAuxNDArray64(NDArrayHandle handle, int64_t i, NDArrayHandle* out); /*! * \brief Get a deep copy of the data blob * in the form of an NDArray of default storage type. * This function blocks. Do not use it in performance critical code. */ -MXNET_DLL int MXNDArrayGetDataNDArray(NDArrayHandle handle, - NDArrayHandle *out); +MXNET_DLL int MXNDArrayGetDataNDArray(NDArrayHandle handle, NDArrayHandle* out); /*! 
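
A minimal sketch of the DLPack round trip described above; per the note on asynchronous execution, callers are expected to synchronize (e.g. via MXNDArrayWaitToRead) before handing the tensor to another framework:

#include <mxnet/c_api.h>
#include <stdbool.h>

/* Export an NDArray as a DLManagedTensor and re-import it.
 * transient_handle = false: the exported handle is assumed to stay valid
 * until its deleter runs (MXNDArrayCallDLPackDeleter can invoke it). */
static int dlpack_roundtrip(NDArrayHandle src, NDArrayHandle* dst) {
  DLManagedTensorHandle pack = NULL;
  if (MXNDArrayToDLPack(src, &pack) != 0)
    return -1;
  return MXNDArrayFromDLPack(pack, false, dst);
}

/*!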
* \brief get the context of the NDArray * \param handle the handle to the narray @@ -1082,21 +1054,19 @@ MXNET_DLL int MXNDArrayGetDataNDArray(NDArrayHandle handle, * \param out_dev_id the output device id * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayGetContext(NDArrayHandle handle, - int *out_dev_type, - int *out_dev_id); +MXNET_DLL int MXNDArrayGetContext(NDArrayHandle handle, int* out_dev_type, int* out_dev_id); /*! * \brief return gradient buffer attached to this NDArray * \param handle NDArray handle * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayGetGrad(NDArrayHandle handle, NDArrayHandle *out); +MXNET_DLL int MXNDArrayGetGrad(NDArrayHandle handle, NDArrayHandle* out); /*! * \brief detach and ndarray from computation graph by clearing entry_ * \param handle NDArray handle * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayDetach(NDArrayHandle handle, NDArrayHandle *out); +MXNET_DLL int MXNDArrayDetach(NDArrayHandle handle, NDArrayHandle* out); /*! * \brief set the flag for gradient array state. * \param handle NDArray handle @@ -1110,7 +1080,7 @@ MXNET_DLL int MXNDArraySetGradState(NDArrayHandle handle, int state); * \param state the new state. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayGetGradState(NDArrayHandle handle, int *out); +MXNET_DLL int MXNDArrayGetGradState(NDArrayHandle handle, int* out); //-------------------------------- // Part 2: functions on NDArray //-------------------------------- @@ -1121,8 +1091,7 @@ MXNET_DLL int MXNDArrayGetGradState(NDArrayHandle handle, int *out); * \param out_array the output function array * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXListFunctions(uint32_t *out_size, - FunctionHandle **out_array); +MXNET_DLL int MXListFunctions(uint32_t* out_size, FunctionHandle** out_array); /*! * \brief get the function handle by name @@ -1130,8 +1099,7 @@ MXNET_DLL int MXListFunctions(uint32_t *out_size, * \param out the corresponding function handle * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXGetFunction(const char *name, - FunctionHandle *out); +MXNET_DLL int MXGetFunction(const char* name, FunctionHandle* out); /*! * \brief Get the information of the function handle. * \param fun The function handle. @@ -1145,13 +1113,13 @@ MXNET_DLL int MXGetFunction(const char *name, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXFuncGetInfo(FunctionHandle fun, - const char **name, - const char **description, - uint32_t *num_args, - const char ***arg_names, - const char ***arg_type_infos, - const char ***arg_descriptions, - const char **return_type DEFAULT(NULL)); + const char** name, + const char** description, + uint32_t* num_args, + const char*** arg_names, + const char*** arg_type_infos, + const char*** arg_descriptions, + const char** return_type DEFAULT(NULL)); /*! * \brief get the argument requirements of the function * \param fun input function handle @@ -1163,10 +1131,10 @@ MXNET_DLL int MXFuncGetInfo(FunctionHandle fun, * \sa MXFuncInvoke */ MXNET_DLL int MXFuncDescribe(FunctionHandle fun, - uint32_t *num_use_vars, - uint32_t *num_scalars, - uint32_t *num_mutate_vars, - int *type_mask); + uint32_t* num_use_vars, + uint32_t* num_scalars, + uint32_t* num_mutate_vars, + int* type_mask); /*! 
* \brief invoke a function, the array size of passed in arguments * must match the values in the @@ -1181,12 +1149,12 @@ MXNET_DLL int MXFuncDescribe(FunctionHandle fun, * \sa MXFuncDescribeArgs */ MXNET_DLL int MXFuncInvoke(FunctionHandle fun, - NDArrayHandle *use_vars, - float *scalar_args, - NDArrayHandle *mutate_vars, + NDArrayHandle* use_vars, + float* scalar_args, + NDArrayHandle* mutate_vars, int num_params, - char **param_keys, - char **param_vals); + char** param_keys, + char** param_vals); /*! * \brief invoke a nnvm op and imperative function * \param creator the op @@ -1202,13 +1170,13 @@ MXNET_DLL int MXFuncInvoke(FunctionHandle fun, */ MXNET_DLL int MXImperativeInvoke(AtomicSymbolCreator creator, int num_inputs, - NDArrayHandle *inputs, - int *num_outputs, - NDArrayHandle **outputs, + NDArrayHandle* inputs, + int* num_outputs, + NDArrayHandle** outputs, int num_params, - const char **param_keys, - const char **param_vals, - const int **out_stypes); + const char** param_keys, + const char** param_vals, + const int** out_stypes); /*! * \brief set whether to record operator for autograd * \param is_recording 1 when recording, 0 when not recording. @@ -1270,25 +1238,23 @@ MXNET_DLL int MXSetIsNumpyDefaultDtype(bool dtype_flag, bool* prev); * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXAutogradMarkVariables(uint32_t num_var, - NDArrayHandle *var_handles, - uint32_t *reqs_array, - NDArrayHandle *grad_handles); + NDArrayHandle* var_handles, + uint32_t* reqs_array, + NDArrayHandle* grad_handles); /*! * \brief unmark nonleaf NDArrays to free the memory * \param num_var number of variable NDArrays * \param var_handles variable NDArrays * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXAutogradDropGrads(uint32_t num_var, - NDArrayHandle *var_handles); +MXNET_DLL int MXAutogradDropGrads(uint32_t num_var, NDArrayHandle* var_handles); /*! * \brief compute the gradient of outputs w.r.t variabels * \param num_output number of output NDArray * \param output_handles output NDArrays * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXAutogradComputeGradient(uint32_t num_output, - NDArrayHandle* output_handles); +MXNET_DLL int MXAutogradComputeGradient(uint32_t num_output, NDArrayHandle* output_handles); /*! * \brief compute the gradient of outputs w.r.t variabels * \param num_output number of output NDArray @@ -1313,21 +1279,21 @@ MXNET_DLL int MXAutogradBackward(uint32_t num_output, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXAutogradBackwardEx(uint32_t num_output, - NDArrayHandle *output_handles, - NDArrayHandle *ograd_handles, + NDArrayHandle* output_handles, + NDArrayHandle* ograd_handles, uint32_t num_variables, - NDArrayHandle *var_handles, + NDArrayHandle* var_handles, int retain_graph, int create_graph, int is_train, - NDArrayHandle **grad_handles, - int **grad_stypes); + NDArrayHandle** grad_handles, + int** grad_stypes); /* * \brief get the graph constructed by autograd. * \param handle ndarray handle * \param out output symbol handle */ -MXNET_DLL int MXAutogradGetSymbol(NDArrayHandle handle, SymbolHandle *out); +MXNET_DLL int MXAutogradGetSymbol(NDArrayHandle handle, SymbolHandle* out); /*! * \brief create cached operator, allows to choose thread_safe version @@ -1337,7 +1303,7 @@ MXNET_DLL int MXCreateCachedOp(SymbolHandle handle, int num_flags, const char** keys, const char** vals, - CachedOpHandle *out, + CachedOpHandle* out, bool thread_safe DEFAULT(false)); /*! 
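
A sketch of the cached-op life cycle around MXCreateCachedOp; default_dev_type 1 / dev_id 0 are assumed to select cpu(0), and no optimization flags are passed:

#include <mxnet/c_api.h>
#include <stdbool.h>

/* Build a CachedOp from a composed symbol and run it once. */
static int run_cached(SymbolHandle sym, int nin, NDArrayHandle* in) {
  CachedOpHandle op = NULL;
  int nout = 0;
  NDArrayHandle* out = NULL;        /* allocated by the library */
  const int* out_stypes = NULL;
  if (MXCreateCachedOp(sym, 0, NULL, NULL, &op, false) != 0)  /* no flags */
    return -1;
  if (MXInvokeCachedOp(op, nin, in, 1, 0, &nout, &out, &out_stypes) != 0)
    return -1;
  return MXFreeCachedOp(op);
}
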
@@ -1348,8 +1314,7 @@ MXNET_DLL int MXFreeCachedOp(CachedOpHandle handle); /*! * \brief get optimized graph from the cached op */ -MXNET_DLL int MXCachedOpGetOptimizedSymbol(CachedOpHandle handle, - SymbolHandle *out); +MXNET_DLL int MXCachedOpGetOptimizedSymbol(CachedOpHandle handle, SymbolHandle* out); /*! * \brief invoke a cached op @@ -1365,11 +1330,11 @@ MXNET_DLL int MXCachedOpGetOptimizedSymbol(CachedOpHandle handle, */ MXNET_DLL int MXInvokeCachedOp(CachedOpHandle handle, int num_inputs, - NDArrayHandle *inputs, + NDArrayHandle* inputs, int default_dev_type, int default_dev_id, - int *num_outputs, - NDArrayHandle **outputs, + int* num_outputs, + NDArrayHandle** outputs, const int** out_stypes); /*! @@ -1384,7 +1349,7 @@ MXNET_DLL int MXCachedOpRegisterOpHook(CachedOpHandle handle, * \param curr returns the current status. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayIsDeferredCompute(int *curr); +MXNET_DLL int MXNDArrayIsDeferredCompute(int* curr); /*! * \brief set whether to enable deferred compute mode @@ -1392,7 +1357,7 @@ MXNET_DLL int MXNDArrayIsDeferredCompute(int *curr); * \param prev returns the previous status before this set. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArraySetIsDeferredCompute(int deferred_compute_enabled, int *prev); +MXNET_DLL int MXNDArraySetIsDeferredCompute(int deferred_compute_enabled, int* prev); /*! * \brief Associate variables with deferred compute arrays @@ -1401,8 +1366,8 @@ MXNET_DLL int MXNDArraySetIsDeferredCompute(int deferred_compute_enabled, int *p * \param num number of arrays and variables respectively * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArraySetDeferredComputeVariable(NDArrayHandle *arrays, - SymbolHandle *variables, +MXNET_DLL int MXNDArraySetDeferredComputeVariable(NDArrayHandle* arrays, + SymbolHandle* variables, int num); /*! @@ -1413,9 +1378,9 @@ MXNET_DLL int MXNDArraySetDeferredComputeVariable(NDArrayHandle *arrays, * Construct a Symbol for the deferred computation graph. output_handles * specifies the outputs of interest which the returned symbol will compute. */ -MXNET_DLL int MXNDArrayGetDeferredComputeSymbol(NDArrayHandle *output_handles, +MXNET_DLL int MXNDArrayGetDeferredComputeSymbol(NDArrayHandle* output_handles, int num_outputs, - SymbolHandle *out); + SymbolHandle* out); /*! * \brief Clear the deferred compute info associated with the ndarrays. @@ -1423,7 +1388,7 @@ MXNET_DLL int MXNDArrayGetDeferredComputeSymbol(NDArrayHandle *output_handles, * \param num number of ndarrays * \return 0 when success, -1 otherwise */ -MXNET_DLL int MXNDArrayClearDeferredCompute(NDArrayHandle *arrays, int num); +MXNET_DLL int MXNDArrayClearDeferredCompute(NDArrayHandle* arrays, int num); //-------------------------------------------- // Part 3: symbolic configuration generation @@ -1434,8 +1399,7 @@ MXNET_DLL int MXNDArrayClearDeferredCompute(NDArrayHandle *arrays, int num); * \param out_array the output operator name array. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXListAllOpNames(uint32_t *out_size, - const char ***out_array); +MXNET_DLL int MXListAllOpNames(uint32_t* out_size, const char*** out_array); /*! 
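
MXListAllOpNames is the usual entry point for discovering operators; a short sketch (the returned string array is assumed to be owned by the library and must not be freed):

#include <mxnet/c_api.h>
#include <stdio.h>

static void dump_op_names(void) {
  uint32_t n = 0;
  const char** names = NULL;   /* assumed owned by the library */
  if (MXListAllOpNames(&n, &names) == 0)
    for (uint32_t i = 0; i < n; ++i)
      printf("%s\n", names[i]);
}

/*!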
* \brief list all the available AtomicSymbolEntry @@ -1443,16 +1407,14 @@ MXNET_DLL int MXListAllOpNames(uint32_t *out_size, * \param out_array the output AtomicSymbolCreator array * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolListAtomicSymbolCreators(uint32_t *out_size, - AtomicSymbolCreator **out_array); +MXNET_DLL int MXSymbolListAtomicSymbolCreators(uint32_t* out_size, AtomicSymbolCreator** out_array); /*! * \brief Get the name of an atomic symbol. * \param creator the AtomicSymbolCreator. * \param name The returned name of the creator. */ -MXNET_DLL int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator, - const char **name); +MXNET_DLL int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator, const char** name); /*! * \brief Get the input symbols of the graph. @@ -1460,8 +1422,7 @@ MXNET_DLL int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator, * \param inputs The input symbols of the graph. * \param input_size the number of input symbols returned. */ -MXNET_DLL int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle **inputs, - int *input_size); +MXNET_DLL int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle** inputs, int* input_size); /*! * \brief Cut a subgraph whose nodes are marked with a subgraph attribute. @@ -1472,8 +1433,7 @@ MXNET_DLL int MXSymbolGetInputSymbols(SymbolHandle sym, SymbolHandle **inputs, * \param inputs The nodes that connect to the subgraph. * \param input_size The number of such nodes. */ -MXNET_DLL int MXSymbolCutSubgraph(SymbolHandle sym, SymbolHandle **inputs, - int *input_size); +MXNET_DLL int MXSymbolCutSubgraph(SymbolHandle sym, SymbolHandle** inputs, int* input_size); /*! * \brief Get the detailed information about atomic symbol. @@ -1493,14 +1453,14 @@ MXNET_DLL int MXSymbolCutSubgraph(SymbolHandle sym, SymbolHandle **inputs, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXSymbolGetAtomicSymbolInfo(AtomicSymbolCreator creator, - const char **name, - const char **description, - uint32_t *num_args, - const char ***arg_names, - const char ***arg_type_infos, - const char ***arg_descriptions, - const char **key_var_num_args, - const char **return_type DEFAULT(NULL)); + const char** name, + const char** description, + uint32_t* num_args, + const char*** arg_names, + const char*** arg_type_infos, + const char*** arg_descriptions, + const char** key_var_num_args, + const char** return_type DEFAULT(NULL)); /*! * \brief Create an AtomicSymbol. * @@ -1516,16 +1476,16 @@ MXNET_DLL int MXSymbolGetAtomicSymbolInfo(AtomicSymbolCreator creator, */ MXNET_DLL int MXSymbolCreateAtomicSymbol(AtomicSymbolCreator creator, uint32_t num_param, - const char **keys, - const char **vals, - SymbolHandle *out); + const char** keys, + const char** vals, + SymbolHandle* out); /*! * \brief Create a Variable Symbol. * \param name name of the variable * \param out pointer to the created symbol handle * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolCreateVariable(const char *name, SymbolHandle *out); +MXNET_DLL int MXSymbolCreateVariable(const char* name, SymbolHandle* out); /*! 
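
A small sketch tying MXSymbolCreateVariable to the serialization calls declared further below (the returned JSON buffer is assumed to be managed by the library):

#include <mxnet/c_api.h>
#include <stdio.h>

static int show_variable(void) {
  SymbolHandle v = NULL;
  const char* json = NULL;     /* assumed managed by the library */
  if (MXSymbolCreateVariable("data", &v) != 0 ||
      MXSymbolSaveToJSON(v, &json) != 0)
    return -1;
  printf("%s\n", json);
  return MXSymbolFree(v);
}

/*!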
* \brief Create a Symbol by grouping list of symbols together * \param num_symbols number of symbols to be grouped @@ -1533,23 +1493,21 @@ MXNET_DLL int MXSymbolCreateVariable(const char *name, SymbolHandle *out); * \param out pointer to the created symbol handle * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolCreateGroup(uint32_t num_symbols, - SymbolHandle *symbols, - SymbolHandle *out); +MXNET_DLL int MXSymbolCreateGroup(uint32_t num_symbols, SymbolHandle* symbols, SymbolHandle* out); /*! * \brief Load a symbol from a json file. * \param fname the file name. * \param out the output symbol. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolCreateFromFile(const char *fname, SymbolHandle *out); +MXNET_DLL int MXSymbolCreateFromFile(const char* fname, SymbolHandle* out); /*! * \brief Load a symbol from a json string. * \param json the json string. * \param out the output symbol. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolCreateFromJSON(const char *json, SymbolHandle *out); +MXNET_DLL int MXSymbolCreateFromJSON(const char* json, SymbolHandle* out); /*! * \brief Remove the operators amp_cast and amp_multicast * \param sym_handle the input symbol. @@ -1563,14 +1521,14 @@ MXNET_DLL int MXSymbolRemoveAmpCast(SymbolHandle sym_handle, SymbolHandle* ret_s * \param fname the file name. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolSaveToFile(SymbolHandle symbol, const char *fname); +MXNET_DLL int MXSymbolSaveToFile(SymbolHandle symbol, const char* fname); /*! * \brief Save a symbol into a json string * \param symbol the input symbol. * \param out_json output json string. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolSaveToJSON(SymbolHandle symbol, const char **out_json); +MXNET_DLL int MXSymbolSaveToJSON(SymbolHandle symbol, const char** out_json); /*! * \brief Free the symbol handle. * \param symbol the symbol @@ -1583,14 +1541,14 @@ MXNET_DLL int MXSymbolFree(SymbolHandle symbol); * \param out used to hold the result of copy * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolCopy(SymbolHandle symbol, SymbolHandle *out); +MXNET_DLL int MXSymbolCopy(SymbolHandle symbol, SymbolHandle* out); /*! * \brief Print the content of symbol, used for debug. * \param symbol the symbol * \param out_str pointer to hold the output string of the printing. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolPrint(SymbolHandle symbol, const char **out_str); +MXNET_DLL int MXSymbolPrint(SymbolHandle symbol, const char** out_str); /*! * \brief Get string name from symbol * \param symbol the source symbol @@ -1598,9 +1556,7 @@ MXNET_DLL int MXSymbolPrint(SymbolHandle symbol, const char **out_str); * \param success Whether the result is contained in out. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolGetName(SymbolHandle symbol, - const char** out, - int *success); +MXNET_DLL int MXSymbolGetName(SymbolHandle symbol, const char** out, int* success); /*! * \brief Get string attribute from symbol * \param symbol the source symbol @@ -1609,13 +1565,11 @@ MXNET_DLL int MXSymbolGetName(SymbolHandle symbol, * \param success Whether the result is contained in out. 
* \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolGetAttr(SymbolHandle symbol, - const char* key, - const char** out, - int *success); +MXNET_DLL int MXSymbolGetAttr(SymbolHandle symbol, const char* key, const char** out, int* success); /*! * \brief Set string attribute from symbol. - * NOTE: Setting attribute to a symbol can affect the semantics(mutable/immutable) of symbolic graph. + * NOTE: Setting attribute to a symbol can affect the semantics(mutable/immutable) of symbolic + * graph. * * Safe recommendaton: use immutable graph * - Only allow set attributes during creation of new symbol as optional parameter @@ -1629,9 +1583,7 @@ MXNET_DLL int MXSymbolGetAttr(SymbolHandle symbol, * \param value The value to be saved. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolSetAttr(SymbolHandle symbol, - const char* key, - const char* value); +MXNET_DLL int MXSymbolSetAttr(SymbolHandle symbol, const char* key, const char* value); /*! * \brief Get all attributes from symbol, including all descendents. * \param symbol the source symbol @@ -1639,9 +1591,7 @@ MXNET_DLL int MXSymbolSetAttr(SymbolHandle symbol, * \param out 2*out_size strings representing key value pairs. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolListAttr(SymbolHandle symbol, - uint32_t *out_size, - const char*** out); +MXNET_DLL int MXSymbolListAttr(SymbolHandle symbol, uint32_t* out_size, const char*** out); /*! * \brief Get all attributes from symbol, excluding descendents. * \param symbol the source symbol @@ -1649,9 +1599,7 @@ MXNET_DLL int MXSymbolListAttr(SymbolHandle symbol, * \param out 2*out_size strings representing key value pairs. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolListAttrShallow(SymbolHandle symbol, - uint32_t *out_size, - const char*** out); +MXNET_DLL int MXSymbolListAttrShallow(SymbolHandle symbol, uint32_t* out_size, const char*** out); /*! * \brief List arguments in the symbol. * \param symbol the symbol @@ -1660,8 +1608,8 @@ MXNET_DLL int MXSymbolListAttrShallow(SymbolHandle symbol, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXSymbolListArguments(SymbolHandle symbol, - uint32_t *out_size, - const char ***out_str_array); + uint32_t* out_size, + const char*** out_str_array); /*! * \brief List returns in the symbol. @@ -1671,8 +1619,8 @@ MXNET_DLL int MXSymbolListArguments(SymbolHandle symbol, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXSymbolListOutputs(SymbolHandle symbol, - uint32_t *out_size, - const char ***out_str_array); + uint32_t* out_size, + const char*** out_str_array); /*! * \brief Get number of outputs of the symbol. @@ -1680,8 +1628,7 @@ MXNET_DLL int MXSymbolListOutputs(SymbolHandle symbol, * \param out_size number of outputs * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolGetNumOutputs(SymbolHandle symbol, - uint32_t *output_count); +MXNET_DLL int MXSymbolGetNumOutputs(SymbolHandle symbol, uint32_t* output_count); /*! * \brief Get a symbol that contains all the internals. @@ -1689,24 +1636,21 @@ MXNET_DLL int MXSymbolGetNumOutputs(SymbolHandle symbol, * \param out The output symbol whose outputs are all the internals. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolGetInternals(SymbolHandle symbol, - SymbolHandle *out); +MXNET_DLL int MXSymbolGetInternals(SymbolHandle symbol, SymbolHandle* out); /*! * \brief Get a symbol that contains all the inputs. 
* \param symbol The symbol * \param out The output symbol whose outputs are all the internals. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolGetInputs(SymbolHandle symbol, - SymbolHandle *out); +MXNET_DLL int MXSymbolGetInputs(SymbolHandle symbol, SymbolHandle* out); /*! * \brief Get a symbol that contains only direct children. * \param symbol The symbol * \param out The output symbol whose outputs are the direct children. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolGetChildren(SymbolHandle symbol, - SymbolHandle *out); +MXNET_DLL int MXSymbolGetChildren(SymbolHandle symbol, SymbolHandle* out); /*! * \brief Get index-th outputs of the symbol. * \param symbol The symbol @@ -1714,9 +1658,7 @@ MXNET_DLL int MXSymbolGetChildren(SymbolHandle symbol, * \param out The output symbol whose outputs are the index-th symbol. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolGetOutput(SymbolHandle symbol, - uint32_t index, - SymbolHandle *out); +MXNET_DLL int MXSymbolGetOutput(SymbolHandle symbol, uint32_t index, SymbolHandle* out); /*! * \brief List auxiliary states in the symbol. @@ -1726,8 +1668,8 @@ MXNET_DLL int MXSymbolGetOutput(SymbolHandle symbol, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXSymbolListAuxiliaryStates(SymbolHandle symbol, - uint32_t *out_size, - const char ***out_str_array); + uint32_t* out_size, + const char*** out_str_array); /*! * \brief Compose the symbol on other symbols. @@ -1744,7 +1686,7 @@ MXNET_DLL int MXSymbolListAuxiliaryStates(SymbolHandle symbol, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXSymbolCompose(SymbolHandle sym, - const char *name, + const char* name, uint32_t num_args, const char** keys, SymbolHandle* args); @@ -1757,97 +1699,82 @@ MXNET_DLL int MXSymbolCompose(SymbolHandle sym, * \param out the returned symbol that has gradient * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXSymbolGrad(SymbolHandle sym, - uint32_t num_wrt, - const char** wrt, - SymbolHandle* out); +MXNET_DLL int MXSymbolGrad(SymbolHandle sym, uint32_t num_wrt, const char** wrt, SymbolHandle* out); /*! * \brief infer shape of unknown input shapes given the known one. * The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data - * The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is positional. - * This api is available when MXNet is built with flag - * USE_INT64_TENSOR_SIZE=0 (by default) - * \param sym symbol handle - * \param num_args number of input arguments. - * \param keys the key of keyword args (optional) - * \param arg_ind_ptr the head pointer of the rows in CSR - * \param arg_shape_data the content of the CSR - * \param in_shape_size sizeof the returning array of in_shapes - * \param in_shape_ndim returning array of shape dimensions of eachs input shape. - * \param in_shape_data returning array of pointers to head of the input shape. - * \param out_shape_size sizeof the returning array of out_shapes - * \param out_shape_ndim returning array of shape dimensions of each output shape. - * \param out_shape_data returning array of pointers to head of the output shape. - * \param aux_shape_size sizeof the returning array of aux_shapes - * \param aux_shape_ndim returning array of shape dimensions of each auxiliary shape. - * \param aux_shape_data returning array of pointers to head of the auxiliary shape. 
- * \param complete whether infer shape completes or more information is needed.
- * \return 0 when success, -1 when failure happens
+ * The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is
+ * positional. This api is available when MXNet is built with flag
+ * USE_INT64_TENSOR_SIZE=0 (by default)
+ * \param sym symbol handle
+ * \param num_args number of input arguments.
+ * \param keys the key of keyword args (optional)
+ * \param arg_ind_ptr the head pointer of the rows in CSR
+ * \param arg_shape_data the content of the CSR
+ * \param in_shape_size sizeof the returning array of in_shapes
+ * \param in_shape_ndim returning array of shape dimensions of each input shape.
+ * \param in_shape_data returning array of pointers to head of the input shape.
+ * \param out_shape_size sizeof the returning array of out_shapes
+ * \param out_shape_ndim returning array of shape dimensions of each output shape.
+ * \param out_shape_data returning array of pointers to head of the output shape.
+ * \param aux_shape_size sizeof the returning array of aux_shapes
+ * \param aux_shape_ndim returning array of shape dimensions of each auxiliary shape.
+ * \param aux_shape_data returning array of pointers to head of the auxiliary shape.
+ * \param complete whether infer shape completes or more information is needed.
+ * \return 0 when success, -1 when failure happens
  */
 MXNET_DLL int MXSymbolInferShape(SymbolHandle sym,
                                  uint32_t num_args,
                                  const char** keys,
-                                 const uint32_t *arg_ind_ptr,
-                                 const int *arg_shape_data,
-                                 uint32_t *in_shape_size,
-                                 const int **in_shape_ndim,
-                                 const int ***in_shape_data,
-                                 uint32_t *out_shape_size,
-                                 const int **out_shape_ndim,
-                                 const int ***out_shape_data,
-                                 uint32_t *aux_shape_size,
-                                 const int **aux_shape_ndim,
-                                 const int ***aux_shape_data,
-                                 int *complete);
+                                 const uint32_t* arg_ind_ptr,
+                                 const int* arg_shape_data,
+                                 uint32_t* in_shape_size,
+                                 const int** in_shape_ndim,
+                                 const int*** in_shape_data,
+                                 uint32_t* out_shape_size,
+                                 const int** out_shape_ndim,
+                                 const int*** out_shape_data,
+                                 uint32_t* aux_shape_size,
+                                 const int** aux_shape_ndim,
+                                 const int*** aux_shape_data,
+                                 int* complete);
 
 /*!
  * \brief infer shape of unknown input shapes given the known one.
  *  The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data
- * The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is positional.
- * This api is available when MXNet is built with flag
- * USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support
- * \param sym symbol handle
- * \param num_args number of input arguments.
- * \param keys the key of keyword args (optional)
- * \param arg_ind_ptr the head pointer of the rows in CSR
- * \param arg_shape_data the content of the CSR
- * \param in_shape_size sizeof the returning array of in_shapes
- * \param in_shape_ndim returning array of shape dimensions of each input shape.
- * \param in_shape_data returning array of pointers to head of the input shape.
- * \param out_shape_size sizeof the returning array of out_shapes
- * \param out_shape_ndim returning array of shape dimensions of each output shape.
- * \param out_shape_data returning array of pointers to head of the output shape.
- * \param aux_shape_size sizeof the returning array of aux_shapes
- * \param aux_shape_ndim returning array of shape dimensions of each auxiliary shape.
- * \param aux_shape_data returning array of pointers to head of the auxiliary shape.
- * \param complete whether infer shape completes or more information is needed.
- * \return 0 when success, -1 when failure happens
+ * The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is
+ * positional. This api is available when MXNet is built with flag
+ * USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support
+ * \param sym symbol handle
+ * \param num_args number of input arguments.
+ * \param keys the key of keyword args (optional)
+ * \param arg_ind_ptr the head pointer of the rows in CSR
+ * \param arg_shape_data the content of the CSR
+ * \param in_shape_size sizeof the returning array of in_shapes
+ * \param in_shape_ndim returning array of shape dimensions of each input shape.
+ * \param in_shape_data returning array of pointers to head of the input shape.
+ * \param out_shape_size sizeof the returning array of out_shapes
+ * \param out_shape_ndim returning array of shape dimensions of each output shape.
+ * \param out_shape_data returning array of pointers to head of the output shape.
+ * \param aux_shape_size sizeof the returning array of aux_shapes
+ * \param aux_shape_ndim returning array of shape dimensions of each auxiliary shape.
+ * \param aux_shape_data returning array of pointers to head of the auxiliary shape.
+ * \param complete whether infer shape completes or more information is needed.
+ * \return 0 when success, -1 when failure happens
  */
 MXNET_DLL int MXSymbolInferShape64(SymbolHandle sym,
                                    uint32_t num_args,
                                    const char** keys,
-                                   const int64_t *arg_ind_ptr,
-                                   const int64_t *arg_shape_data,
-                                   size_t *in_shape_size,
-                                   const int **in_shape_ndim,
-                                   const int64_t ***in_shape_data,
-                                   size_t *out_shape_size,
-                                   const int **out_shape_ndim,
-                                   const int64_t ***out_shape_data,
-                                   size_t *aux_shape_size,
-                                   const int **aux_shape_ndim,
-                                   const int64_t ***aux_shape_data,
-                                   int *complete);
+                                   const int64_t* arg_ind_ptr,
+                                   const int64_t* arg_shape_data,
+                                   size_t* in_shape_size,
+                                   const int** in_shape_ndim,
+                                   const int64_t*** in_shape_data,
+                                   size_t* out_shape_size,
+                                   const int** out_shape_ndim,
+                                   const int64_t*** out_shape_data,
+                                   size_t* aux_shape_size,
+                                   const int** aux_shape_ndim,
+                                   const int64_t*** aux_shape_data,
+                                   int* complete);
 
 /*!
  * \brief partially infer shape of unknown input shapes given the known one.
  *
  *  Return partially inferred results if not all shapes could be inferred.
  *  The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data
- * The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is positional.
- * This api is available when MXNet is built with flag
- * USE_INT64_TENSOR_SIZE=0 (by default)
+ * The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is
+ * positional. This api is available when MXNet is built with flag USE_INT64_TENSOR_SIZE=0 (by
+ * default)
  *
  * \param sym symbol handle
  * \param num_args number of input arguments.
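
A worked example of the CSR packing shared by the MXSymbolInferShape family: for two positional inputs where arg 0 has shape (2, 3) and arg 1 is unknown, arg_ind_ptr holds num_args + 1 row offsets into arg_shape_data:

#include <mxnet/c_api.h>

static int infer(SymbolHandle sym) {
  /* positional call: keys == NULL; rows 0..1 of the CSR cover the two args */
  const uint32_t arg_ind_ptr[3] = {0, 2, 2};   /* arg0 -> [0,2), arg1 -> empty */
  const int arg_shape_data[2]   = {2, 3};      /* the two dimensions of arg0 */
  uint32_t in_n = 0, out_n = 0, aux_n = 0;
  const int *in_ndim = NULL, *out_ndim = NULL, *aux_ndim = NULL;
  const int **in_data = NULL, **out_data = NULL, **aux_data = NULL;
  int complete = 0;
  return MXSymbolInferShape(sym, 2, NULL, arg_ind_ptr, arg_shape_data,
                            &in_n, &in_ndim, &in_data,
                            &out_n, &out_ndim, &out_data,
                            &aux_n, &aux_ndim, &aux_data, &complete);
}
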
@@ -1869,27 +1796,27 @@ MXNET_DLL int MXSymbolInferShape64(SymbolHandle sym, MXNET_DLL int MXSymbolInferShapePartial(SymbolHandle sym, uint32_t num_args, const char** keys, - const uint32_t *arg_ind_ptr, - const int *arg_shape_data, - uint32_t *in_shape_size, - const int **in_shape_ndim, - const int ***in_shape_data, - uint32_t *out_shape_size, - const int **out_shape_ndim, - const int ***out_shape_data, - uint32_t *aux_shape_size, - const int **aux_shape_ndim, - const int ***aux_shape_data, - int *complete); + const uint32_t* arg_ind_ptr, + const int* arg_shape_data, + uint32_t* in_shape_size, + const int** in_shape_ndim, + const int*** in_shape_data, + uint32_t* out_shape_size, + const int** out_shape_ndim, + const int*** out_shape_data, + uint32_t* aux_shape_size, + const int** aux_shape_ndim, + const int*** aux_shape_data, + int* complete); /*! * \brief partially infer shape of unknown input shapes given the known one. * * Return partially inferred results if not all shapes could be inferred. * The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data - * The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is positional. - * This api is available when MXNet is built with flag - * USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support + * The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is + * positional. This api is available when MXNet is built with flag USE_INT64_TENSOR_SIZE=1 (not + * default) i.e. Large Tensor Support * * \param sym symbol handle * \param num_args number of input arguments. @@ -1911,23 +1838,24 @@ MXNET_DLL int MXSymbolInferShapePartial(SymbolHandle sym, MXNET_DLL int MXSymbolInferShapePartial64(SymbolHandle sym, uint32_t num_args, const char** keys, - const int64_t *arg_ind_ptr, - const int64_t *arg_shape_data, - size_t *in_shape_size, - const int **in_shape_ndim, - const int64_t ***in_shape_data, - size_t *out_shape_size, - const int **out_shape_ndim, - const int64_t ***out_shape_data, - size_t *aux_shape_size, - const int **aux_shape_ndim, - const int64_t ***aux_shape_data, - int *complete); + const int64_t* arg_ind_ptr, + const int64_t* arg_shape_data, + size_t* in_shape_size, + const int** in_shape_ndim, + const int64_t*** in_shape_data, + size_t* out_shape_size, + const int** out_shape_ndim, + const int64_t*** out_shape_data, + size_t* aux_shape_size, + const int** aux_shape_ndim, + const int64_t*** aux_shape_data, + int* complete); /*! * \brief infer type of unknown input types given the known one. * The types are packed into a CSR matrix represented by arg_ind_ptr and arg_type_data - * The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is positional. + * The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is + * positional. * * \param sym symbol handle * \param num_args numbe of input arguments. @@ -1945,21 +1873,22 @@ MXNET_DLL int MXSymbolInferShapePartial64(SymbolHandle sym, MXNET_DLL int MXSymbolInferType(SymbolHandle sym, uint32_t num_args, const char** keys, - const int *arg_type_data, - uint32_t *in_type_size, - const int **in_type_data, - uint32_t *out_type_size, - const int **out_type_data, - uint32_t *aux_type_size, - const int **aux_type_data, - int *complete); + const int* arg_type_data, + uint32_t* in_type_size, + const int** in_type_data, + uint32_t* out_type_size, + const int** out_type_data, + uint32_t* aux_type_size, + const int** aux_type_data, + int* complete); /*! 
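
Unlike the shape variant, the type interface takes one dtype code per argument rather than a CSR matrix; a minimal sketch (dtype code 0 is assumed to mean float32, -1 marks an unknown type):

#include <mxnet/c_api.h>

static int infer_types(SymbolHandle sym) {
  /* positional: type of arg0 known, arg1 left for inference */
  const int arg_type_data[2] = {0, -1};
  uint32_t in_n = 0, out_n = 0, aux_n = 0;
  const int *in_t = NULL, *out_t = NULL, *aux_t = NULL;
  int complete = 0;
  return MXSymbolInferType(sym, 2, NULL, arg_type_data,
                           &in_n, &in_t, &out_n, &out_t,
                           &aux_n, &aux_t, &complete);
}

/*!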
 * \brief partially infer type of unknown input types given the known one.
 *
 *  Return partially inferred results if not all types could be inferred.
 *  The types are packed into a CSR matrix represented by arg_ind_ptr and arg_type_data
- * The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is positional.
+ * The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is
+ * positional.
 *
 * \param sym symbol handle
 * \param num_args number of input arguments.
@@ -1977,14 +1906,14 @@ MXNET_DLL int MXSymbolInferType(SymbolHandle sym,
 MXNET_DLL int MXSymbolInferTypePartial(SymbolHandle sym,
                                        uint32_t num_args,
                                        const char** keys,
-                                       const int *arg_type_data,
-                                       uint32_t *in_type_size,
-                                       const int **in_type_data,
-                                       uint32_t *out_type_size,
-                                       const int **out_type_data,
-                                       uint32_t *aux_type_size,
-                                       const int **aux_type_data,
-                                       int *complete);
+                                       const int* arg_type_data,
+                                       uint32_t* in_type_size,
+                                       const int** in_type_data,
+                                       uint32_t* out_type_size,
+                                       const int** out_type_data,
+                                       uint32_t* aux_type_size,
+                                       const int** aux_type_data,
+                                       int* complete);
 
 /*!
  * \brief Convert a symbol into a quantized symbol where FP32 operators are replaced with INT8
 * \param sym_handle symbol to be converted
 * \param dev_type device type
 * \param num_excluded_sym_names number of layers excluded from being quantized in the input symbol
 * \param excluded_sym_names node names to be excluded from being quantized
- * \param num_excluded_op_names number of operators excluded from being quantized in the input symbol
- * \param excluded_op_names operator names to be excluded from being quantized
- * \param num_offline number of parameters that are quantized offline
- * \param offline_params array of c strings representing the names of params quantized offline
- * \param quantized_dtype the quantized destination type for input data
- * \param calib_quantize **Deprecated**. quantize op will always be calibrated if could
- * \param quantize_mode quantize mode to be used in quantize pass
- * \param quantize_granularity quantize granularity, tensor-wise or channel-wise
- * \param out_num_calib_names return the number of nodes to be calibrated
- * \param out_calib_names return the node names to be calibrated
+ * \param num_excluded_op_names number of operators excluded from being quantized in the input
+ *        symbol
+ * \param excluded_op_names operator names to be excluded from being quantized
+ * \param num_offline number of parameters that are quantized offline
+ * \param offline_params array of c strings representing the names of params quantized offline
+ * \param quantized_dtype the quantized destination type for input data
+ * \param calib_quantize **Deprecated**. quantize op will always be calibrated if could
+ * \param quantize_mode quantize mode to be used in quantize pass
+ * \param quantize_granularity quantize granularity, tensor-wise or channel-wise
+ * \param out_num_calib_names return the number of nodes to be calibrated
+ * \param out_calib_names return the node names to be calibrated
  */
 MXNET_DLL int MXQuantizeSymbol(SymbolHandle sym_handle,
-                               SymbolHandle *ret_sym_handle,
+                               SymbolHandle* ret_sym_handle,
                                const int* dev_type,
                                const uint32_t num_excluded_sym_names,
-                               const char **excluded_sym_names,
+                               const char** excluded_sym_names,
                                const uint32_t num_excluded_op_names,
-                               const char **excluded_op_names,
-                               const uint32_t num_offline, const char **offline_params,
-                               const char *quantized_dtype, const bool calib_quantize,
-                               const char *quantize_mode, const char *quantize_granularity,
-                               uint32_t* out_num_calib_names, const char ***out_calib_names);
-
-/*!
- * \brief Convert a symbol into a mixed precision symbol with cast operators for target dtype casting
- * \param sym_handle symbol to be converted
- * \param ret_sym_handle mixed precision symbol result
- * \param num_args number of arguments for known dtypes
- * \param arg_type_data arg types of the arguments
- * \param target_dtype target_dtype for mixed precision symbol
- * \param cast_optional_params whether to cast optional params to target_dtype
- * \param num_target_dtype_op_names number of ops to be casted to target_dtype
- * \param num_fp32_op_names number of ops to be casted to FP32
- * \param num_widest_dtype_op_names number of ops to be casted to widest dtype
- * \param num_conditional_fp32_op_names number of ops to be casted to FP32 based on a condition
- * \param num_excluded_symbols number of symbols to be excluded from casting
- * \param num_model_params number of model parameters
- * \param num_widest_dtype_op_names number of ops to be casted to the widest dtype
- * \param num_conditional_fp32_op_names number of ops to be cast to fp32 based on precision
+                               const char** excluded_op_names,
+                               const uint32_t num_offline,
+                               const char** offline_params,
+                               const char* quantized_dtype,
+                               const bool calib_quantize,
+                               const char* quantize_mode,
+                               const char* quantize_granularity,
+                               uint32_t* out_num_calib_names,
+                               const char*** out_calib_names);
+
+/*!
+ * \brief Convert a symbol into a mixed precision symbol with cast operators for target dtype
+ *        casting
+ * \param sym_handle symbol to be converted
+ * \param ret_sym_handle mixed precision symbol result
+ * \param num_args number of arguments for known dtypes
+ * \param arg_type_data arg types of the arguments
+ * \param target_dtype target_dtype for mixed precision symbol
+ * \param cast_optional_params whether to cast optional params to target_dtype
+ * \param num_target_dtype_op_names number of ops to be casted to target_dtype
+ * \param num_fp32_op_names number of ops to be casted to FP32
+ * \param num_widest_dtype_op_names number of ops to be casted to the widest dtype
+ * \param num_conditional_fp32_op_names number of ops to be casted to FP32 based on a condition
+ * \param num_excluded_symbols number of symbols to be excluded from casting
+ * \param num_model_params number of model parameters
 * \param target_dtype_op_names op names to be casted to target_dtype
 * \param fp32_op_names op names to be casted to fp32
 * \param widest_dtype_op_names names to be casted to widest dtype
@@ -2043,7 +1971,7 @@ MXNET_DLL int MXQuantizeSymbol(SymbolHandle sym_handle,
 * \param model_param_names names for model parameters
 */
 MXNET_DLL int MXReducePrecisionSymbol(SymbolHandle sym_handle,
-                                      SymbolHandle *ret_sym_handle,
+                                      SymbolHandle* ret_sym_handle,
                                       uint32_t num_args,
                                       const int* arg_type_data,
                                       uint32_t num_ind_ptr,
@@ -2056,15 +1984,15 @@ MXNET_DLL int MXReducePrecisionSymbol(SymbolHandle sym_handle,
                                       const uint32_t num_conditional_fp32_op_names,
                                       const uint32_t num_excluded_symbols,
                                       const uint32_t num_model_params,
-                                      const char **target_dtype_op_names,
-                                      const char **fp32_op_names,
-                                      const char **widest_dtype_op_names,
-                                      const char **conditional_fp32_op_names,
-                                      const char **excluded_symbols,
-                                      const char **conditional_param_names,
-                                      const char **conditional_param_vals,
-                                      const char **model_param_names,
-                                      const char **arg_names);
+                                      const char** target_dtype_op_names,
+                                      const char** fp32_op_names,
+                                      const char** widest_dtype_op_names,
+                                      const char** conditional_fp32_op_names,
+                                      const char** excluded_symbols,
+                                      const char** conditional_param_names,
+                                      const char** conditional_param_vals,
+                                      const char** model_param_names,
+                                      const char** arg_names);
 /*!
  * \brief Set calibration table to node attributes in the sym
  * \param sym_handle symbol whose node attributes are to be set by calibration table
@@ -2087,15 +2015,16 @@ MXNET_DLL int MXSetCalibTableToQuantizedSymbol(SymbolHandle qsym_handle,
  * \param backend backend names for subgraph pass
  * \param ret_sym_handle returned symbol
  */
-MXNET_DLL int MXGenBackendSubgraph(SymbolHandle sym_handle, const char *backend,
-                                   SymbolHandle *ret_sym_handle);
+MXNET_DLL int MXGenBackendSubgraph(SymbolHandle sym_handle,
+                                   const char* backend,
+                                   SymbolHandle* ret_sym_handle);
 
 /*!
  * \brief Generate atomic symbol (able to be composed) from a source symbol
  * \param sym_handle source symbol
  * \param ret_sym_handle returned atomic symbol
 */
-MXNET_DLL int MXGenAtomicSymbolFromSymbol(SymbolHandle sym_handle, SymbolHandle *ret_sym_handle);
+MXNET_DLL int MXGenAtomicSymbolFromSymbol(SymbolHandle sym_handle, SymbolHandle* ret_sym_handle);
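
A minimal sketch of running a registered subgraph pass with MXGenBackendSubgraph; the backend name "ONEDNN" is only an assumed example and must match a backend compiled into the library:

#include <mxnet/c_api.h>

static int partition(SymbolHandle sym, SymbolHandle* out) {
  /* "ONEDNN" is an assumed backend name; a failing lookup reports via MXGetLastError */
  return MXGenBackendSubgraph(sym, "ONEDNN", out);
}
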
* \brief Partitions symbol for given backend, potentially creating subgraphs * \param sym_handle symbol to be partitioned @@ -2156,7 +2085,6 @@ MXNET_DLL int MXOptimizeForBackend(SymbolHandle sym_handle, NDArrayHandle** new_aux_handle, char*** new_aux_names_handle); - //-------------------------------------------- // Part 5: IO Interface //-------------------------------------------- @@ -2166,8 +2094,7 @@ MXNET_DLL int MXOptimizeForBackend(SymbolHandle sym_handle, * \param out_array the output iteratos entries * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXListDataIters(uint32_t *out_size, - DataIterCreator **out_array); +MXNET_DLL int MXListDataIters(uint32_t* out_size, DataIterCreator** out_array); /*! * \brief Init an iterator, init with parameters * the array size of passed in arguments @@ -2180,9 +2107,9 @@ MXNET_DLL int MXListDataIters(uint32_t *out_size, */ MXNET_DLL int MXDataIterCreateIter(DataIterCreator handle, uint32_t num_param, - const char **keys, - const char **vals, - DataIterHandle *out); + const char** keys, + const char** vals, + DataIterHandle* out); /*! * \brief Get the detailed information about data iterator. * \param creator the DataIterCreator. @@ -2195,12 +2122,12 @@ MXNET_DLL int MXDataIterCreateIter(DataIterCreator handle, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXDataIterGetIterInfo(DataIterCreator creator, - const char **name, - const char **description, - uint32_t *num_args, - const char ***arg_names, - const char ***arg_type_infos, - const char ***arg_descriptions); + const char** name, + const char** description, + uint32_t* num_args, + const char*** arg_names, + const char*** arg_type_infos, + const char*** arg_descriptions); /*! * \brief Free the handle to the IO module * \param handle the handle pointer to the data iterator @@ -2213,8 +2140,7 @@ MXNET_DLL int MXDataIterFree(DataIterHandle handle); * \param out return value of next * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXDataIterNext(DataIterHandle handle, - int *out); +MXNET_DLL int MXDataIterNext(DataIterHandle handle, int* out); /*! * \brief Call iterator.Reset * \param handle the handle to iterator @@ -2227,16 +2153,14 @@ MXNET_DLL int MXDataIterBeforeFirst(DataIterHandle handle); * \param handle the handle to iterator * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXDataIterGetLenHint(DataIterHandle handle, - int64_t *len); +MXNET_DLL int MXDataIterGetLenHint(DataIterHandle handle, int64_t* len); /*! * \brief Get the handle to the NDArray of underlying data * \param handle the handle pointer to the data iterator * \param out handle to underlying data NDArray * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXDataIterGetData(DataIterHandle handle, - NDArrayHandle *out); +MXNET_DLL int MXDataIterGetData(DataIterHandle handle, NDArrayHandle* out); /*! * \brief Get the image index by array. * \param handle the handle pointer to the data iterator @@ -2244,17 +2168,14 @@ MXNET_DLL int MXDataIterGetData(DataIterHandle handle, * \param out_size output size of the array. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXDataIterGetIndex(DataIterHandle handle, - uint64_t **out_index, - uint64_t *out_size); +MXNET_DLL int MXDataIterGetIndex(DataIterHandle handle, uint64_t** out_index, uint64_t* out_size); /*! 
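The data-iterator entry points above compose as in the following sketch; the creator index 0 and the "batch_size" parameter are assumptions for illustration, not something this header prescribes.

#include <mxnet/c_api.h>

// Sketch: create the first registered iterator and drain it (no error handling).
void DrainFirstIter() {
  uint32_t num_iters = 0;
  DataIterCreator* creators = nullptr;
  if (MXListDataIters(&num_iters, &creators) != 0 || num_iters == 0)
    return;

  const char* keys[] = {"batch_size"};  // illustrative iterator parameter
  const char* vals[] = {"32"};
  DataIterHandle iter = nullptr;
  if (MXDataIterCreateIter(creators[0], 1, keys, vals, &iter) != 0)
    return;

  MXDataIterBeforeFirst(iter);  // rewind, i.e. iterator.Reset
  int has_next = 0;
  while (MXDataIterNext(iter, &has_next) == 0 && has_next) {
    NDArrayHandle data = nullptr;
    MXDataIterGetData(iter, &data);  // NDArray holding the current batch
  }
  MXDataIterFree(iter);
}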
* \brief Get the padding number in current data batch * \param handle the handle pointer to the data iterator * \param pad pad number ptr * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXDataIterGetPadNum(DataIterHandle handle, - int *pad); +MXNET_DLL int MXDataIterGetPadNum(DataIterHandle handle, int* pad); /*! * \brief Get the handle to the NDArray of underlying label @@ -2262,8 +2183,7 @@ MXNET_DLL int MXDataIterGetPadNum(DataIterHandle handle, * \param out the handle to underlying label NDArray * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXDataIterGetLabel(DataIterHandle handle, - NDArrayHandle *out); +MXNET_DLL int MXDataIterGetLabel(DataIterHandle handle, NDArrayHandle* out); /*! * \brief Get the handles to specified underlying ndarrays of index * \param handle the handle pointer to the data iterator @@ -2271,9 +2191,7 @@ MXNET_DLL int MXDataIterGetLabel(DataIterHandle handle, * \param out the handle to an array of NDArrays that stores pointers to handles * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXDataIterGetItems(DataIterHandle handle, - int* num_outputs, - NDArrayHandle **outputs); +MXNET_DLL int MXDataIterGetItems(DataIterHandle handle, int* num_outputs, NDArrayHandle** outputs); /*! * \brief List all the available dataset entries @@ -2281,8 +2199,7 @@ MXNET_DLL int MXDataIterGetItems(DataIterHandle handle, * \param out_array the output dataset entries * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXListDatasets(uint32_t *out_size, - DatasetCreator **out_array); +MXNET_DLL int MXListDatasets(uint32_t* out_size, DatasetCreator** out_array); /*! * \brief Init an dataset, init with parameters * the array size of passed in arguments @@ -2295,9 +2212,9 @@ MXNET_DLL int MXListDatasets(uint32_t *out_size, */ MXNET_DLL int MXDatasetCreateDataset(DatasetCreator handle, uint32_t num_param, - const char **keys, - const char **vals, - DatasetHandle *out); + const char** keys, + const char** vals, + DatasetHandle* out); /*! * \brief Get the detailed information about dataset. * \param creator the DatasetCreator. @@ -2310,12 +2227,12 @@ MXNET_DLL int MXDatasetCreateDataset(DatasetCreator handle, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXDatasetGetDatasetInfo(DatasetCreator creator, - const char **name, - const char **description, - uint32_t *num_args, - const char ***arg_names, - const char ***arg_type_infos, - const char ***arg_descriptions); + const char** name, + const char** description, + uint32_t* num_args, + const char*** arg_names, + const char*** arg_type_infos, + const char*** arg_descriptions); /*! * \brief Free the handle to the IO module * \param handle the handle pointer to the dataset @@ -2328,8 +2245,7 @@ MXNET_DLL int MXDatasetFree(DatasetHandle handle); * \param out return value of GetLen * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXDatasetGetLen(DatasetHandle handle, - uint64_t *out); +MXNET_DLL int MXDatasetGetLen(DatasetHandle handle, uint64_t* out); /*! * \brief Get Output NDArray given specified indices * \param handle the handle to dataset @@ -2342,7 +2258,7 @@ MXNET_DLL int MXDatasetGetLen(DatasetHandle handle, MXNET_DLL int MXDatasetGetItems(DatasetHandle handle, uint64_t index, int* num_outputs, - NDArrayHandle **outputs); + NDArrayHandle** outputs); /*! 
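The dataset entry points mirror the iterator ones; a random-access walk looks roughly like this, with the creator choice and the empty parameter list as illustrative assumptions.

#include <mxnet/c_api.h>

// Sketch: open the first registered dataset and touch every item by index.
void WalkFirstDataset() {
  uint32_t num = 0;
  DatasetCreator* creators = nullptr;
  if (MXListDatasets(&num, &creators) != 0 || num == 0)
    return;

  DatasetHandle ds = nullptr;
  if (MXDatasetCreateDataset(creators[0], 0, nullptr, nullptr, &ds) != 0)
    return;

  uint64_t len = 0;
  MXDatasetGetLen(ds, &len);
  for (uint64_t i = 0; i < len; ++i) {
    int num_outputs = 0;
    NDArrayHandle* outputs = nullptr;
    MXDatasetGetItems(ds, i, &num_outputs, &outputs);  // arrays for item i
  }
  MXDatasetFree(ds);
}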
* \brief List all the available batchify function entries @@ -2350,8 +2266,7 @@ MXNET_DLL int MXDatasetGetItems(DatasetHandle handle, * \param out_array the output batchify function entries * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXListBatchifyFunctions(uint32_t *out_size, - BatchifyFunctionCreator **out_array); +MXNET_DLL int MXListBatchifyFunctions(uint32_t* out_size, BatchifyFunctionCreator** out_array); /*! * \brief Init an batchify function, init with parameters * the array size of passed in arguments @@ -2363,10 +2278,10 @@ MXNET_DLL int MXListBatchifyFunctions(uint32_t *out_size, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXBatchifyFunctionCreateFunction(BatchifyFunctionCreator handle, - uint32_t num_param, - const char **keys, - const char **vals, - BatchifyFunctionHandle *out); + uint32_t num_param, + const char** keys, + const char** vals, + BatchifyFunctionHandle* out); /*! * \brief Get the detailed information about batchify function. * \param creator the batchifyFunctionCreator. @@ -2379,12 +2294,12 @@ MXNET_DLL int MXBatchifyFunctionCreateFunction(BatchifyFunctionCreator handle, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXBatchifyFunctionGetFunctionInfo(BatchifyFunctionCreator creator, - const char **name, - const char **description, - uint32_t *num_args, - const char ***arg_names, - const char ***arg_type_infos, - const char ***arg_descriptions); + const char** name, + const char** description, + uint32_t* num_args, + const char*** arg_names, + const char*** arg_type_infos, + const char*** arg_descriptions); /*! * \brief Invoke the Batchify Function * \param handle the handle pointer to the batchify function @@ -2393,12 +2308,12 @@ MXNET_DLL int MXBatchifyFunctionGetFunctionInfo(BatchifyFunctionCreator creator, * \param inputs the pointers to input ndarrays * \param ouptuts the pointers to output ndarrays * \return 0 when success, -1 when failure happens - */ + */ MXNET_DLL int MXBatchifyFunctionInvoke(BatchifyFunctionHandle handle, int batch_size, int num_output, - NDArrayHandle *inputs, - NDArrayHandle **outputs); + NDArrayHandle* inputs, + NDArrayHandle** outputs); /*! * \brief Free the handle to the IO module * \param handle the handle pointer to the batchify function @@ -2414,10 +2329,7 @@ MXNET_DLL int MXBatchifyFunctionFree(BatchifyFunctionHandle handle); * \param keys environment keys * \param vals environment values */ -MXNET_DLL int MXInitPSEnv(uint32_t num_vars, - const char **keys, - const char **vals); - +MXNET_DLL int MXInitPSEnv(uint32_t num_vars, const char** keys, const char** vals); /*! * \brief Create a kvstore @@ -2425,8 +2337,7 @@ MXNET_DLL int MXInitPSEnv(uint32_t num_vars, * \param out The output type of KVStore * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXKVStoreCreate(const char *type, - KVStoreHandle *out); +MXNET_DLL int MXKVStoreCreate(const char* type, KVStoreHandle* out); /*! * \brief Set parameters to use low-bit compressed gradients @@ -2690,10 +2601,7 @@ MXNET_DLL int MXKVStorePushPullEx(KVStoreHandle handle, * \param local the value stored on local on this key * \param handle The additional handle to the updater */ -typedef void (MXKVStoreUpdater)(int key, - NDArrayHandle recv, - NDArrayHandle local, - void *handle); +typedef void(MXKVStoreUpdater)(int key, NDArrayHandle recv, NDArrayHandle local, void* handle); /*! 
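The MXKVStoreUpdater typedef above is satisfied by a plain function; a stub matching its signature, plus store creation, might look as follows. What the updater computes is application-defined, and registration happens through MXKVStoreSetUpdater, declared just below.

#include <mxnet/c_api.h>

// Sketch: an updater with the exact MXKVStoreUpdater signature. A real one
// would merge recv into local, e.g. an SGD step via NDArray C API calls.
void MyUpdater(int key, NDArrayHandle recv, NDArrayHandle local, void* handle) {
  (void)key; (void)recv; (void)local; (void)handle;
}

int MakeLocalStore(KVStoreHandle* out) {
  return MXKVStoreCreate("local", out);  // "local" is the single-process store type
}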
* \brief user-defined updater for the kvstore with string keys * It's this updater's responsibility to delete \a recv and \a local @@ -2702,10 +2610,10 @@ typedef void (MXKVStoreUpdater)(int key, * \param local the value stored on local on this key * \param handle The additional handle to the updater */ -typedef void (MXKVStoreStrUpdater)(const char* key, - NDArrayHandle recv, - NDArrayHandle local, - void *handle); +typedef void(MXKVStoreStrUpdater)(const char* key, + NDArrayHandle recv, + NDArrayHandle local, + void* handle); /*! * \brief register a push updater * \param handle handle to the KVStore @@ -2715,7 +2623,7 @@ typedef void (MXKVStoreStrUpdater)(const char* key, */ MXNET_DLL int MXKVStoreSetUpdater(KVStoreHandle handle, MXKVStoreUpdater updater, - void *updater_handle); + void* updater_handle); /*! * \brief register a push updater with int keys and one with string keys * \param handle handle to the KVStore @@ -2727,15 +2635,14 @@ MXNET_DLL int MXKVStoreSetUpdater(KVStoreHandle handle, MXNET_DLL int MXKVStoreSetUpdaterEx(KVStoreHandle handle, MXKVStoreUpdater updater, MXKVStoreStrUpdater str_updater, - void *updater_handle); + void* updater_handle); /*! * \brief get the type of the kvstore * \param handle handle to the KVStore * \param type a string type * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXKVStoreGetType(KVStoreHandle handle, - const char** type); +MXNET_DLL int MXKVStoreGetType(KVStoreHandle handle, const char** type); //-------------------------------------------- // Part 6: advanced KVStore for multi-machines //-------------------------------------------- @@ -2747,8 +2654,7 @@ MXNET_DLL int MXKVStoreGetType(KVStoreHandle handle, * \param ret the node rank * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXKVStoreGetRank(KVStoreHandle handle, - int *ret); +MXNET_DLL int MXKVStoreGetRank(KVStoreHandle handle, int* ret); /** * \brief return The number of nodes in this group, which is @@ -2759,31 +2665,28 @@ MXNET_DLL int MXKVStoreGetRank(KVStoreHandle handle, * \param ret the group size * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXKVStoreGetGroupSize(KVStoreHandle handle, - int *ret); +MXNET_DLL int MXKVStoreGetGroupSize(KVStoreHandle handle, int* ret); /** * \brief return whether or not this process is a worker node. * \param ret 1 for yes, 0 for no * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXKVStoreIsWorkerNode(int *ret); - +MXNET_DLL int MXKVStoreIsWorkerNode(int* ret); /** * \brief return whether or not this process is a server node. * \param ret 1 for yes, 0 for no * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXKVStoreIsServerNode(int *ret); - +MXNET_DLL int MXKVStoreIsServerNode(int* ret); /** * \brief return whether or not this process is a scheduler node. 
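Taken together, the query functions above let a process discover its role in a distributed job; a sketch with return codes ignored for brevity:

#include <mxnet/c_api.h>
#include <cstdio>

// Sketch: print whether this process is a worker and its rank within the group.
void PrintNodeRole(KVStoreHandle kv) {
  int is_worker = 0, rank = 0, size = 0;
  MXKVStoreIsWorkerNode(&is_worker);
  MXKVStoreGetRank(kv, &rank);
  MXKVStoreGetGroupSize(kv, &size);
  std::printf("worker=%d rank=%d of %d\n", is_worker, rank, size);
}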
* \param ret 1 for yes, 0 for no * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXKVStoreIsSchedulerNode(int *ret); +MXNET_DLL int MXKVStoreIsSchedulerNode(int* ret); /** * \brief global barrier among all worker machines @@ -2800,8 +2703,7 @@ MXNET_DLL int MXKVStoreBarrier(KVStoreHandle handle); * \param barrier_before_exit whether to do barrier when kvstore finalize * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXKVStoreSetBarrierBeforeExit(KVStoreHandle handle, - const int barrier_before_exit); +MXNET_DLL int MXKVStoreSetBarrierBeforeExit(KVStoreHandle handle, const int barrier_before_exit); /** * \brief the prototype of a server controller @@ -2809,9 +2711,7 @@ MXNET_DLL int MXKVStoreSetBarrierBeforeExit(KVStoreHandle handle, * \param body the body of the command * \param controller_handle helper handle for implementing controller */ -typedef void (MXKVStoreServerController)(int head, - const char *body, - void *controller_handle); +typedef void(MXKVStoreServerController)(int head, const char* body, void* controller_handle); /** * \brief Run as server (or scheduler) @@ -2822,7 +2722,7 @@ typedef void (MXKVStoreServerController)(int head, */ MXNET_DLL int MXKVStoreRunServer(KVStoreHandle handle, MXKVStoreServerController controller, - void *controller_handle); + void* controller_handle); /** * \brief Send a command to all server nodes @@ -2847,7 +2747,7 @@ MXNET_DLL int MXKVStoreSendCommmandToServers(KVStoreHandle handle, */ MXNET_DLL int MXKVStoreGetNumDeadNode(KVStoreHandle handle, const int node_id, - int *number, + int* number, const int timeout_sec DEFAULT(60)); /** @@ -2855,14 +2755,14 @@ MXNET_DLL int MXKVStoreGetNumDeadNode(KVStoreHandle handle, * \param uri path to file * \param out handle pointer to the created object * \return 0 when success, -1 when failure happens -*/ -MXNET_DLL int MXRecordIOWriterCreate(const char *uri, RecordIOHandle *out); + */ +MXNET_DLL int MXRecordIOWriterCreate(const char* uri, RecordIOHandle* out); /** * \brief Delete a RecordIO writer object * \param handle handle to RecordIO object * \return 0 when success, -1 when failure happens -*/ + */ MXNET_DLL int MXRecordIOWriterFree(RecordIOHandle handle); /** @@ -2871,31 +2771,30 @@ MXNET_DLL int MXRecordIOWriterFree(RecordIOHandle handle); * \param buf buffer to write * \param size size of buffer * \return 0 when success, -1 when failure happens -*/ -MXNET_DLL int MXRecordIOWriterWriteRecord(RecordIOHandle handle, - const char *buf, size_t size); + */ +MXNET_DLL int MXRecordIOWriterWriteRecord(RecordIOHandle handle, const char* buf, size_t size); /** * \brief Get the current writer pointer position * \param handle handle to RecordIO object * \param pos handle to output position * \return 0 when success, -1 when failure happens -*/ -MXNET_DLL int MXRecordIOWriterTell(RecordIOHandle handle, size_t *pos); + */ +MXNET_DLL int MXRecordIOWriterTell(RecordIOHandle handle, size_t* pos); /** * \brief Create a RecordIO reader object * \param uri path to file * \param out handle pointer to the created object * \return 0 when success, -1 when failure happens -*/ -MXNET_DLL int MXRecordIOReaderCreate(const char *uri, RecordIOHandle *out); + */ +MXNET_DLL int MXRecordIOReaderCreate(const char* uri, RecordIOHandle* out); /** * \brief Delete a RecordIO reader object * \param handle handle to RecordIO object * \return 0 when success, -1 when failure happens -*/ + */ MXNET_DLL int MXRecordIOReaderFree(RecordIOHandle handle); /** @@ -2904,16 +2803,15 @@ MXNET_DLL int 
MXRecordIOReaderFree(RecordIOHandle handle); * \param buf pointer to return buffer * \param size point to size of buffer * \return 0 when success, -1 when failure happens -*/ -MXNET_DLL int MXRecordIOReaderReadRecord(RecordIOHandle handle, - char const **buf, size_t *size); + */ +MXNET_DLL int MXRecordIOReaderReadRecord(RecordIOHandle handle, char const** buf, size_t* size); /** * \brief Set the current reader pointer position * \param handle handle to RecordIO object * \param pos target position * \return 0 when success, -1 when failure happens -*/ + */ MXNET_DLL int MXRecordIOReaderSeek(RecordIOHandle handle, size_t pos); /** @@ -2921,22 +2819,30 @@ MXNET_DLL int MXRecordIOReaderSeek(RecordIOHandle handle, size_t pos); * \param handle handle to RecordIO object * \param pos handle to output position * \return 0 when success, -1 when failure happens -*/ -MXNET_DLL int MXRecordIOReaderTell(RecordIOHandle handle, size_t *pos); + */ +MXNET_DLL int MXRecordIOReaderTell(RecordIOHandle handle, size_t* pos); /** * \brief Create a MXRtc object -*/ -MXNET_DLL int MXRtcCreate(char* name, uint32_t num_input, uint32_t num_output, - char** input_names, char** output_names, - NDArrayHandle* inputs, NDArrayHandle* outputs, - char* kernel, RtcHandle *out); + */ +MXNET_DLL int MXRtcCreate(char* name, + uint32_t num_input, + uint32_t num_output, + char** input_names, + char** output_names, + NDArrayHandle* inputs, + NDArrayHandle* outputs, + char* kernel, + RtcHandle* out); /** * \brief Run cuda kernel -*/ -MXNET_DLL int MXRtcPush(RtcHandle handle, uint32_t num_input, uint32_t num_output, - NDArrayHandle* inputs, NDArrayHandle* outputs, + */ +MXNET_DLL int MXRtcPush(RtcHandle handle, + uint32_t num_input, + uint32_t num_output, + NDArrayHandle* inputs, + NDArrayHandle* outputs, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, @@ -2946,7 +2852,7 @@ MXNET_DLL int MXRtcPush(RtcHandle handle, uint32_t num_input, uint32_t num_outpu /** * \brief Delete a MXRtc object -*/ + */ MXNET_DLL int MXRtcFree(RtcHandle handle); /* * \brief register custom operators from frontend. @@ -2962,9 +2868,11 @@ MXNET_DLL int MXCustomOpRegister(const char* op_type, CustomOpPropCreator creato * \param outputs handle to output NDArrays. * \param callbacks callbacks for backward function. 
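The RecordIO writer/reader pairs above form a simple round trip; the path below is a placeholder and failures are reduced to early returns. Note that the buffer returned by MXRecordIOReaderReadRecord stays owned by the reader.

#include <mxnet/c_api.h>

// Sketch: write one record, then read it back.
void RecordIORoundTrip() {
  RecordIOHandle writer = nullptr;
  if (MXRecordIOWriterCreate("/tmp/example.rec", &writer) != 0)
    return;
  const char payload[] = "hello";
  MXRecordIOWriterWriteRecord(writer, payload, sizeof(payload));
  MXRecordIOWriterFree(writer);

  RecordIOHandle reader = nullptr;
  if (MXRecordIOReaderCreate("/tmp/example.rec", &reader) != 0)
    return;
  const char* buf = nullptr;
  size_t size = 0;
  MXRecordIOReaderReadRecord(reader, &buf, &size);  // buf owned by the reader
  MXRecordIOReaderFree(reader);
}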
*/ -MXNET_DLL int MXCustomFunctionRecord(int num_inputs, NDArrayHandle *inputs, - int num_outputs, NDArrayHandle *outputs, - struct MXCallbackList *callbacks); +MXNET_DLL int MXCustomFunctionRecord(int num_inputs, + NDArrayHandle* inputs, + int num_outputs, + NDArrayHandle* outputs, + struct MXCallbackList* callbacks); /* * \brief create cuda rtc module * \param source cuda source code @@ -2974,9 +2882,12 @@ MXNET_DLL int MXCustomFunctionRecord(int num_inputs, NDArrayHandle *inputs, * \param exported function names * \param out handle to created module */ -MXNET_DLL int MXRtcCudaModuleCreate(const char* source, int num_options, - const char** options, int num_exports, - const char** exports, CudaModuleHandle *out); +MXNET_DLL int MXRtcCudaModuleCreate(const char* source, + int num_options, + const char** options, + int num_exports, + const char** exports, + CudaModuleHandle* out); /* * \brief delete cuda rtc module * \param handle handle to cuda module @@ -2992,9 +2903,13 @@ MXNET_DLL int MXRtcCudaModuleFree(CudaModuleHandle handle); * \param arg_types data type of arguments * \param out created kernel */ -MXNET_DLL int MXRtcCudaKernelCreate(CudaModuleHandle handle, const char* name, - int num_args, int* is_ndarray, int* is_const, - int* arg_types, CudaKernelHandle *out); +MXNET_DLL int MXRtcCudaKernelCreate(CudaModuleHandle handle, + const char* name, + int num_args, + int* is_ndarray, + int* is_const, + int* arg_types, + CudaKernelHandle* out); /* * \brief delete kernel * \param handle handle to previously created kernel @@ -3013,10 +2928,15 @@ MXNET_DLL int MXRtcCudaKernelFree(CudaKernelHandle handle); * \param block_dim_z block dimension z * \param shared_mem size of dynamically allocated shared memory */ -MXNET_DLL int MXRtcCudaKernelCall(CudaKernelHandle handle, int dev_id, void** args, - uint32_t grid_dim_x, uint32_t grid_dim_y, - uint32_t grid_dim_z, uint32_t block_dim_x, - uint32_t block_dim_y, uint32_t block_dim_z, +MXNET_DLL int MXRtcCudaKernelCall(CudaKernelHandle handle, + int dev_id, + void** args, + uint32_t grid_dim_x, + uint32_t grid_dim_y, + uint32_t grid_dim_z, + uint32_t block_dim_x, + uint32_t block_dim_y, + uint32_t block_dim_z, uint32_t shared_mem); /*! * \brief Get shared memory handle from NDArray @@ -3024,8 +2944,7 @@ MXNET_DLL int MXRtcCudaKernelCall(CudaKernelHandle handle, int dev_id, void** ar * \param shared_pid output PID * \param shared_id output shared memory id. */ -MXNET_DLL int MXNDArrayGetSharedMemHandle(NDArrayHandle handle, int* shared_pid, - int* shared_id); +MXNET_DLL int MXNDArrayGetSharedMemHandle(NDArrayHandle handle, int* shared_pid, int* shared_id); /*! * \brief Release all unreferenced memory from the devices storage managers memory pool @@ -3043,55 +2962,69 @@ MXNET_DLL int MXStorageEmptyCache(int dev_type, int dev_id); * \param dtype data type of NDArray * \param out constructed NDArray */ -MXNET_DLL int MXNDArrayCreateFromSharedMem(int shared_pid, int shared_id, const int *shape, - int ndim, int dtype, NDArrayHandle *out); - -/*! - * \brief Push an asynchronous operation to the engine. - * \param async_func Execution function whici takes a parameter on_complete - * that must be called when the execution ompletes. - * \param func_param The parameter set on calling async_func, can be NULL. - * \param deleter The callback to free func_param, can be NULL. - * \param ctx_handle Execution context. - * \param const_vars_handle The variables that current operation will use - * but not mutate. 
- * \param num_const_vars The number of const_vars_handle. - * \param mutable_vars_handle The variables that current operation will mutate. - * \param num_mutable_vars The number of mutable_vars_handle. - * \param prop_handle Property of the function. - * \param priority Priority of the action, as hint to the engine. - * \param opr_name The operation name. - * \param wait Whether this is a WaitForVar operation. - */ -MXNET_DLL int MXEnginePushAsync(EngineAsyncFunc async_func, void* func_param, - EngineFuncParamDeleter deleter, ContextHandle ctx_handle, - EngineVarHandle const_vars_handle, int num_const_vars, - EngineVarHandle mutable_vars_handle, int num_mutable_vars, +MXNET_DLL int MXNDArrayCreateFromSharedMem(int shared_pid, + int shared_id, + const int* shape, + int ndim, + int dtype, + NDArrayHandle* out); + +/*! + * \brief Push an asynchronous operation to the engine. + * \param async_func Execution function which takes a parameter on_complete + * that must be called when the execution completes. + * \param func_param The parameter set on calling async_func, can be NULL. + * \param deleter The callback to free func_param, can be NULL. + * \param ctx_handle Execution context. + * \param const_vars_handle The variables that current operation will use + * but not mutate. + * \param num_const_vars The number of const_vars_handle. + * \param mutable_vars_handle The variables that current operation will mutate. + * \param num_mutable_vars The number of mutable_vars_handle. + * \param prop_handle Property of the function. + * \param priority Priority of the action, as hint to the engine. + * \param opr_name The operation name. + * \param wait Whether this is a WaitForVar operation. + */ +MXNET_DLL int MXEnginePushAsync(EngineAsyncFunc async_func, + void* func_param, + EngineFuncParamDeleter deleter, + ContextHandle ctx_handle, + EngineVarHandle const_vars_handle, + int num_const_vars, + EngineVarHandle mutable_vars_handle, + int num_mutable_vars, EngineFnPropertyHandle prop_handle DEFAULT(NULL), - int priority DEFAULT(0), const char* opr_name DEFAULT(NULL), + int priority DEFAULT(0), + const char* opr_name DEFAULT(NULL), bool wait DEFAULT(false)); /*! - * \brief Push a synchronous operation to the engine. - * \param sync_func Execution function that executes the operation. - * \param func_param The parameter set on calling sync_func, can be NULL. - * \param deleter The callback to free func_param, can be NULL. - * \param ctx_handle Execution context.
+ * \param const_vars_handle The variables that current operation will use + * but not mutate. + * \param num_const_vars The number of const_vars_handle. + * \param mutable_vars_handle The variables that current operation will mutate. + * \param num_mutable_vars The number of mutable_vars_handle. + * \param prop_handle Property of the function. + * \param priority Priority of the action, as hint to the engine. + * \param opr_name The operation name. + */ +MXNET_DLL int MXEnginePushSync(EngineSyncFunc sync_func, + void* func_param, + EngineFuncParamDeleter deleter, + ContextHandle ctx_handle, + EngineVarHandle const_vars_handle, + int num_const_vars, + EngineVarHandle mutable_vars_handle, + int num_mutable_vars, EngineFnPropertyHandle prop_handle DEFAULT(NULL), - int priority DEFAULT(0), const char* opr_name DEFAULT(NULL)); + int priority DEFAULT(0), + const char* opr_name DEFAULT(NULL)); /*! * \brief Create an NDArray from source sharing the same data chunk. * \param src source NDArray @@ -3103,84 +3036,93 @@ MXNET_DLL int MXShallowCopyNDArray(NDArrayHandle src, NDArrayHandle* out); * \param src source Symbol * \param out new Symbol sharing the same graph structure with src */ -MXNET_DLL int MXShallowCopySymbol(SymbolHandle src, SymbolHandle * out); - -/*! - * \brief Push an asynchronous operation to the engine. - * \param async_func Execution function whici takes a parameter on_complete - * that must be called when the execution ompletes. - * \param func_param The parameter set on calling async_func, can be NULL. - * \param deleter The callback to free func_param, can be NULL. - * \param ctx_handle Execution context. - * \param const_nds_handle The NDArrays that current operation will use - * but not mutate. - * \param num_const_nds The number of const_nds_handle. - * \param mutable_nds_handle The NDArrays that current operation will mutate. - * \param num_mutable_nds The number of mutable_nds_handle. - * \param prop_handle Property of the function. - * \param priority Priority of the action, as hint to the engine. - * \param opr_name The operation name. - * \param wait Whether this is a WaitForVar operation. - */ -MXNET_DLL int MXEnginePushAsyncND(EngineAsyncFunc async_func, void* func_param, - EngineFuncParamDeleter deleter, ContextHandle ctx_handle, - NDArrayHandle* const_nds_handle, int num_const_nds, - NDArrayHandle* mutable_nds_handle, int num_mutable_nds, +MXNET_DLL int MXShallowCopySymbol(SymbolHandle src, SymbolHandle* out); + +/*! + * \brief Push an asynchronous operation to the engine. + * \param async_func Execution function which takes a parameter on_complete + * that must be called when the execution completes. + * \param func_param The parameter set on calling async_func, can be NULL. + * \param deleter The callback to free func_param, can be NULL. + * \param ctx_handle Execution context. + * \param const_nds_handle The NDArrays that current operation will use + * but not mutate. + * \param num_const_nds The number of const_nds_handle. + * \param mutable_nds_handle The NDArrays that current operation will mutate. + * \param num_mutable_nds The number of mutable_nds_handle. + * \param prop_handle Property of the function. + * \param priority Priority of the action, as hint to the engine. + * \param opr_name The operation name. + * \param wait Whether this is a WaitForVar operation.
+ */ +MXNET_DLL int MXEnginePushAsyncND(EngineAsyncFunc async_func, + void* func_param, + EngineFuncParamDeleter deleter, + ContextHandle ctx_handle, + NDArrayHandle* const_nds_handle, + int num_const_nds, + NDArrayHandle* mutable_nds_handle, + int num_mutable_nds, EngineFnPropertyHandle prop_handle DEFAULT(NULL), - int priority DEFAULT(0), const char* opr_name DEFAULT(NULL), + int priority DEFAULT(0), + const char* opr_name DEFAULT(NULL), bool wait DEFAULT(false)); /*! - * \brief Push a synchronous operation to the engine. - * \param sync_func Execution function that executes the operation. - * \param func_param The parameter set on calling sync_func, can be NULL. - * \param deleter The callback to free func_param, can be NULL. - * \param ctx_handle Execution context. - * \param const_nds_handle The NDArrays that current operation will use - * but not mutate. - * \param num_const_nds The number of const_nds_handle. - * \param mutable_nds_handle The NDArrays that current operation will mutate. - * \param num_mutable_nds The number of mutable_nds_handle. - * \param prop_handle Property of the function. - * \param priority Priority of the action, as hint to the engine. - * \param opr_name The operation name. - */ -MXNET_DLL int MXEnginePushSyncND(EngineSyncFunc sync_func, void* func_param, - EngineFuncParamDeleter deleter, ContextHandle ctx_handle, - NDArrayHandle* const_nds_handle, int num_const_nds, - NDArrayHandle* mutable_nds_handle, int num_mutable_nds, + * \brief Push a synchronous operation to the engine. + * \param sync_func Execution function that executes the operation. + * \param func_param The parameter set on calling sync_func, can be NULL. + * \param deleter The callback to free func_param, can be NULL. + * \param ctx_handle Execution context. + * \param const_nds_handle The NDArrays that current operation will use + * but not mutate. + * \param num_const_nds The number of const_nds_handle. + * \param mutable_nds_handle The NDArrays that current operation will mutate. + * \param num_mutable_nds The number of mutable_nds_handle. + * \param prop_handle Property of the function. + * \param priority Priority of the action, as hint to the engine. + * \param opr_name The operation name. + */ +MXNET_DLL int MXEnginePushSyncND(EngineSyncFunc sync_func, + void* func_param, + EngineFuncParamDeleter deleter, + ContextHandle ctx_handle, + NDArrayHandle* const_nds_handle, + int num_const_nds, + NDArrayHandle* mutable_nds_handle, + int num_mutable_nds, EngineFnPropertyHandle prop_handle DEFAULT(NULL), - int priority DEFAULT(0), const char* opr_name DEFAULT(NULL)); + int priority DEFAULT(0), + const char* opr_name DEFAULT(NULL)); /*! * \brief This function checks if any dynamic shape op is present in the symbol. * \param sym_handle handler of the input symbol. * \param has_dynamic_shape Flag to indicate if the symbol contains dynamic shape op. */ -MXNET_DLL int MXCheckDynamicShapeOp(SymbolHandle sym_handle, - bool* has_dynamic_shape); +MXNET_DLL int MXCheckDynamicShapeOp(SymbolHandle sym_handle, bool* has_dynamic_shape); /*! - * \brief Push a new NVTX range. Requires building with CUDA and NVTX. - * \param name Name of the range. - * \param color Color used to display the range in the visual profiling tools. - * Encoded as 256*256*R + 256*G + B. - */ -MXNET_DLL int MXNVTXRangePush(const char * name, mx_uint color); + * \brief Push a new NVTX range. Requires building with CUDA and NVTX. + * \param name Name of the range. 
+ * \param color Color used to display the range in the visual profiling tools. + * Encoded as 256*256*R + 256*G + B. + */ +MXNET_DLL int MXNVTXRangePush(const char* name, mx_uint color); /*! - * \brief End the NVTX range. Requires building with CUDA and NVTX. - */ + * \brief End the NVTX range. Requires building with CUDA and NVTX. + */ MXNET_DLL int MXNVTXRangePop(); /*! - * \brief Start CUDA profiling session. Requires building with CUDA and NVTX. - */ + * \brief Start CUDA profiling session. Requires building with CUDA and NVTX. + */ MXNET_DLL int MXCUDAProfilerStart(); /*! - * \brief End CUDA profiling session. Requires building with CUDA and NVTX. - */ + * \brief End CUDA profiling session. Requires building with CUDA and NVTX. + */ MXNET_DLL int MXCUDAProfilerStop(); #ifdef __cplusplus diff --git a/include/mxnet/c_api_error.h b/include/mxnet/c_api_error.h index 2aa3a23887b3..e3cfb8381355 100644 --- a/include/mxnet/c_api_error.h +++ b/include/mxnet/c_api_error.h @@ -36,26 +36,26 @@ * and finishes with API_END() or API_END_HANDLE_ERROR() * The finally clause contains procedure to cleanup states when an error happens. */ -#define MX_API_BEGIN() \ - try { \ +#define MX_API_BEGIN() \ + try { \ on_enter_api(__FUNCTION__); -#define MX_API_END() \ - } \ - catch (const std::exception &_except_) { \ - on_exit_api(); \ - return MXAPIHandleException(_except_); \ - } \ - on_exit_api(); \ - return 0; // NOLINT(*) -#define MX_API_END_HANDLE_ERROR(Finalize) \ - } \ - catch (const std::exception &_except_) { \ - Finalize; \ - on_exit_api(); \ - return MXAPIHandleException(_except_); \ - } \ - on_exit_api(); \ - return 0; // NOLINT(*) +#define MX_API_END() \ + } \ + catch (const std::exception& _except_) { \ + on_exit_api(); \ + return MXAPIHandleException(_except_); \ + } \ + on_exit_api(); \ + return 0; // NOLINT(*) +#define MX_API_END_HANDLE_ERROR(Finalize) \ + } \ + catch (const std::exception& _except_) { \ + Finalize; \ + on_exit_api(); \ + return MXAPIHandleException(_except_); \ + } \ + on_exit_api(); \ + return 0; // NOLINT(*) /*! * \brief Set the last error message needed by C API @@ -67,10 +67,10 @@ void MXAPISetLastError(const char* msg); * \param e the exception * \return the return value of API after exception is handled */ -int MXAPIHandleException(const std::exception &e); +int MXAPIHandleException(const std::exception& e); namespace mxnet { -extern void on_enter_api(const char *function); +extern void on_enter_api(const char* function); extern void on_exit_api(); } #endif // MXNET_C_API_ERROR_H_ diff --git a/include/mxnet/c_api_test.h b/include/mxnet/c_api_test.h index ab662443c29a..5b37262ede8c 100644 --- a/include/mxnet/c_api_test.h +++ b/include/mxnet/c_api_test.h @@ -38,10 +38,10 @@ extern "C" { * used only for the testing purpose. */ MXNET_DLL int MXBuildSubgraphByOpNames(SymbolHandle sym_handle, - const char* prop_name, - const uint32_t num_ops, - const char** op_names, - SymbolHandle* ret_sym_handle); + const char* prop_name, + const uint32_t num_ops, + const char** op_names, + SymbolHandle* ret_sym_handle); /*! * \brief Given a subgraph property name, use the provided op names @@ -60,8 +60,8 @@ MXNET_DLL int MXSetSubgraphPropertyOpNames(const char* prop_name, * op_names to the backend property. */ MXNET_DLL int MXSetSubgraphPropertyOpNamesV2(const char* prop_name, - const uint32_t num_ops, - const char** op_names); + const uint32_t num_ops, + const char** op_names); /*! * \brief Given a subgraph property name, delete the op name set * in the SubgraphPropertyOpNameSet. 
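The NVTX and profiler hooks above nest naturally; given the documented color encoding 256*256*R + 256*G + B, a red range around a region of interest looks like this (a CUDA+NVTX build is assumed, as the comments state):

#include <mxnet/c_api.h>

// Sketch: profile a region, marking it with a pure-red NVTX range.
void ProfileRegion() {
  MXCUDAProfilerStart();
  MXNVTXRangePush("my_region", 256 * 256 * 255);  // R=255, G=0, B=0
  // ... launch the work to be profiled ...
  MXNVTXRangePop();
  MXCUDAProfilerStop();
}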
@@ -73,29 +73,26 @@ MXNET_DLL int MXRemoveSubgraphPropertyOpNames(const char* prop_name); */ MXNET_DLL int MXRemoveSubgraphPropertyOpNamesV2(const char* prop_name); - /*! * \brief Get the value of an environment variable as seen by the backend. * \param name The name of the environment variable * \param value The returned value of the environment variable */ -MXNET_DLL int MXGetEnv(const char* name, - const char** value); +MXNET_DLL int MXGetEnv(const char* name, const char** value); /*! * \brief Set the value of an environment variable from the backend. * \param name The name of the environment variable * \param value The desired value to set the environment variable `name` */ -MXNET_DLL int MXSetEnv(const char* name, - const char* value); +MXNET_DLL int MXSetEnv(const char* name, const char* value); /*! * \brief Get the maximum SM architecture supported by the nvrtc compiler * \param max_arch The maximum supported architecture (e.g. would be 80, if Ampere) * \return 0 when success, -1 when failure happens. */ -MXNET_DLL int MXGetMaxSupportedArch(uint32_t *max_arch); +MXNET_DLL int MXGetMaxSupportedArch(uint32_t* max_arch); #ifdef __cplusplus } diff --git a/include/mxnet/engine.h b/include/mxnet/engine.h index cdb8998d2e83..9d20fdd43d74 100644 --- a/include/mxnet/engine.h +++ b/include/mxnet/engine.h @@ -178,7 +178,7 @@ class CallbackOnComplete { /*! \brief engine can see content of callback */ friend class ::mxnet::Engine; /*! \brief the real callback */ - void (*callback_)(Engine *, void *, const dmlc::Error *); + void (*callback_)(Engine*, void*, const dmlc::Error*); /*! \brief the engine class passed to callback */ Engine* engine_; /*! \brief the parameter set on callback */ @@ -209,7 +209,7 @@ enum class FnProperty { /*! * \brief Dependency engine that schedules operations. -*/ + */ class MXNET_API Engine { public: /*! \brief on start*/ @@ -266,9 +266,9 @@ class MXNET_API Engine { virtual OprHandle NewOperator(AsyncFn fn, std::vector const& const_vars, std::vector const& mutable_vars, - FnProperty prop = FnProperty::kNormal, + FnProperty prop = FnProperty::kNormal, const char* opr_name = nullptr, - bool wait = false) = 0; + bool wait = false) = 0; /*! * \brief Delete the given operator. * \param op The operator to delete. @@ -299,13 +299,14 @@ class MXNET_API Engine { * \param opr_name The operator name. * \param wait Whether this is a WaitForVar operation */ - virtual void PushAsync(AsyncFn exec_fun, Context exec_ctx, + virtual void PushAsync(AsyncFn exec_fun, + Context exec_ctx, std::vector const& const_vars, std::vector const& mutable_vars, - FnProperty prop = FnProperty::kNormal, - int priority = 0, + FnProperty prop = FnProperty::kNormal, + int priority = 0, const char* opr_name = nullptr, - bool wait = false) = 0; + bool wait = false) = 0; /*! * \brief Schedule the deletion of a variable. * @@ -317,9 +318,7 @@ class MXNET_API Engine { * \param exec_ctx Execution context. * \param var The variable to be deleted. */ - virtual void DeleteVariable(SyncFn delete_fn, - Context exec_ctx, - VarHandle var) = 0; + virtual void DeleteVariable(SyncFn delete_fn, Context exec_ctx, VarHandle var) = 0; /*! * \brief Wait for a variable. * \param var The variable we should wait for. This function returns when the @@ -359,11 +358,12 @@ class MXNET_API Engine { * \param opr_name The operator name. * \tparam SyncFn the synchronous function to be pushed. 
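MXSetEnv/MXGetEnv above give tests a way to flip backend-visible variables without touching the process environment directly; a sketch, with MXNET_ENGINE_TYPE as an example key:

#include <mxnet/c_api_test.h>
#include <cstdio>

// Sketch: set an env var in the backend and read it back.
void ToggleEngineEnv() {
  MXSetEnv("MXNET_ENGINE_TYPE", "NaiveEngine");
  const char* value = nullptr;
  MXGetEnv("MXNET_ENGINE_TYPE", &value);
  if (value != nullptr)
    std::printf("MXNET_ENGINE_TYPE=%s\n", value);
}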
*/ - virtual void PushSync(SyncFn exec_fn, Context exec_ctx, + virtual void PushSync(SyncFn exec_fn, + Context exec_ctx, std::vector const& const_vars, std::vector const& mutable_vars, - FnProperty prop = FnProperty::kNormal, - int priority = 0, + FnProperty prop = FnProperty::kNormal, + int priority = 0, const char* opr_name = nullptr) { this->PushAsync( [exec_fn](RunContext ctx, CallbackOnStart on_start, CallbackOnComplete on_complete) { @@ -398,28 +398,27 @@ class MXNET_API Engine { * \param callback th static callback function. * \param param the paramter passed to callback. */ - inline CallbackOnComplete CreateCallback( - void (*callback)(Engine *, void *, const dmlc::Error *), void *param) { + inline CallbackOnComplete CreateCallback(void (*callback)(Engine*, void*, const dmlc::Error*), + void* param) { CallbackOnComplete ret; ret.callback_ = callback; - ret.engine_ = this; - ret.param_ = param; + ret.engine_ = this; + ret.param_ = param; return ret; } // For each var vector, sort it and remove the duplicated vars. // Also remove vars from read_vars if it also appears in write_vars - inline void DeduplicateVarHandle(std::vector *read_vars, - std::vector *write_vars) { + inline void DeduplicateVarHandle(std::vector* read_vars, + std::vector* write_vars) { std::sort(write_vars->begin(), write_vars->end()); - write_vars->resize(std::unique(write_vars->begin(), write_vars->end()) - - write_vars->begin()); + write_vars->resize(std::unique(write_vars->begin(), write_vars->end()) - write_vars->begin()); std::sort(read_vars->begin(), read_vars->end()); - read_vars->resize(std::unique(read_vars->begin(), read_vars->end()) - - read_vars->begin()); - auto wit = write_vars->begin(); + read_vars->resize(std::unique(read_vars->begin(), read_vars->end()) - read_vars->begin()); + auto wit = write_vars->begin(); auto rtop = read_vars->begin(); for (auto rit = read_vars->begin(); rit != read_vars->end(); ++rit) { - while (wit != write_vars->end() && *wit < *rit) ++wit; + while (wit != write_vars->end() && *wit < *rit) + ++wit; if (wit == write_vars->end() || *wit != *rit) { *rtop = *rit; ++rtop; @@ -435,7 +434,7 @@ class MXNET_API Engine { virtual int set_bulk_size(int) { return 0; } -}; // class Engine +}; // class Engine #endif // DMLC_USE_CXX11 } // namespace mxnet #endif // MXNET_ENGINE_H_ diff --git a/include/mxnet/executor.h b/include/mxnet/executor.h index a432f0fc9e57..c5c3719fade2 100644 --- a/include/mxnet/executor.h +++ b/include/mxnet/executor.h @@ -66,7 +66,7 @@ class Executor { * \param step current step, user can always start from 0 * \param step_left Number of steps left to finish the forward. */ - virtual void PartialForward(bool is_train, int step, int *step_left) = 0; + virtual void PartialForward(bool is_train, int step, int* step_left) = 0; /*! * \brief Perform a Backward operation of the Operator. * This must be called after Forward. @@ -76,17 +76,17 @@ class Executor { * * \param head_grads the gradient of head nodes to be backproped. */ - virtual void Backward(const std::vector &head_grads, bool is_train = true) = 0; + virtual void Backward(const std::vector& head_grads, bool is_train = true) = 0; /*! * \brief print the execution plan info to output stream. * \param os the output stream we like to print to. */ - virtual void Print(std::ostream &os) const {} // NOLINT(*) + virtual void Print(std::ostream& os) const {} // NOLINT(*) /*! * \brief get array of outputs in the executor. * \return array of outputs in the executor. 
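DeduplicateVarHandle above sorts and uniques both vectors and then drops from read_vars anything already present in write_vars; a small sketch of the intended call pattern (the Engine::VarHandle values v1..v3 are assumed to come from the usual variable-creation path):

#include <mxnet/engine.h>
#include <vector>

// Sketch: after the call, read_vars holds {v1, v2} and write_vars holds {v3},
// since v3 appears in the write set and duplicates are removed.
void Dedup(mxnet::Engine* engine,
           mxnet::Engine::VarHandle v1,
           mxnet::Engine::VarHandle v2,
           mxnet::Engine::VarHandle v3) {
  std::vector<mxnet::Engine::VarHandle> read_vars  = {v1, v2, v2, v3};
  std::vector<mxnet::Engine::VarHandle> write_vars = {v3, v3};
  engine->DeduplicateVarHandle(&read_vars, &write_vars);
}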
*/ - virtual const std::vector &outputs() const = 0; + virtual const std::vector& outputs() const = 0; /*! * \brief get input argument map, key is arg name, value is arg's NDArray. * \return input argument map in the executor. @@ -107,64 +107,62 @@ class Executor { * but different input/output shapes. * * \param partial_shaping Whether to allow changing the shape of unspecified arguments. - * \param allow_up_sizing Whether to allow allocating new ndarrays that's larger than the original. - * \param default_ctx the default context of binding. - * \param ctx_map Context mapping group to context. - * \param provided_arg_shapes New shape for arguments. - * \param in_args the NDArray that stores the input arguments. - * \param arg_grads NDArray that is used to store the gradient output of the input arguments. - * \param aux_states NDArray that is used as internal states. - * \return a new executor. + * \param allow_up_sizing Whether to allow allocating new ndarrays that's larger than the + * original. \param default_ctx the default context of binding. \param ctx_map Context mapping + * group to context. \param provided_arg_shapes New shape for arguments. \param in_args the + * NDArray that stores the input arguments. \param arg_grads NDArray that is used to store the + * gradient output of the input arguments. \param aux_states NDArray that is used as internal + * states. \return a new executor. */ - virtual Executor* Reshape(const bool partial_shaping, - const bool allow_up_sizing, - const Context& default_ctx, - const std::map& ctx_map, - const std::unordered_map& - provided_arg_shapes, - std::vector* in_args, - std::vector* arg_grads, - std::vector* aux_states) = 0; + virtual Executor* Reshape( + const bool partial_shaping, + const bool allow_up_sizing, + const Context& default_ctx, + const std::map& ctx_map, + const std::unordered_map& provided_arg_shapes, + std::vector* in_args, + std::vector* arg_grads, + std::vector* aux_states) = 0; /*! * \brief Create an operator by bind symbol with context and arguments. - * If user do not want to compute the gradients of i-th argument, grad_req_type[i] can be kNullOp. + * If user do not want to compute the gradients of i-th argument, grad_req_type[i] can be + * kNullOp. * * \param default_ctx the default context of binding. * \param group2ctx Context mapping group to context. * \param symbol the symbol that specifies the output of Forward pass. * \param in_args the NDArray that stores the input arguments to the symbol. * \param arg_grad_store NDArray that is used to store the gradient output of the input arguments. - * \param grad_req_type requirment type of gradient saving. Can only be in {kNullOp, kAddTo, kWriteTo}. - * \param aux_states NDArray that is used as internal state in op - * \param shared_exec input executor to share memory with. - * \return a new executor. + * \param grad_req_type requirement type of gradient saving. Can only be in {kNullOp, kAddTo, + kWriteTo}. \param aux_states NDArray that is used as internal state in op \param shared_exec + input executor to share memory with. \return a new executor.
*/ - static Executor *Bind(nnvm::Symbol symbol, + static Executor* Bind(nnvm::Symbol symbol, const Context& default_ctx, const std::map& group2ctx, - const std::vector &in_args, - const std::vector &arg_grad_store, - const std::vector &grad_req_type, - const std::vector &aux_states, + const std::vector& in_args, + const std::vector& arg_grad_store, + const std::vector& grad_req_type, + const std::vector& aux_states, Executor* shared_exec = nullptr); - static Executor* SimpleBind(nnvm::Symbol symbol, - const Context& default_ctx, - const std::map& group2ctx, - const std::vector& in_arg_ctxes, - const std::vector& arg_grad_ctxes, - const std::vector& aux_state_ctxes, - const std::unordered_map& arg_shape_map, - const std::unordered_map& arg_dtype_map, - const std::unordered_map& arg_stype_map, - const std::vector& grad_req_types, - const std::unordered_set& param_names, - std::vector* in_args, - std::vector* arg_grads, - std::vector* aux_states, - std::unordered_map* - shared_data_arrays = nullptr, - Executor* shared_exec = nullptr); + static Executor* SimpleBind( + nnvm::Symbol symbol, + const Context& default_ctx, + const std::map& group2ctx, + const std::vector& in_arg_ctxes, + const std::vector& arg_grad_ctxes, + const std::vector& aux_state_ctxes, + const std::unordered_map& arg_shape_map, + const std::unordered_map& arg_dtype_map, + const std::unordered_map& arg_stype_map, + const std::vector& grad_req_types, + const std::unordered_set& param_names, + std::vector* in_args, + std::vector* arg_grads, + std::vector* aux_states, + std::unordered_map* shared_data_arrays = nullptr, + Executor* shared_exec = nullptr); /*! * \brief the prototype of user-defined monitor callback diff --git a/include/mxnet/expr_operator.h b/include/mxnet/expr_operator.h index c28761c0d1b9..8779d23aa6ab 100644 --- a/include/mxnet/expr_operator.h +++ b/include/mxnet/expr_operator.h @@ -33,17 +33,18 @@ namespace mxnet { -template +template inline PrimExpr MakeConstScalar(MXNetDataType t, ValueType value) { - if (t.is_int()) return IntImm(t, static_cast(value)); - if (t.is_float()) return FloatImm(t, static_cast(value)); + if (t.is_int()) + return IntImm(t, static_cast(value)); + if (t.is_float()) + return FloatImm(t, static_cast(value)); // customized type and uint is not supported for MXNet for now LOG(FATAL) << "cannot make const for type " << t; return PrimExpr(); } - -template +template inline PrimExpr make_const(MXNetDataType t, ValueType value) { if (t.lanes() == 1) { return MakeConstScalar(t, value); diff --git a/include/mxnet/imperative.h b/include/mxnet/imperative.h index 76ccf253d904..e4e3f6a938d0 100644 --- a/include/mxnet/imperative.h +++ b/include/mxnet/imperative.h @@ -35,18 +35,18 @@ #include "./ndarray.h" namespace mxnet { - /*! \brief there are three numpy shape flags based on priority. - * GlobalOn - * turn on numpy shape flag globally, it includes thread local. - * The flag can be seen in any thread. - * ThreadLocalOn - * only turn on thread local numpy shape flag, it cannot be seen - * in other threads. - * Off - * turn off numpy shape flag globally. - * */ - enum NumpyShape{Off, ThreadLocalOn, GlobalOn}; - typedef NumpyShape NumpyDefaultDtype; +/*! \brief there are three numpy shape flags based on priority. + * GlobalOn + * turn on numpy shape flag globally, it includes thread local. + * The flag can be seen in any thread. + * ThreadLocalOn + * only turn on thread local numpy shape flag, it cannot be seen + * in other threads. + * Off + * turn off numpy shape flag globally. 
+ * */ +enum NumpyShape { Off, ThreadLocalOn, GlobalOn }; +typedef NumpyShape NumpyDefaultDtype; /*! \brief runtime functions for NDArray */ class Imperative { public: @@ -61,13 +61,14 @@ class Imperative { // interested in (marked variables) bool fresh_out_grad; - AGInfo() : - grad_req(kNullOp), fresh_out_grad(false) {} + AGInfo() : grad_req(kNullOp), fresh_out_grad(false) {} static void Clear(const nnvm::ObjectPtr& node) { - if (node == nullptr || node->info.empty()) return; + if (node == nullptr || node->info.empty()) + return; AGInfo& info = Get(node); - if (info.grad_req != kNullOp) return; + if (info.grad_req != kNullOp) + return; node->info.clear(); } @@ -86,40 +87,38 @@ class Imperative { static bool IsVariable(const nnvm::ObjectPtr& node) { AGInfo& info = Get(node); - return info.grad_req != kNullOp && info.outputs.size() == 1 - && info.out_grads.size() == 1; + return info.grad_req != kNullOp && info.outputs.size() == 1 && info.out_grads.size() == 1; } }; /*! \brief DCInfo datastructure to enable deferred computation */ class DCInfo { public: - explicit DCInfo(const std::vector &inputs, - const std::vector &outputs); + explicit DCInfo(const std::vector& inputs, const std::vector& outputs); /*! \brief Compute the outputs of the associated operator. */ - static void Compute(const NDArray &arr); + static void Compute(const NDArray& arr); - static DCInfo &Get(const nnvm::ObjectPtr &node) { + static DCInfo& Get(const nnvm::ObjectPtr& node) { return dmlc::get(node->info); } - static bool IsNone(const NDArray &arr) { + static bool IsNone(const NDArray& arr) { return arr.deferredcompute_entry_.node == nullptr || arr.deferredcompute_entry_.node->info.empty(); } - static bool IsComputed(const NDArray &arr) { - return IsNone(arr) || - dmlc::get(arr.deferredcompute_entry_.node->info).is_computed_; + static bool IsComputed(const NDArray& arr) { + return IsNone(arr) || dmlc::get(arr.deferredcompute_entry_.node->info).is_computed_; } - static DCInfo &Create(const nnvm::ObjectPtr &node, - const std::vector &inputs, - const std::vector &outputs); + static DCInfo& Create(const nnvm::ObjectPtr& node, + const std::vector& inputs, + const std::vector& outputs); static void Clear(const nnvm::ObjectPtr& node) { - if (node == nullptr || node->info.empty()) return; + if (node == nullptr || node->info.empty()) + return; node->info.clear(); } @@ -146,7 +145,7 @@ class Imperative { * Note that the frontend may have deallocated the NDArray* and the * input_handles stored here may point to invalid memory. */ - std::vector input_handles_; + std::vector input_handles_; /*! \brief Copies of output NDArrays * @@ -168,9 +167,9 @@ class Imperative { } /*! \brief turn on or turn off operator recording for autograd. */ bool set_is_training(bool is_train) { - bool old = is_train_; - is_train_ = is_train; - return old; + bool old = is_train_; + is_train_ = is_train; + return old; } /*! \brief whether operator recording is on. */ bool is_recording() const { @@ -178,15 +177,17 @@ class Imperative { } /*! \brief turn on or turn off operator recording for autograd. */ bool set_is_recording(bool is_recording) { - bool old = is_recording_; - is_recording_ = is_recording; - return old; + bool old = is_recording_; + is_recording_ = is_recording; + return old; } /*! \brief whether deferred compute mode is on. */ - bool is_deferred_compute() const { return is_deferred_compute_; } + bool is_deferred_compute() const { + return is_deferred_compute_; + } /*! \brief turn on or turn off operator recording for autograd. 
*/ bool set_is_deferred_compute(bool is_deferred_compute) { - bool old = is_deferred_compute_; + bool old = is_deferred_compute_; is_deferred_compute_ = is_deferred_compute; return old; } @@ -197,24 +198,22 @@ class Imperative { if (is_np_shape_global_) { return NumpyShape::GlobalOn; } - return is_np_shape_thread_local_ ? - NumpyShape::ThreadLocalOn : - NumpyShape::Off; + return is_np_shape_thread_local_ ? NumpyShape::ThreadLocalOn : NumpyShape::Off; } /*! \brief specify numpy compatibility off, thread local on or global on. */ bool set_is_np_shape(int is_np_shape) { NumpyShape flag = static_cast(is_np_shape); - bool old = this->is_np_shape(); + bool old = this->is_np_shape(); switch (flag) { case GlobalOn: - is_np_shape_global_ = true; + is_np_shape_global_ = true; is_np_shape_thread_local_ = true; break; case ThreadLocalOn: is_np_shape_thread_local_ = true; break; case Off: - is_np_shape_global_ = false; + is_np_shape_global_ = false; is_np_shape_thread_local_ = false; break; } @@ -242,19 +241,19 @@ class Imperative { void RecordOp(nnvm::NodeAttrs&& attrs, const std::vector& inputs, const std::vector& outputs, - const OpStatePtr& state = OpStatePtr(), - std::vector* p_save_inputs = nullptr, + const OpStatePtr& state = OpStatePtr(), + std::vector* p_save_inputs = nullptr, std::vector* p_save_outputs = nullptr); /*! \brief to record operator, return corresponding node. */ void RecordDeferredCompute(nnvm::NodeAttrs&& attrs, const std::vector& inputs, const std::vector& outputs); /*! \brief obtain symbol representation of deferred compute session. */ - nnvm::Symbol GetDeferredComputeSymbol(const std::vector &outputs); + nnvm::Symbol GetDeferredComputeSymbol(const std::vector& outputs); /*! \brief associate arrays with variables for deferred compute */ - void SetDeferredComputeVariable(NDArrayHandle *arrays, SymbolHandle *variables, const int num); + void SetDeferredComputeVariable(NDArrayHandle* arrays, SymbolHandle* variables, const int num); /*! \brief clear info node associated with array */ - void DeferredComputeClear(NDArrayHandle *arrays, const int num); + void DeferredComputeClear(NDArrayHandle* arrays, const int num); /*! \brief */ OpStatePtr Invoke(const Context& default_ctx, const nnvm::NodeAttrs& attrs, @@ -278,7 +277,8 @@ class Imperative { std::vector Backward(const std::vector& outputs, const std::vector& ograds, const std::vector& variables, - bool is_train, bool retain_graph, + bool is_train, + bool retain_graph, bool create_graph); /*! \brief Return the marked nonleaf nodes. */ std::vector ListNonleafVariables(const nnvm::Symbol& sym) const; @@ -311,11 +311,11 @@ class Imperative { backward_bulk_size_ = BulkExecMaxNodeTrainBwd(); } /*! \brief find the input/output ndarrays that are needed for backward */ - void GetBackwardDependency( - const nnvm::ObjectPtr& node, - uint32_t num_inputs, uint32_t num_outputs, - std::vector *p_save_inputs, - std::vector *p_save_outputs); + void GetBackwardDependency(const nnvm::ObjectPtr& node, + uint32_t num_inputs, + uint32_t num_outputs, + std::vector* p_save_inputs, + std::vector* p_save_outputs); /*! \brief indicate whether is training. */ #if DMLC_CXX11_THREAD_LOCAL static thread_local bool is_train_; diff --git a/include/mxnet/io.h b/include/mxnet/io.h index aebc5f663def..4c2d7cfb20ca 100644 --- a/include/mxnet/io.h +++ b/include/mxnet/io.h @@ -38,7 +38,7 @@ namespace mxnet { * \brief iterator type * \tparam DType data type */ -template +template class IIterator : public dmlc::DataIter { public: /*! 
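The flag logic above gives GlobalOn precedence over the thread-local setting; assuming the usual Imperative::Get() singleton accessor, the observable behaviour is:

#include <mxnet/imperative.h>

// Sketch: ThreadLocalOn is visible only in the setting thread, while GlobalOn
// flips the flag for every thread, and is what is_np_shape() reports first.
void DemoNumpyShapeFlags() {
  using namespace mxnet;
  Imperative::Get()->set_is_np_shape(NumpyShape::ThreadLocalOn);
  auto flag = Imperative::Get()->is_np_shape();  // ThreadLocalOn, this thread only
  Imperative::Get()->set_is_np_shape(NumpyShape::GlobalOn);
  flag = Imperative::Get()->is_np_shape();       // GlobalOn, in any thread
  (void)flag;
}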
@@ -51,7 +51,7 @@ class IIterator : public dmlc::DataIter { /*! \brief move to next item */ virtual bool Next(void) = 0; /*! \brief get current data */ - virtual const DType &Value(void) const = 0; + virtual const DType& Value(void) const = 0; /*! \brief constructor */ virtual ~IIterator(void) {} /*! \brief store the name of each data, it could be used for making NDArrays */ @@ -94,14 +94,11 @@ struct DataBatch { }; // struct DataBatch /*! \brief typedef the factory function of data iterator */ -typedef std::function *()> DataIteratorFactory; +typedef std::function*()> DataIteratorFactory; /*! * \brief Registry entry for DataIterator factory functions. */ -struct DataIteratorReg - : public dmlc::FunctionRegEntryBase { -}; +struct DataIteratorReg : public dmlc::FunctionRegEntryBase {}; //-------------------------------------------------------------- // The following part are API Registration of Iterators //-------------------------------------------------------------- @@ -117,7 +114,7 @@ struct DataIteratorReg * }); * \endcode */ -#define MXNET_REGISTER_IO_ITER(name) \ +#define MXNET_REGISTER_IO_ITER(name) \ DMLC_REGISTRY_REGISTER(::mxnet::DataIteratorReg, DataIteratorReg, name) /*! @@ -129,29 +126,26 @@ struct DataIteratorReg class Dataset { public: /*! - * \brief Get the size of the dataset - */ + * \brief Get the size of the dataset + */ virtual uint64_t GetLen(void) const = 0; /*! - * \brief Get the ndarray items given index in dataset - * \param idx the integer index for required data - * \param ret the returned ndarray items - */ + * \brief Get the ndarray items given index in dataset + * \param idx the integer index for required data + * \param ret the returned ndarray items + */ virtual bool GetItem(uint64_t idx, std::vector* ret) = 0; // virtual destructor virtual ~Dataset(void) {} }; // class Dataset /*! \brief typedef the factory function of dataset */ -typedef std::function >&)> DatasetFactory; +typedef std::function >&)> + DatasetFactory; /*! * \brief Registry entry for Dataset factory functions. */ -struct DatasetReg - : public dmlc::FunctionRegEntryBase { -}; +struct DatasetReg : public dmlc::FunctionRegEntryBase {}; //-------------------------------------------------------------- // The following part are API Registration of Datasets //-------------------------------------------------------------- @@ -167,7 +161,7 @@ struct DatasetReg * }); * \endcode */ -#define MXNET_REGISTER_IO_DATASET(name) \ +#define MXNET_REGISTER_IO_DATASET(name) \ DMLC_REGISTRY_REGISTER(::mxnet::DatasetReg, DatasetReg, name) class BatchifyFunction { @@ -182,15 +176,13 @@ class BatchifyFunction { using BatchifyFunctionPtr = std::shared_ptr; /*! \brief typedef the factory function of data sampler */ -typedef std::function >&)> BatchifyFunctionFactory; +typedef std::function >&)> + BatchifyFunctionFactory; /*! * \brief Registry entry for DataSampler factory functions. 
  */
 struct BatchifyFunctionReg
-    : public dmlc::FunctionRegEntryBase<BatchifyFunctionReg, BatchifyFunctionFactory> {
-};
+    : public dmlc::FunctionRegEntryBase<BatchifyFunctionReg, BatchifyFunctionFactory> {};
 //--------------------------------------------------------------
 // The following part are API Registration of Batchify Function
 //--------------------------------------------------------------
@@ -206,7 +198,7 @@ struct BatchifyFunctionReg
  *     });
  * \endcode
  */
-#define MXNET_REGISTER_IO_BATCHIFY_FUNCTION(name) \
+#define MXNET_REGISTER_IO_BATCHIFY_FUNCTION(name) \
   DMLC_REGISTRY_REGISTER(::mxnet::BatchifyFunctionReg, BatchifyFunctionReg, name)
 }  // namespace mxnet
 #endif  // MXNET_IO_H_
diff --git a/include/mxnet/ir/expr.h b/include/mxnet/ir/expr.h
index a9f4ff2bbf70..53053dec674b 100644
--- a/include/mxnet/ir/expr.h
+++ b/include/mxnet/ir/expr.h
@@ -100,7 +100,7 @@ class PrimExprNode : public BaseExprNode {
  */
 class PrimExpr : public BaseExpr {
  public:
-  /*! \brief Cosntructor */
+  /*! \brief Constructor */
   PrimExpr() {}
   /*!
    * \brief Constructor from object ptr.
diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h
index 0907d2d04e6f..9be22e97e9a8 100644
--- a/include/mxnet/kvstore.h
+++ b/include/mxnet/kvstore.h
@@ -45,9 +45,7 @@ namespace mxnet {
  * kPause allows pausing and resuming of profiler
  * kDump asks profiler to dump output
  */
-enum class KVStoreServerProfilerCommand {
-  kSetConfig, kState, kPause, kDump
-};
+enum class KVStoreServerProfilerCommand { kSetConfig, kState, kPause, kDump };
 
 /*!
  * \brief distributed key-value store
@@ -70,20 +68,22 @@ class KVStore {
    *  - 'dist_*' : multi-machines
    * \return a newly created KVStore.
    */
-  static KVStore *Create(const char *type = "local");
+  static KVStore* Create(const char* type = "local");
 
   /**
    * \brief return the type
    */
-  inline const std::string& type() { return type_; }
+  inline const std::string& type() {
+    return type_;
+  }
 
   /**
    * \brief Set parameters to use low-bit compressed gradients
    * \param compression_type type of compression
    * \param threshold threshold for 2bit compression
    */
-  virtual void SetGradientCompression(const std::vector<std::pair<std::string, std::string> >
-                                      & kwargs) = 0;
+  virtual void SetGradientCompression(
+      const std::vector<std::pair<std::string, std::string>>& kwargs) = 0;
 
   /*!
   * \brief Initialize a list of key-value pair to the store.
@@ -101,8 +101,7 @@ class KVStore {
    * \param keys a list of unique keys
    * \param values a list of values
    */
-  virtual void Init(const std::vector<int>& keys,
-                    const std::vector<NDArray>& values) = 0;
+  virtual void Init(const std::vector<int>& keys, const std::vector<NDArray>& values) = 0;
   /*!
    * \brief Initialize a list of key-value pair to the store.
    * \param keys a list of unique keys in string format
@@ -148,7 +147,7 @@ class KVStore {
    */
   virtual void Push(const std::vector<int>& keys,
                     const std::vector<NDArray>& values,
-                    int priority = 0) = 0;
+                    int priority = 0) = 0;
 
   /*!
    * \brief push a list of key-value pairs into the store
    * \param keys the list of keys in string format
    * \param values the list of values
    */
   virtual void Push(const std::vector<std::string>& str_keys,
                     const std::vector<NDArray>& values,
-                    int priority = 0) = 0;
+                    int priority = 0) = 0;
   /*!
    * \brief pull a list of key-value pairs from the store
    *
@@ -185,7 +184,8 @@ class KVStore {
    */
   virtual void Pull(const std::vector<int>& keys,
                     const std::vector<NDArray*>& values,
-                    int priority = 0, bool ignore_sparse = true) = 0;
+                    int priority       = 0,
+                    bool ignore_sparse = true) = 0;
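
A short usage sketch (not part of the patch) tying the calls above together; weight and grad stand in for NDArrays the caller already owns, and the element types follow the upstream header since the template arguments here are only partially legible:

    KVStore* kv = KVStore::Create("local");
    std::vector<int> keys = {0};
    kv->Init(keys, {weight});                // seed the store with an initial value
    kv->Push(keys, {grad});                  // send a gradient (default priority 0)
    std::vector<NDArray*> outs = {&weight};
    kv->Pull(keys, outs);                    // fetch the aggregated result

   /*!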
* \brief pull a list of key-value pairs from the store * \param keys the list of keys in string format @@ -195,7 +195,8 @@ class KVStore { */ virtual void Pull(const std::vector& str_keys, const std::vector& values, - int priority = 0, bool ignore_sparse = true) = 0; + int priority = 0, + bool ignore_sparse = true) = 0; /*! * \brief broadcast a list of key-value pairs from the store @@ -214,10 +215,9 @@ class KVStore { /*! * \brief broadcast a list of key-value pairs from the store * \param vkeys the list of keys to be pushed in string format - * \param okeys the list of keys to be pulled in string format. Should be the same set of keys in vkeys. - * \param values the list of values to be pushed - * \param outs the list of buffers for the pulled data, they should be preallocated - * \param priority Priority of the action. + * \param okeys the list of keys to be pulled in string format. Should be the same set of keys in + * vkeys. \param values the list of values to be pushed \param outs the list of buffers for the + * pulled data, they should be preallocated \param priority Priority of the action. */ virtual void Broadcast(const std::vector& str_vkeys, const std::vector& str_okeys, @@ -242,10 +242,9 @@ class KVStore { /*! * \brief push and pull a list of key-value pairs from the store * \param vkeys the list of keys to be pushed in string format - * \param okeys the list of keys to be pulled in string format. Should be the same set of keys in vkeys. - * \param values the list of values to be pushed - * \param outs the list of buffers for the pulled data, they should be preallocated - * \param priority Priority of the action. + * \param okeys the list of keys to be pulled in string format. Should be the same set of keys in + * vkeys. \param values the list of values to be pushed \param outs the list of buffers for the + * pulled data, they should be preallocated \param priority Priority of the action. */ virtual void PushPull(const std::vector& str_vkeys, const std::vector& str_okeys, @@ -358,7 +357,8 @@ class KVStore { void set_barrier_before_exit(const bool barrier_before_exit) { #if MXNET_USE_DIST_KVSTORE - if (!IsWorkerNode()) LOG(FATAL) << "barrier_before_exit takes effect only on worker nodes"; + if (!IsWorkerNode()) + LOG(FATAL) << "barrier_before_exit takes effect only on worker nodes"; barrier_before_exit_ = barrier_before_exit; #else LOG(FATAL) << "compile with USE_DIST_KVSTORE=1 to enable barrier"; @@ -415,7 +415,7 @@ class KVStore { * all of them are reached this point. It doesn't guarantee that all * operations issued before are actually finished, such as \ref Push and \ref Pull. 
    */
-  virtual void Barrier() { }
+  virtual void Barrier() {}
 
   /**
    * \brief Send a command to all server nodes
    *
@@ -428,7 +428,7 @@ class KVStore {
    * \param cmd_id the head of the command
    * \param cmd_body the body of the command
    */
-  virtual void SendCommandToServers(int cmd_id, const std::string& cmd_body) { }
+  virtual void SendCommandToServers(int cmd_id, const std::string& cmd_body) {}
 
   /**
    * \brief Sends server profiler commands to all server nodes
    *
@@ -462,7 +462,7 @@ class KVStore {
    *
    * \param controller the user-defined server controller
    */
-  virtual void RunServer(const Controller& controller) { }
+  virtual void RunServer(const Controller& controller) {}
 
  protected:
   /**
diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index f9525a28c4d4..dfdca6c6c588 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -47,8 +47,8 @@
 #include
 #if defined(__NVCC__)
-  #include <cuda_runtime.h>
-  #include <curand_kernel.h>
+#include <cuda_runtime.h>
+#include <curand_kernel.h>
 #endif
 
 /* Make sure to update the version number every time you make changes */
@@ -60,9 +60,9 @@
  * see https://labjack.com/news/simple-cpp-symbol-visibility-demo for details
 */
 #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
-  #define PRIVATE_SYMBOL
+#define PRIVATE_SYMBOL
 #else
-  #define PRIVATE_SYMBOL __attribute__ ((visibility ("hidden")))
+#define PRIVATE_SYMBOL __attribute__((visibility("hidden")))
 #endif
 
 /*
@@ -94,120 +94,120 @@
 #ifdef __cplusplus
 extern "C" {
-  #endif
+#endif
+/*!
+ * \brief The device type in DLContext.
+ */
+typedef enum {
+  /*! \brief CPU device */
+  kDLCPU = 1,
+  /*! \brief CUDA GPU device */
+  kDLGPU = 2,
   /*!
-   * \brief The device type in DLContext.
+   * \brief Pinned CUDA GPU device by cudaMallocHost
+   * \note kDLCPUPinned = kDLCPU | kDLGPU
    */
-  typedef enum {
-    /*! \brief CPU device */
-    kDLCPU = 1,
-    /*! \brief CUDA GPU device */
-    kDLGPU = 2,
-    /*!
-     * \brief Pinned CUDA GPU device by cudaMallocHost
-     * \note kDLCPUPinned = kDLCPU | kDLGPU
-     */
-    kDLCPUPinned = 3,
-    /*! \brief OpenCL devices. */
-    kDLOpenCL = 4,
-    /*! \brief Vulkan buffer for next generation graphics. */
-    kDLVulkan = 7,
-    /*! \brief Metal for Apple GPU. */
-    kDLMetal = 8,
-    /*! \brief Verilog simulator buffer */
-    kDLVPI = 9,
-    /*! \brief ROCm GPUs for AMD GPUs */
-    kDLROCM = 10,
-    /*!
-     * \brief Reserved extension device type,
-     * used for quickly test extension device
-     * The semantics can differ depending on the implementation.
-     */
-    kDLExtDev = 12,
-  } DLDeviceType;
-
+  kDLCPUPinned = 3,
+  /*! \brief OpenCL devices. */
+  kDLOpenCL = 4,
+  /*! \brief Vulkan buffer for next generation graphics. */
+  kDLVulkan = 7,
+  /*! \brief Metal for Apple GPU. */
+  kDLMetal = 8,
+  /*! \brief Verilog simulator buffer */
+  kDLVPI = 9,
+  /*! \brief ROCm GPUs for AMD GPUs */
+  kDLROCM = 10,
   /*!
-   * \brief A Device context for Tensor and operator.
+   * \brief Reserved extension device type,
+   * used to quickly test extension devices.
+   * The semantics can differ depending on the implementation.
    */
-  typedef struct {
-    /*! \brief The device type used in the device. */
-    DLDeviceType device_type;
-    /*! \brief The device index */
-    int device_id;
-  } DLContext;
+  kDLExtDev = 12,
+} DLDeviceType;
+
+/*!
+ * \brief A Device context for Tensor and operator.
+ */
+typedef struct {
+  /*! \brief The device type used in the device. */
+  DLDeviceType device_type;
+  /*! \brief The device index */
+  int device_id;
+} DLContext;
+
+/*!
+ * \brief The type code options DLDataType.
+ */
+typedef enum {
+  kDLInt   = 0U,
+  kDLUInt  = 1U,
+  kDLFloat = 2U,
+} DLDataTypeCode;
+/*!
+ * \brief The data type the tensor can hold.
+ *
+ *  Examples
+ *   - float: type_code = 2, bits = 32, lanes=1
+ *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4
+ *   - int8: type_code = 0, bits = 8, lanes=1
+ */
+typedef struct {
   /*!
-   * \brief The type code options DLDataType.
+   * \brief Type code of base types.
+   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
+   * footprint, but the value should be one of DLDataTypeCode enum values.
+   * */
+  uint8_t code;
+  /*!
+   * \brief Number of bits, common choices are 8, 16, 32.
    */
-  typedef enum {
-    kDLInt = 0U,
-    kDLUInt = 1U,
-    kDLFloat = 2U,
-  } DLDataTypeCode;
+  uint8_t bits;
+  /*! \brief Number of lanes in the type, used for vector types. */
+  uint16_t lanes;
+} DLDataType;
+
+/*!
+ * \brief Plain C Tensor object, does not manage memory.
+ */
+typedef struct {
   /*!
-   * \brief The data type the tensor can hold.
+   * \brief The opaque data pointer points to the allocated data. This will be
+   * CUDA device pointer or cl_mem handle in OpenCL. This pointer is always
+   * aligned to 256 bytes as in CUDA.
+   *
+   * For given DLTensor, the size of memory required to store the contents of
+   * data is calculated as follows:
    *
-   * Examples
-   *  - float: type_code = 2, bits = 32, lanes=1
-   *  - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4
-   *  - int8: type_code = 0, bits = 8, lanes=1
+   * \code{.c}
+   * static inline size_t GetDataSize(const DLTensor* t) {
+   *   size_t size = 1;
+   *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
+   *     size *= t->shape[i];
+   *   }
+   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
+   *   return size;
+   * }
+   * \endcode
    */
-  typedef struct {
-    /*!
-     * \brief Type code of base types.
-     * We keep it uint8_t instead of DLDataTypeCode for minimal memory
-     * footprint, but the value should be one of DLDataTypeCode enum values.
-     * */
-    uint8_t code;
-    /*!
-     * \brief Number of bits, common choices are 8, 16, 32.
-     */
-    uint8_t bits;
-    /*! \brief Number of lanes in the type, used for vector types. */
-    uint16_t lanes;
-  } DLDataType;
-
+  void* data;
+  /*! \brief The device context of the tensor */
+  DLContext ctx;
+  /*! \brief Number of dimensions */
+  int ndim;
+  /*! \brief The data type of the pointer */
+  DLDataType dtype;
+  /*! \brief The shape of the tensor */
+  int64_t* shape;
   /*!
-   * \brief Plain C Tensor object, does not manage memory.
+   * \brief strides of the tensor (in number of elements, not bytes)
+   * can be nullptr, indicating tensor is compact and row-major.
    */
-  typedef struct {
-    /*!
-     * \brief The opaque data pointer points to the allocated data. This will be
-     * CUDA device pointer or cl_mem handle in OpenCL. This pointer is always
-     * aligns to 256 bytes as in CUDA.
-     *
-     * For given DLTensor, the size of memory required to store the contents of
-     * data is calculated as follows:
-     *
-     * \code{.c}
-     * static inline size_t GetDataSize(const DLTensor* t) {
-     *   size_t size = 1;
-     *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
-     *     size *= t->shape[i];
-     *   }
-     *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
-     *   return size;
-     * }
-     * \endcode
-     */
-    void* data;
-    /*! \brief The device context of the tensor */
-    DLContext ctx;
-    /*! \brief Number of dimensions */
-    int ndim;
-    /*! \brief The data type of the pointer*/
-    DLDataType dtype;
-    /*! \brief The shape of the tensor */
-    int64_t* shape;
-    /*!
-     * \brief strides of the tensor (in number of elements, not bytes)
-     * can be nullptr, indicating tensor is compact and row-majored.
-     */
-    int64_t* strides;
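
A small worked example (not part of the patch) of filling in these plain C structs; every name comes from this header, and the 24-byte figure follows directly from the GetDataSize() formula quoted above:

    DLContext gpu0;
    gpu0.device_type = kDLGPU;  // CUDA device ...
    gpu0.device_id   = 0;       // ... with index 0

    DLDataType f32;
    f32.code  = kDLFloat;       // type_code = 2
    f32.bits  = 32;
    f32.lanes = 1;              // scalar, not vectorized

    // A 2x3 tensor of this dtype occupies 2 * 3 * (32 * 1 + 7) / 8 = 24 bytes.

-    /*!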
\brief The offset in bytes to the beginning pointer to data */ - uint64_t byte_offset; - } DLTensor; + int64_t* strides; + /*! \brief The offset in bytes to the beginning pointer to data */ + uint64_t byte_offset; +} DLTensor; #ifdef __cplusplus } // DLPACK_EXTERN_C #endif @@ -250,11 +250,11 @@ enum MXDType { kFloat32 = 0, kFloat64 = 1, kFloat16 = 2, - kUint8 = 3, - kInt32 = 4, - kInt8 = 5, - kInt64 = 6, - kUNSET = 100, + kUint8 = 3, + kInt32 = 4, + kInt8 = 5, + kInt64 = 6, + kUNSET = 100, }; /* @@ -288,14 +288,14 @@ struct MXContext { }; enum MXReturnValue { - MX_FAIL = 0, + MX_FAIL = 0, MX_SUCCESS = 1, }; // For sparse tensors, read/write the data from NDarray via pointers. struct MXSparse { // Pointer to data. - void *data{nullptr}; + void* data{nullptr}; // length of (non-zero) data. int64_t data_len; @@ -310,8 +310,13 @@ struct MXSparse { int64_t* indptr = nullptr; int64_t indptr_len; - void set(void *data_ptr, const int64_t* dims, int ndims, void *idx, - int64_t num_idx, void *idx_ptr = nullptr, int64_t num_idx_ptr = 0); + void set(void* data_ptr, + const int64_t* dims, + int ndims, + void* idx, + int64_t num_idx, + void* idx_ptr = nullptr, + int64_t num_idx_ptr = 0); }; /*! @@ -320,18 +325,27 @@ struct MXSparse { struct MXTensor { MXTensor(); MXTensor(const MXTensor& oth); - MXTensor(void *data_ptr, std::vector shape, MXDType dtype, - size_t vID, MXContext mx_ctx, MXStorageType stype = kDefaultStorage); + MXTensor(void* data_ptr, + std::vector shape, + MXDType dtype, + size_t vID, + MXContext mx_ctx, + MXStorageType stype = kDefaultStorage); /*! \brief populate internal tensor fields */ - void setTensor(void *dptr, MXDType type, const int64_t* dims, int ndims, - size_t vID, MXContext mx_ctx, MXStorageType storage_type); + void setTensor(void* dptr, + MXDType type, + const int64_t* dims, + int ndims, + size_t vID, + MXContext mx_ctx, + MXStorageType storage_type); /*! \brief populate DLTensor fields */ void setDLTensor(); /*! \brief helper function to cast data pointer */ - template + template inline data_type* data() { return reinterpret_cast(data_ptr); } @@ -340,11 +354,11 @@ struct MXTensor { int64_t size() const; /*! \brief helper function to compare two MXTensors */ - bool isSame(const MXTensor &oth) const; + bool isSame(const MXTensor& oth) const; // For dense, data_ptr points to 1D flattened tensor data // For sparse, data_ptr points to MXSparse - void *data_ptr; + void* data_ptr; // shape is in [2,3,4] format to represent high-dim tensor std::vector shape; @@ -371,16 +385,22 @@ typedef void* (*xpu_malloc_t)(void*, int); /*! \brief sparse alloc function to allocate memory inside Forward/Backward functions */ typedef void (*sparse_malloc_t)(void*, int, int, int, void**, int64_t**, int64_t**); /*! \brief resource malloc function to allocate ndarrays for graph passes */ -typedef void (*nd_malloc_t)(const void* _ndarray_alloc, const int64_t* shapes, int num_shapes, - const char* dev_str, int dev_id, int dtype, const char* name, - int isArg, void** data); +typedef void (*nd_malloc_t)(const void* _ndarray_alloc, + const int64_t* shapes, + int num_shapes, + const char* dev_str, + int dev_id, + int dtype, + const char* name, + int isArg, + void** data); /*! 
\brief GPU stream pointer, is void* when not compiled with CUDA */ #if defined(__NVCC__) - typedef cudaStream_t mx_stream_t; - typedef curandStatePhilox4_32_10_t mx_gpu_rand_t; +typedef cudaStream_t mx_stream_t; +typedef curandStatePhilox4_32_10_t mx_gpu_rand_t; #else - typedef void* mx_stream_t; - typedef void* mx_gpu_rand_t; +typedef void* mx_stream_t; +typedef void* mx_gpu_rand_t; #endif typedef std::mt19937 mx_cpu_rand_t; @@ -394,15 +414,20 @@ class PassResource { public: PassResource(std::unordered_map* new_args, std::unordered_map* new_aux, - nd_malloc_t nd_malloc, const void* nd_alloc); + nd_malloc_t nd_malloc, + const void* nd_alloc); // allocate new arg param, adds to args map, returns newly allocated tensor - MXTensor* alloc_arg(const std::string& name, const std::vector& shapes, - const MXContext &ctx, MXDType dtype) const; + MXTensor* alloc_arg(const std::string& name, + const std::vector& shapes, + const MXContext& ctx, + MXDType dtype) const; // allocate new aux param, adds to aux map, returns newly allocated tensor - MXTensor* alloc_aux(const std::string& name, const std::vector& shapes, - const MXContext &ctx, MXDType dtype) const; + MXTensor* alloc_aux(const std::string& name, + const std::vector& shapes, + const MXContext& ctx, + MXDType dtype) const; private: std::unordered_map* new_args_; @@ -416,10 +441,15 @@ class PassResource { */ class OpResource { public: - OpResource(xpu_malloc_t cpu_malloc_fp, void* cpu_alloc_fp, - xpu_malloc_t gpu_malloc_fp, void* gpu_alloc_fp, void* stream, - sparse_malloc_t sparse_malloc_fp, void* sparse_alloc_fp, - void* rng_cpu_states, void* rng_gpu_states); + OpResource(xpu_malloc_t cpu_malloc_fp, + void* cpu_alloc_fp, + xpu_malloc_t gpu_malloc_fp, + void* gpu_alloc_fp, + void* stream, + sparse_malloc_t sparse_malloc_fp, + void* sparse_alloc_fp, + void* rng_cpu_states, + void* rng_gpu_states); /*! \brief allocate cpu memory controlled by MXNet */ void* alloc_cpu(int size) const; @@ -452,11 +482,11 @@ class OpResource { /*! \brief lambda function to return allocated memory handle */ void *cpu_alloc, *gpu_alloc; /*! \brief cuda stream passed from MXNet */ - void *cuda_stream; + void* cuda_stream; /*! \brief sparse allocation lambda function */ sparse_malloc_t sparse_malloc; /*! \brief lambda function to return allocated sparse memory handle */ - void *sparse_alloc; + void* sparse_alloc; /*! \brief cpu and gpu rng fully inited and seeded states */ void *rand_cpu_states, *rand_gpu_states; }; @@ -484,7 +514,7 @@ std::string getShapeAt(const std::string& shape, unsigned index); * Examples: * * getDtypeAt("[1]", 0) returns "1" - * getDtypeAt("[1,2]", 1) returns "2" + * getDtypeAt("[1,2]", 1) returns "2" */ std::string getDtypeAt(const std::string& dtype, unsigned index); @@ -492,7 +522,7 @@ std::string getDtypeAt(const std::string& dtype, unsigned index); * \brief Json utility to parse serialized subgraph symbol */ /*! \brief Types of JSON objects */ -enum JsonType {ERR, STR, NUM, LIST, MAP}; +enum JsonType { ERR, STR, NUM, LIST, MAP }; /*! 
\brief definition of JSON objects */
 struct JsonVal {
@@ -505,7 +535,7 @@ struct JsonVal {
   explicit JsonVal(int n);
   // complex constructor
   JsonVal(JsonType t, int n, std::string s);
-  bool operator<(const JsonVal &o) const;
+  bool operator<(const JsonVal& o) const;
 
   // convert JSON object back to JSON-compatible string
   std::string dump() const;
@@ -526,7 +556,7 @@ struct JsonVal {
   static JsonVal parse_map(const std::string& json, unsigned int* idx);
 
   // generic parse function
-  static JsonVal parse(const std::string& json, unsigned int *idx);
+  static JsonVal parse(const std::string& json, unsigned int* idx);
 
   // debug function to convert data structure to a debug string
   std::string toString() const;
@@ -547,7 +577,7 @@ class Graph;
 
 // Representation of an input/output to a node
 struct NodeEntry {
   Node* node;  // other node that's producing/consuming inputs/outputs
-  int entry;  // entry index from other node (ie. output index from producing node)
+  int entry;   // entry index from other node (ie. output index from producing node)
 };
 
 // Representation of a node in the graph
@@ -559,19 +589,17 @@ class Node {
   void _setPassResource(PassResource* res_);
 
   /* \brief allocate an arg tensor for this node */
-  void alloc_arg(const std::vector<int64_t>& shapes,
-                 const MXContext &ctx, MXDType dtype);
+  void alloc_arg(const std::vector<int64_t>& shapes, const MXContext& ctx, MXDType dtype);
 
   /* \brief allocate an aux tensor for this node */
-  void alloc_aux(const std::vector<int64_t>& shapes,
-                 const MXContext &ctx, MXDType dtype);
-
-  std::string op;  // operator name (ie. Convolution)
-  std::string name;  // unique node name (ie. conv_0 or conv_1)
-  MXTensor* tensor;  // tensor data for input nodes
-  std::vector<NodeEntry> inputs;  // set of inputs to the node
-  std::vector<NodeEntry> outputs;  // set of outputs from the node
-  std::vector<Graph*> subgraphs;  // set of subgraphs within this node
+  void alloc_aux(const std::vector<int64_t>& shapes, const MXContext& ctx, MXDType dtype);
+
+  std::string op;                  // operator name (ie. Convolution)
+  std::string name;                // unique node name (ie. conv_0 or conv_1)
+  MXTensor* tensor;                // tensor data for input nodes
+  std::vector<NodeEntry> inputs;   // set of inputs to the node
+  std::vector<NodeEntry> outputs;  // set of outputs from the node
+  std::vector<Graph*> subgraphs;   // set of subgraphs within this node
   std::unordered_map<std::string, std::string> attrs;  // node attributes
 
 private:
@@ -599,7 +627,8 @@ class Graph {
   std::string toString() const;
 
   /* \brief visits a node "n" */
-  void _dfs_util(Node* n, std::unordered_set<Node*>* to_visit,
+  void _dfs_util(Node* n,
+                 std::unordered_set<Node*>* to_visit,
                  std::function<void(Node*)> handler) const;
 
   /* \brief post-order DFS graph traversal */
@@ -668,8 +697,7 @@ class CustomOpSelector {
    *   candidates - indices of nodes to include in subgraph
    *   keep - indices of nodes to keep in subgraph
   */
-  virtual void Filter(const std::vector<int>& candidates,
-                      std::vector<int>* keep) {
+  virtual void Filter(const std::vector<int>& candidates, std::vector<int>* keep) {
    keep->insert(keep->end(), candidates.begin(), candidates.end());
  }
   /* \brief Reset any selector state, called after growing subgraph, before filter
@@ -688,14 +716,16 @@ class CustomStatefulOp {
   CustomStatefulOp();
   virtual ~CustomStatefulOp();
 
-  template<class A, typename... Ts>
-  static CustomStatefulOp* create(Ts...args) {
+  template <class A, typename... Ts>
+  static CustomStatefulOp* create(Ts...
args) { CustomStatefulOp* op = new A(args...); - op->created = true; + op->created = true; return op; } - bool wasCreated() { return created; } + bool wasCreated() { + return created; + } virtual MXReturnValue Forward(std::vector* inputs, std::vector* outputs, @@ -714,35 +744,34 @@ class CustomStatefulOp { }; /*! \brief Custom Operator function templates */ -typedef MXReturnValue (*fcomp_t)(const std::unordered_map& attributes, +typedef MXReturnValue (*fcomp_t)(const std::unordered_map& attributes, std::vector* inputs, std::vector* outputs, const OpResource& res); -typedef MXReturnValue (*parseAttrs_t)(const std::unordered_map& attributes, - int* num_inputs, int* num_outputs); -typedef MXReturnValue (*inferType_t)(const std::unordered_map& attributes, +typedef MXReturnValue (*parseAttrs_t)( + const std::unordered_map& attributes, + int* num_inputs, + int* num_outputs); +typedef MXReturnValue (*inferType_t)(const std::unordered_map& attributes, std::vector* in_types, std::vector* out_types); -typedef MXReturnValue (*inferSType_t)(const std::unordered_map& attributes, - std::vector* in_storage_types, - std::vector* out_storage_types); -typedef MXReturnValue (*inferShape_t)(const std::unordered_map& attributes, - std::vector >* in_shapes, - std::vector >* out_shapes); -typedef MXReturnValue (*mutateInputs_t)(const std::unordered_map& attributes, - std::vector* input_indices); -typedef MXReturnValue (*createOpState_t)(const std::unordered_map& attributes, - const MXContext& ctx, - const std::vector >& in_shapes, - const std::vector in_types, - CustomStatefulOp**); +typedef MXReturnValue (*inferSType_t)( + const std::unordered_map& attributes, + std::vector* in_storage_types, + std::vector* out_storage_types); +typedef MXReturnValue (*inferShape_t)( + const std::unordered_map& attributes, + std::vector >* in_shapes, + std::vector >* out_shapes); +typedef MXReturnValue (*mutateInputs_t)( + const std::unordered_map& attributes, + std::vector* input_indices); +typedef MXReturnValue (*createOpState_t)( + const std::unordered_map& attributes, + const MXContext& ctx, + const std::vector >& in_shapes, + const std::vector in_types, + CustomStatefulOp**); /*! * \brief Class to hold custom operator registration @@ -816,19 +845,20 @@ class CustomPass { }; /*! \brief Custom Subgraph Create function template */ -typedef MXReturnValue (*supportedOps_t)(const mxnet::ext::Graph *graph, std::vector* ids, - const std::unordered_map& options); -typedef MXReturnValue (*createSelector_t)(const mxnet::ext::Graph *graph, - CustomOpSelector** sel_inst, - const std::unordered_map& options); -typedef MXReturnValue (*reviewSubgraph_t)(const mxnet::ext::Graph *subgraph, int subgraph_id, - bool* accept, - const std::unordered_map& options, - std::unordered_map* attrs); +typedef MXReturnValue (*supportedOps_t)( + const mxnet::ext::Graph* graph, + std::vector* ids, + const std::unordered_map& options); +typedef MXReturnValue (*createSelector_t)( + const mxnet::ext::Graph* graph, + CustomOpSelector** sel_inst, + const std::unordered_map& options); +typedef MXReturnValue (*reviewSubgraph_t)( + const mxnet::ext::Graph* subgraph, + int subgraph_id, + bool* accept, + const std::unordered_map& options, + std::unordered_map* attrs); /*! 
* \brief An abstract class for subgraph property @@ -839,8 +869,7 @@ class CustomPartitioner { explicit CustomPartitioner(const char* backend_name); - CustomPartitioner& addStrategy(const char* prop_name, - const char* sg_name); + CustomPartitioner& addStrategy(const char* prop_name, const char* sg_name); CustomPartitioner& setSupportedOps(const char* prop_name, supportedOps_t fn); @@ -885,7 +914,7 @@ class Registry { * \returns new object associated with registered name */ T& add(const char* name) { - T *entry = new T(name); + T* entry = new T(name); entries.push_back(entry); return *entry; } @@ -910,34 +939,35 @@ class Registry { * Annoyingly, the concat_ and concat macros are necessary to * be able to use __COUNTER__ in an identifier name */ -#define MX_STR_CONCAT_(__a, __b) __a ## __b -#define MX_STR_CONCAT(__a, __b) MX_STR_CONCAT_(__a, __b) +#define MX_STR_CONCAT_(__a, __b) __a##__b +#define MX_STR_CONCAT(__a, __b) MX_STR_CONCAT_(__a, __b) /*! \brief convert a token to a string */ #define MX_STRINGIFY(x) #x -#define MX_TOSTRING(x) MX_STRINGIFY(x) +#define MX_TOSTRING(x) MX_STRINGIFY(x) /*! \brief declare a variable with custom name */ -#define MX_REGISTER_NAME_(Name) MXNet ## _CustomOp ## _ ## Name -#define MX_REGISTER_DEF_(Name) mxnet::ext::CustomOp MX_REGISTER_NAME_(Name) +#define MX_REGISTER_NAME_(Name) MXNet##_CustomOp##_##Name +#define MX_REGISTER_DEF_(Name) mxnet::ext::CustomOp MX_REGISTER_NAME_(Name) -#define MX_REGISTER_PROP_NAME_(Name) MXNet ## _CustomSubProp ## _ ## Name -#define MX_REGISTER_PROP_DEF_(Name) mxnet::ext::CustomPartitioner MX_REGISTER_PROP_NAME_(Name) +#define MX_REGISTER_PROP_NAME_(Name) MXNet##_CustomSubProp##_##Name +#define MX_REGISTER_PROP_DEF_(Name) mxnet::ext::CustomPartitioner MX_REGISTER_PROP_NAME_(Name) -#define MX_REGISTER_PASS_NAME_(Name) MXNet ## _CustomPass ## _ ## Name -#define MX_REGISTER_PASS_DEF_(Name) mxnet::ext::CustomPass MX_REGISTER_PASS_NAME_(Name) +#define MX_REGISTER_PASS_NAME_(Name) MXNet##_CustomPass##_##Name +#define MX_REGISTER_PASS_DEF_(Name) mxnet::ext::CustomPass MX_REGISTER_PASS_NAME_(Name) /*! 
\brief assign a var to a value */ -#define REGISTER_OP(Name) MX_STR_CONCAT(MX_REGISTER_DEF_(Name), __COUNTER__) = \ - mxnet::ext::Registry::get()->add(MX_TOSTRING(Name)) +#define REGISTER_OP(Name) \ + MX_STR_CONCAT(MX_REGISTER_DEF_(Name), __COUNTER__) = \ + mxnet::ext::Registry::get()->add(MX_TOSTRING(Name)) -#define REGISTER_PARTITIONER(Name) \ +#define REGISTER_PARTITIONER(Name) \ MX_STR_CONCAT(MX_REGISTER_PROP_DEF_(Name), __COUNTER__) = \ - mxnet::ext::Registry::get()->add(MX_TOSTRING(Name)) + mxnet::ext::Registry::get()->add(MX_TOSTRING(Name)) -#define REGISTER_PASS(Name) \ +#define REGISTER_PASS(Name) \ MX_STR_CONCAT(MX_REGISTER_PASS_DEF_(Name), __COUNTER__) = \ - mxnet::ext::Registry::get()->add(MX_TOSTRING(Name)) + mxnet::ext::Registry::get()->add(MX_TOSTRING(Name)) /* -------------- BELOW ARE CTYPE FUNCTIONS PROTOTYPES --------------- */ @@ -950,94 +980,172 @@ class Registry { typedef int (*opRegSize_t)(void); #define MXLIB_OPREGGET_STR "_opRegGet" -typedef int (*opRegGet_t)(int idx, const char** name, int *isSGop, - const char*** forward_ctx, mxnet::ext::fcomp_t** forward_fp, - int* forward_count, const char*** backward_ctx, - mxnet::ext::fcomp_t** backward_fp, int* backward_count, - const char*** create_op_ctx, mxnet::ext::createOpState_t** create_op_fp, - int* create_op_count, mxnet::ext::parseAttrs_t* parse, - mxnet::ext::inferType_t* type, mxnet::ext::inferSType_t* stype, - mxnet::ext::inferShape_t* shape, mxnet::ext::mutateInputs_t* mutate); +typedef int (*opRegGet_t)(int idx, + const char** name, + int* isSGop, + const char*** forward_ctx, + mxnet::ext::fcomp_t** forward_fp, + int* forward_count, + const char*** backward_ctx, + mxnet::ext::fcomp_t** backward_fp, + int* backward_count, + const char*** create_op_ctx, + mxnet::ext::createOpState_t** create_op_fp, + int* create_op_count, + mxnet::ext::parseAttrs_t* parse, + mxnet::ext::inferType_t* type, + mxnet::ext::inferSType_t* stype, + mxnet::ext::inferShape_t* shape, + mxnet::ext::mutateInputs_t* mutate); #define MXLIB_OPCALLFREE_STR "_opCallFree" typedef int (*opCallFree_t)(void* ptr); #define MXLIB_OPCALLPARSEATTRS_STR "_opCallParseAttrs" -typedef int (*opCallParseAttrs_t)(parseAttrs_t parseAttrs, const char* const* keys, - const char* const* vals, int num, - int* num_in, int* num_out); +typedef int (*opCallParseAttrs_t)(parseAttrs_t parseAttrs, + const char* const* keys, + const char* const* vals, + int num, + int* num_in, + int* num_out); #define MXLIB_OPCALLINFERSHAPE_STR "_opCallInferShape" -typedef int (*opCallInferShape_t)(inferShape_t inferShape, const char* const* keys, - const char* const* vals, int num, - unsigned int** inshapes, int* indims, int num_in, - unsigned int*** mod_inshapes, int** mod_indims, - unsigned int*** outshapes, int** outdims, int num_out); +typedef int (*opCallInferShape_t)(inferShape_t inferShape, + const char* const* keys, + const char* const* vals, + int num, + unsigned int** inshapes, + int* indims, + int num_in, + unsigned int*** mod_inshapes, + int** mod_indims, + unsigned int*** outshapes, + int** outdims, + int num_out); #define MXLIB_OPCALLINFERTYPE_STR "_opCallInferType" -typedef int (*opCallInferType_t)(inferType_t inferType, const char* const* keys, - const char* const* vals, int num, - int* intypes, int num_in, int* outtypes, int num_out); +typedef int (*opCallInferType_t)(inferType_t inferType, + const char* const* keys, + const char* const* vals, + int num, + int* intypes, + int num_in, + int* outtypes, + int num_out); #define MXLIB_OPCALLINFERSTYPE_STR "_opCallInferSType" 
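
To make the C hooks below concrete, here is a minimal custom-operator sketch (not part of the patch): a forward function matching the fcomp_t signature above, registered through the REGISTER_OP macro. The element-wise doubling is invented for illustration, and the CustomOp setter methods are assumed from the upstream lib_api.h:

    using namespace mxnet::ext;

    MXReturnValue myForward(const std::unordered_map<std::string, std::string>& attrs,
                            std::vector<MXTensor>* inputs,
                            std::vector<MXTensor>* outputs,
                            const OpResource& res) {
      float* in  = inputs->at(0).data<float>();
      float* out = outputs->at(0).data<float>();
      for (int64_t i = 0; i < inputs->at(0).size(); ++i)
        out[i] = 2 * in[i];  // hypothetical computation
      return MX_SUCCESS;
    }

    REGISTER_OP(my_double).setForward(myForward, "cpu");

MXNet never links against such a library directly: it resolves the _op*/_part*/_pass* entry points by their string names at runtime (dlopen/dlsym on POSIX, LoadLibrary on Windows), which is why each hook below pairs a function-pointer typedef with a *_STR symbol-name constant.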
-typedef int (*opCallInferSType_t)(inferSType_t inferSType, const char* const* keys, - const char* const* vals, int num, - int* intypes, int num_in, int* outtypes, int num_out); +typedef int (*opCallInferSType_t)(inferSType_t inferSType, + const char* const* keys, + const char* const* vals, + int num, + int* intypes, + int num_in, + int* outtypes, + int num_out); #define MXLIB_OPCALLFCOMP_STR "_opCallFCompute" -typedef int (*opCallFComp_t)(fcomp_t fcomp, const char* const* keys, - const char* const* vals, int num, - const int64_t** inshapes, int* indims, - void** indata, int* intypes, - size_t* inIDs, const char** indev_type, - int* indev_id, int num_in, - const int64_t** outshapes, int* outdims, - void** outdata, int* outtypes, - size_t* outIDs, const char** outdev_type, - int* outdev_id, int num_out, - xpu_malloc_t cpu_malloc, void* cpu_alloc, - xpu_malloc_t gpu_malloc, void* gpu_alloc, void* cuda_stream, - sparse_malloc_t sparse_malloc, void* sparse_alloc, - int* instypes, int* outstypes, - void** in_indices, void** out_indices, - void** in_indptr, void** out_indptr, - int64_t* in_indices_shapes, int64_t* out_indices_shapes, - int64_t* in_indptr_shapes, int64_t* out_indptr_shapes, - void* rng_cpu_states, void* rng_gpu_states); +typedef int (*opCallFComp_t)(fcomp_t fcomp, + const char* const* keys, + const char* const* vals, + int num, + const int64_t** inshapes, + int* indims, + void** indata, + int* intypes, + size_t* inIDs, + const char** indev_type, + int* indev_id, + int num_in, + const int64_t** outshapes, + int* outdims, + void** outdata, + int* outtypes, + size_t* outIDs, + const char** outdev_type, + int* outdev_id, + int num_out, + xpu_malloc_t cpu_malloc, + void* cpu_alloc, + xpu_malloc_t gpu_malloc, + void* gpu_alloc, + void* cuda_stream, + sparse_malloc_t sparse_malloc, + void* sparse_alloc, + int* instypes, + int* outstypes, + void** in_indices, + void** out_indices, + void** in_indptr, + void** out_indptr, + int64_t* in_indices_shapes, + int64_t* out_indices_shapes, + int64_t* in_indptr_shapes, + int64_t* out_indptr_shapes, + void* rng_cpu_states, + void* rng_gpu_states); #define MXLIB_OPCALLMUTATEINPUTS_STR "_opCallMutateInputs" -typedef int (*opCallMutateInputs_t)(mutateInputs_t mutate, const char* const* keys, - const char* const* vals, int num, - int** mutate_indices, int* indices_size); +typedef int (*opCallMutateInputs_t)(mutateInputs_t mutate, + const char* const* keys, + const char* const* vals, + int num, + int** mutate_indices, + int* indices_size); #define MXLIB_OPCALLCREATEOPSTATE_STR "_opCallCreateOpState" -typedef int (*opCallCreateOpState_t)(createOpState_t create_op, const char* const* keys, - const char* const* vals, int num, const char* dev_type, - int dev_id, unsigned int** inshapes, int* indims, - int num_in, const int* intypes, void** state_op); +typedef int (*opCallCreateOpState_t)(createOpState_t create_op, + const char* const* keys, + const char* const* vals, + int num, + const char* dev_type, + int dev_id, + unsigned int** inshapes, + int* indims, + int num_in, + const int* intypes, + void** state_op); #define MXLIB_OPCALLDESTROYOPSTATE_STR "_opCallDestroyOpState" typedef int (*opCallDestroyOpState_t)(void* state_op); #define MXLIB_OPCALLFSTATEFULCOMP_STR "_opCallFStatefulCompute" -typedef int (*opCallFStatefulComp_t)(int is_forward, void* state_op, - const int64_t** inshapes, int* indims, - void** indata, int* intypes, - size_t* inIDs, const char** indev_type, - int* indev_id, int num_in, - const int64_t** outshapes, int* outdims, - void** 
outdata, int* outtypes, - size_t* outIDs, const char** outdev_type, - int* outdev_id, int num_out, - xpu_malloc_t cpu_malloc, void* cpu_alloc, - xpu_malloc_t gpu_malloc, void* gpu_alloc, void* stream, - sparse_malloc_t sparse_malloc, void* sparse_alloc, - int* instypes, int* outstypes, - void** in_indices, void** out_indices, - void** in_indptr, void** out_indptr, - int64_t* in_indices_shapes, int64_t* out_indices_shapes, - int64_t* in_indptr_shapes, int64_t* out_indptr_shapes, - void* rng_cpu_states, void* rng_gpu_states); +typedef int (*opCallFStatefulComp_t)(int is_forward, + void* state_op, + const int64_t** inshapes, + int* indims, + void** indata, + int* intypes, + size_t* inIDs, + const char** indev_type, + int* indev_id, + int num_in, + const int64_t** outshapes, + int* outdims, + void** outdata, + int* outtypes, + size_t* outIDs, + const char** outdev_type, + int* outdev_id, + int num_out, + xpu_malloc_t cpu_malloc, + void* cpu_alloc, + xpu_malloc_t gpu_malloc, + void* gpu_alloc, + void* stream, + sparse_malloc_t sparse_malloc, + void* sparse_alloc, + int* instypes, + int* outstypes, + void** in_indices, + void** out_indices, + void** in_indptr, + void** out_indptr, + int64_t* in_indices_shapes, + int64_t* out_indices_shapes, + int64_t* in_indptr_shapes, + int64_t* out_indptr_shapes, + void* rng_cpu_states, + void* rng_gpu_states); #define MXLIB_PARTREGSIZE_STR "_partRegSize" typedef int (*partRegSize_t)(void); @@ -1046,52 +1154,81 @@ typedef int (*partRegSize_t)(void); typedef int (*partRegGetCount_t)(int idx, const char** name); #define MXLIB_PARTREGGET_STR "_partRegGet" -typedef void (*partRegGet_t)(int part_idx, int stg_idx, const char** strategy, - supportedOps_t* supportedOps, createSelector_t* createSelector, - reviewSubgraph_t* reviewSubgraph, const char** op_name); +typedef void (*partRegGet_t)(int part_idx, + int stg_idx, + const char** strategy, + supportedOps_t* supportedOps, + createSelector_t* createSelector, + reviewSubgraph_t* reviewSubgraph, + const char** op_name); #define MXLIB_PARTCALLSUPPORTEDOPS_STR "_partCallSupportedOps" -typedef int (*partCallSupportedOps_t)(supportedOps_t supportedOps, const char *json, - int num_ids, int *ids, const char* const* opt_keys, - const char* const* opt_vals, int num_opts); +typedef int (*partCallSupportedOps_t)(supportedOps_t supportedOps, + const char* json, + int num_ids, + int* ids, + const char* const* opt_keys, + const char* const* opt_vals, + int num_opts); #define MXLIB_PARTCALLCREATESELECTOR_STR "_partCallCreateSelector" -typedef int (*partCallCreateSelector_t)(createSelector_t createSelector, const char *json, - void** selector, const char* const* opt_keys, - const char* const* opt_vals, int num_opts); +typedef int (*partCallCreateSelector_t)(createSelector_t createSelector, + const char* json, + void** selector, + const char* const* opt_keys, + const char* const* opt_vals, + int num_opts); #define MXLIB_PARTCALLSELECT_STR "_partCallSelect" typedef void (*partCallSelect_t)(void* sel_inst, int nodeID, int* selected); #define MXLIB_PARTCALLSELECTINPUT_STR "_partCallSelectInput" -typedef void (*partCallSelectInput_t)(void* sel_inst, int nodeID, int input_nodeID, - int* selected); +typedef void (*partCallSelectInput_t)(void* sel_inst, int nodeID, int input_nodeID, int* selected); #define MXLIB_PARTCALLSELECTOUTPUT_STR "_partCallSelectOutput" -typedef void (*partCallSelectOutput_t)(void* sel_inst, int nodeID, int output_nodeID, - int* selected); +typedef void (*partCallSelectOutput_t)(void* sel_inst, + int nodeID, + int 
output_nodeID, + int* selected); #define MXLIB_PARTCALLFILTER_STR "_partCallFilter" -typedef void (*partCallFilter_t)(void* sel_inst, int* candidates, int num_candidates, - int** keep, int* num_keep); +typedef void (*partCallFilter_t)(void* sel_inst, + int* candidates, + int num_candidates, + int** keep, + int* num_keep); #define MXLIB_PARTCALLRESET_STR "_partCallReset" typedef void (*partCallReset_t)(void* sel_inst); #define MXLIB_PARTCALLREVIEWSUBGRAPH_STR "_partCallReviewSubgraph" -typedef int (*partCallReviewSubgraph_t)(reviewSubgraph_t reviewSubgraph, const char *json, - int subgraph_id, int *accept, const char* const* opt_keys, - const char* const* opt_vals, int num_opts, - char*** attr_keys, char*** attr_vals, int *num_attrs, - const char* const* arg_names, int num_args, - void* const* arg_data, const int64_t* const* arg_shapes, - const int* arg_dims, const int* arg_types, - const size_t* arg_IDs, const char* const* arg_dev_type, +typedef int (*partCallReviewSubgraph_t)(reviewSubgraph_t reviewSubgraph, + const char* json, + int subgraph_id, + int* accept, + const char* const* opt_keys, + const char* const* opt_vals, + int num_opts, + char*** attr_keys, + char*** attr_vals, + int* num_attrs, + const char* const* arg_names, + int num_args, + void* const* arg_data, + const int64_t* const* arg_shapes, + const int* arg_dims, + const int* arg_types, + const size_t* arg_IDs, + const char* const* arg_dev_type, const int* arg_dev_id, - const char* const* aux_names, int num_aux, - void* const* aux_data, const int64_t* const* aux_shapes, - const int* aux_dims, const int* aux_types, - const size_t* aux_IDs, const char* const* aux_dev_type, + const char* const* aux_names, + int num_aux, + void* const* aux_data, + const int64_t* const* aux_shapes, + const int* aux_dims, + const int* aux_types, + const size_t* aux_IDs, + const char* const* aux_dev_type, const int* aux_dev_id); #define MXLIB_PASSREGSIZE_STR "_passRegSize" @@ -1101,19 +1238,32 @@ typedef int (*passRegSize_t)(void); typedef void (*passRegGet_t)(int pass_idx, graphPass_t* graphPass, const char** pass_name); #define MXLIB_PASSCALLGRAPHPASS_STR "_passCallGraphPass" -typedef int (*passCallGraphPass_t)(graphPass_t graphPass, const char *in_graph, - char** out_graph, const char* const* opt_keys, - const char* const* opt_vals, int num_opts, - const char* pass_name, const char* const* arg_names, - int num_args, void* const* arg_data, - const int64_t* const* arg_shapes, const int* arg_dims, - const int* arg_types, const size_t* arg_IDs, - const char* const* arg_dev_type, const int* arg_dev_id, - const char* const* aux_names, int num_aux, - void* const* aux_data, const int64_t* const* aux_shapes, - const int* aux_dims, const int* aux_types, - const size_t* aux_IDs, const char* const* aux_dev_type, - const int* aux_dev_id, nd_malloc_t nd_malloc, +typedef int (*passCallGraphPass_t)(graphPass_t graphPass, + const char* in_graph, + char** out_graph, + const char* const* opt_keys, + const char* const* opt_vals, + int num_opts, + const char* pass_name, + const char* const* arg_names, + int num_args, + void* const* arg_data, + const int64_t* const* arg_shapes, + const int* arg_dims, + const int* arg_types, + const size_t* arg_IDs, + const char* const* arg_dev_type, + const int* arg_dev_id, + const char* const* aux_names, + int num_aux, + void* const* aux_data, + const int64_t* const* aux_shapes, + const int* aux_dims, + const int* aux_types, + const size_t* aux_IDs, + const char* const* aux_dev_type, + const int* aux_dev_id, + nd_malloc_t nd_malloc, 
const void* nd_alloc); #define MXLIB_INITIALIZE_STR "initialize" @@ -1133,8 +1283,11 @@ class CustomStatefulOpWrapper { public: ~CustomStatefulOpWrapper(); explicit CustomStatefulOpWrapper(CustomStatefulOp* inst, opCallDestroyOpState_t destroy) - : instance(inst), destroy_(destroy) {} - CustomStatefulOp* get_instance() { return instance; } + : instance(inst), destroy_(destroy) {} + CustomStatefulOp* get_instance() { + return instance; + } + private: CustomStatefulOp* instance; opCallDestroyOpState_t destroy_; @@ -1152,194 +1305,315 @@ class CustomStatefulOpWrapper { } // namespace mxnet extern "C" { - /*! \brief returns MXNet library version */ - MX_INT_RET _opVersion(); - - /*! \brief returns number of ops registered in this library */ - MX_INT_RET _opRegSize(); - - /*! \brief returns operator registration at specified index */ - MX_VOID_RET _opRegGet(int idx, const char** name, int *isSGop, - const char*** forward_ctx, mxnet::ext::fcomp_t** forward_fp, - int* forward_count, const char*** backward_ctx, - mxnet::ext::fcomp_t** backward_fp, int* backward_count, - const char*** create_op_ctx, mxnet::ext::createOpState_t** create_op_fp, - int* create_op_count, mxnet::ext::parseAttrs_t* parse, - mxnet::ext::inferType_t* type, mxnet::ext::inferSType_t* stype, - mxnet::ext::inferShape_t* shape, mxnet::ext::mutateInputs_t* mutate); - - /*! \brief calls free from the external library for library allocated arrays */ - MX_VOID_RET _opCallFree(void* ptr); - - /*! \brief returns status of calling parse attributes function for operator from library */ - MX_INT_RET _opCallParseAttrs(mxnet::ext::parseAttrs_t parseAttrs, const char* const* keys, - const char* const* vals, int num, - int* num_in, int* num_out); - - /*! \brief returns status of calling inferShape function for operator from library */ - MX_INT_RET _opCallInferShape(mxnet::ext::inferShape_t inferShape, const char* const* keys, - const char* const* vals, int num, - unsigned int** inshapes, int* indims, int num_in, - unsigned int*** mod_inshapes, int** mod_indims, - unsigned int*** outshapes, int** outdims, int num_out); - - /*! \brief returns status of calling inferType function for operator from library */ - MX_INT_RET _opCallInferType(mxnet::ext::inferType_t inferType, const char* const* keys, - const char* const* vals, int num, - int* intypes, int num_in, int* outtypes, int num_out); - - /*! \brief returns status of calling inferSType function for operator from library */ - MX_INT_RET _opCallInferSType(mxnet::ext::inferSType_t inferSType, const char* const* keys, - const char* const* vals, int num, - int* instypes, int num_in, int* outstypes, int num_out); - - /*! \brief returns status of calling Forward/Backward function for operator from library */ - MX_INT_RET _opCallFCompute(mxnet::ext::fcomp_t fcomp, const char* const* keys, +/*! \brief returns MXNet library version */ +MX_INT_RET _opVersion(); + +/*! \brief returns number of ops registered in this library */ +MX_INT_RET _opRegSize(); + +/*! 
\brief returns operator registration at specified index */ +MX_VOID_RET _opRegGet(int idx, + const char** name, + int* isSGop, + const char*** forward_ctx, + mxnet::ext::fcomp_t** forward_fp, + int* forward_count, + const char*** backward_ctx, + mxnet::ext::fcomp_t** backward_fp, + int* backward_count, + const char*** create_op_ctx, + mxnet::ext::createOpState_t** create_op_fp, + int* create_op_count, + mxnet::ext::parseAttrs_t* parse, + mxnet::ext::inferType_t* type, + mxnet::ext::inferSType_t* stype, + mxnet::ext::inferShape_t* shape, + mxnet::ext::mutateInputs_t* mutate); + +/*! \brief calls free from the external library for library allocated arrays */ +MX_VOID_RET _opCallFree(void* ptr); + +/*! \brief returns status of calling parse attributes function for operator from library */ +MX_INT_RET _opCallParseAttrs(mxnet::ext::parseAttrs_t parseAttrs, + const char* const* keys, const char* const* vals, - int num, const int64_t** inshapes, int* indims, void** indata, - int* intypes, size_t* inIDs, const char** indev_type, int* indev_id, - int num_in, const int64_t** outshapes, int* outdims, void** outdata, - int* outtypes, size_t* outIDs, const char** outdev_type, - int* outdev_id, int num_out, mxnet::ext::xpu_malloc_t cpu_malloc, - void* cpu_alloc, - mxnet::ext::xpu_malloc_t gpu_malloc, void* gpu_alloc, - void* cuda_stream, - mxnet::ext::sparse_malloc_t sparse_malloc, void* sparse_alloc, - int* instypes, int* outstypes, void** in_indices, void** out_indices, - void** in_indptr, void** out_indptr, - int64_t* in_indices_shapes, int64_t* out_indices_shapes, - int64_t* in_indptr_shapes, int64_t* out_indptr_shapes, - void* rng_cpu_states, void* rng_gpu_states); - - /*! \brief returns status of calling mutateInputs function for operator from library */ - MX_INT_RET _opCallMutateInputs(mxnet::ext::mutateInputs_t mutate, const char* const* keys, - const char* const* vals, int num, - int** mutate_indices, int* indices_size); - - /*! \brief returns status of calling createStatefulOp function for operator from library */ - MX_INT_RET _opCallCreateOpState(mxnet::ext::createOpState_t create_op, const char* const* keys, - const char* const* vals, int num, const char* dev_type, - int dev_id, unsigned int** inshapes, int* indims, - int num_in, const int* intypes, void** state_op); - - /*! \brief returns status of deleting StatefulOp instance for operator from library */ - MX_VOID_RET _opCallDestroyOpState(void* state_op); - - /*! \brief returns status of calling Stateful Forward/Backward for operator from library */ - MX_INT_RET _opCallFStatefulCompute(int is_forward, void* state_op, const int64_t** inshapes, - int* indims, void** indata, int* intypes, size_t* inIDs, - const char** indev_type, int* indev_id, int num_in, - const int64_t** outshapes, int* outdims, void** outdata, - int* outtypes, size_t* outIDs, const char** outdev_type, - int* outdev_id, int num_out, - mxnet::ext::xpu_malloc_t cpu_malloc, - void* cpu_alloc, mxnet::ext::xpu_malloc_t gpu_malloc, - void* gpu_alloc, - void* stream, mxnet::ext::sparse_malloc_t sparse_malloc, - void* sparse_alloc, int* instypes, int* outstypes, - void** in_indices, void** out_indices, void** in_indptr, - void** out_indptr, int64_t* in_indices_shapes, - int64_t* out_indices_shapes, int64_t* in_indptr_shapes, - int64_t* out_indptr_shapes, - void* rng_cpu_states, void* rng_gpu_states); - - /*! 
\brief returns number of partitioners registered in this library */ - MX_INT_RET _partRegSize(); - - /* returns number of strategies registered for partitioner - * at specified index */ - MX_INT_RET _partRegGetCount(int idx, const char** name); - - /*! \brief returns partitioner registration at specified index */ - MX_VOID_RET _partRegGet(int part_idx, int stg_idx, const char** strategy, - mxnet::ext::supportedOps_t* supportedOps, - mxnet::ext::createSelector_t* createSelector, - mxnet::ext::reviewSubgraph_t* reviewSubgraph, const char** op_name); - - /*! \brief returns status of calling supported ops function from library */ - MX_INT_RET _partCallSupportedOps(mxnet::ext::supportedOps_t supportedOps, const char *json, - int num_ids, int *ids, const char* const* opt_keys, - const char* const* opt_vals, int num_opts); - - /*! \brief returns status of calling create selector function from library */ - MX_INT_RET _partCallCreateSelector(mxnet::ext::createSelector_t createSelector, const char *json, - void** selector, const char* const* opt_keys, - const char* const* opt_vals, int num_opts); - - /*! \brief returns status of calling select function from library */ - MX_VOID_RET _partCallSelect(void* sel_inst, int nodeID, int* selected); - - /*! \brief returns status of calling select input function from library */ - MX_VOID_RET _partCallSelectInput(void* sel_inst, int nodeID, - int input_nodeID, int* selected); - - /*! \brief returns status of calling select output function from library */ - MX_VOID_RET _partCallSelectOutput(void* sel_inst, int nodeID, - int output_nodeID, int* selected); - - /*! \brief returns status of calling filter function from library */ - MX_VOID_RET _partCallFilter(void* sel_inst, int* candidates, int num_candidates, - int** keep, int* num_keep); - - /*! \brief returns status of calling reset selector function from library */ - MX_VOID_RET _partCallReset(void* sel_inst); - - /*! \brief returns status of calling review subgraph function from library */ - MX_INT_RET _partCallReviewSubgraph(mxnet::ext::reviewSubgraph_t reviewSubgraph, const char *json, - int subgraph_id, int *accept, const char* const* opt_keys, - const char* const* opt_vals, int num_opts, - char*** attr_keys, char*** attr_vals, int *num_attrs, - const char* const* arg_names, int num_args, - void* const* arg_data, const int64_t* const* arg_shapes, - const int* arg_dims, const int* arg_types, - const size_t* arg_IDs, const char* const* arg_dev_type, - const int* arg_dev_id, - const char* const* aux_names, int num_aux, - void* const* aux_data, const int64_t* const* aux_shapes, - const int* aux_dims, const int* aux_types, - const size_t* aux_IDs, const char* const* aux_dev_type, - const int* aux_dev_id); - - /*! \brief returns number of graph passes registered in this library */ - MX_INT_RET _passRegSize(); - - /*! \brief returns pass registration at specified index */ - MX_VOID_RET _passRegGet(int pass_idx, mxnet::ext::graphPass_t* graphPass, - const char** pass_name); - - /*! 
\brief returns status of calling graph pass function from library */ - MX_INT_RET _passCallGraphPass(mxnet::ext::graphPass_t graphPass, const char *json, - char** out_graph, const char* const* opt_keys, - const char* const* opt_vals, int num_opts, - const char* pass_name, const char* const* arg_names, int num_args, - void* const* arg_data, const int64_t* const* arg_shapes, - const int* arg_dims, const int* arg_types, - const size_t* arg_IDs, const char* const* arg_dev_type, - const int* arg_dev_id, const char* const* aux_names, int num_aux, - void* const* aux_data, const int64_t* const* aux_shapes, - const int* aux_dims, const int* aux_types, - const size_t* aux_IDs, const char* const* aux_dev_type, - const int* aux_dev_id, mxnet::ext::nd_malloc_t nd_malloc, - const void* nd_alloc); + int num, + int* num_in, + int* num_out); - /*! - * \brief Checks if the MXNet version is supported by the library. - * If supported, initializes the library. - * \param version MXNet version number passed to library and defined as: - * MXNET_VERSION = (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH) - * \return Non-zero value on error i.e. library incompatible with passed MXNet version - */ +/*! \brief returns status of calling inferShape function for operator from library */ +MX_INT_RET _opCallInferShape(mxnet::ext::inferShape_t inferShape, + const char* const* keys, + const char* const* vals, + int num, + unsigned int** inshapes, + int* indims, + int num_in, + unsigned int*** mod_inshapes, + int** mod_indims, + unsigned int*** outshapes, + int** outdims, + int num_out); + +/*! \brief returns status of calling inferType function for operator from library */ +MX_INT_RET _opCallInferType(mxnet::ext::inferType_t inferType, + const char* const* keys, + const char* const* vals, + int num, + int* intypes, + int num_in, + int* outtypes, + int num_out); + +/*! \brief returns status of calling inferSType function for operator from library */ +MX_INT_RET _opCallInferSType(mxnet::ext::inferSType_t inferSType, + const char* const* keys, + const char* const* vals, + int num, + int* instypes, + int num_in, + int* outstypes, + int num_out); + +/*! \brief returns status of calling Forward/Backward function for operator from library */ +MX_INT_RET _opCallFCompute(mxnet::ext::fcomp_t fcomp, + const char* const* keys, + const char* const* vals, + int num, + const int64_t** inshapes, + int* indims, + void** indata, + int* intypes, + size_t* inIDs, + const char** indev_type, + int* indev_id, + int num_in, + const int64_t** outshapes, + int* outdims, + void** outdata, + int* outtypes, + size_t* outIDs, + const char** outdev_type, + int* outdev_id, + int num_out, + mxnet::ext::xpu_malloc_t cpu_malloc, + void* cpu_alloc, + mxnet::ext::xpu_malloc_t gpu_malloc, + void* gpu_alloc, + void* cuda_stream, + mxnet::ext::sparse_malloc_t sparse_malloc, + void* sparse_alloc, + int* instypes, + int* outstypes, + void** in_indices, + void** out_indices, + void** in_indptr, + void** out_indptr, + int64_t* in_indices_shapes, + int64_t* out_indices_shapes, + int64_t* in_indptr_shapes, + int64_t* out_indptr_shapes, + void* rng_cpu_states, + void* rng_gpu_states); + +/*! \brief returns status of calling mutateInputs function for operator from library */ +MX_INT_RET _opCallMutateInputs(mxnet::ext::mutateInputs_t mutate, + const char* const* keys, + const char* const* vals, + int num, + int** mutate_indices, + int* indices_size); + +/*! 
\brief returns status of calling createStatefulOp function for operator from library */ +MX_INT_RET _opCallCreateOpState(mxnet::ext::createOpState_t create_op, + const char* const* keys, + const char* const* vals, + int num, + const char* dev_type, + int dev_id, + unsigned int** inshapes, + int* indims, + int num_in, + const int* intypes, + void** state_op); + +/*! \brief returns status of deleting StatefulOp instance for operator from library */ +MX_VOID_RET _opCallDestroyOpState(void* state_op); + +/*! \brief returns status of calling Stateful Forward/Backward for operator from library */ +MX_INT_RET _opCallFStatefulCompute(int is_forward, + void* state_op, + const int64_t** inshapes, + int* indims, + void** indata, + int* intypes, + size_t* inIDs, + const char** indev_type, + int* indev_id, + int num_in, + const int64_t** outshapes, + int* outdims, + void** outdata, + int* outtypes, + size_t* outIDs, + const char** outdev_type, + int* outdev_id, + int num_out, + mxnet::ext::xpu_malloc_t cpu_malloc, + void* cpu_alloc, + mxnet::ext::xpu_malloc_t gpu_malloc, + void* gpu_alloc, + void* stream, + mxnet::ext::sparse_malloc_t sparse_malloc, + void* sparse_alloc, + int* instypes, + int* outstypes, + void** in_indices, + void** out_indices, + void** in_indptr, + void** out_indptr, + int64_t* in_indices_shapes, + int64_t* out_indices_shapes, + int64_t* in_indptr_shapes, + int64_t* out_indptr_shapes, + void* rng_cpu_states, + void* rng_gpu_states); + +/*! \brief returns number of partitioners registered in this library */ +MX_INT_RET _partRegSize(); + +/* returns number of strategies registered for partitioner + * at specified index */ +MX_INT_RET _partRegGetCount(int idx, const char** name); + +/*! \brief returns partitioner registration at specified index */ +MX_VOID_RET _partRegGet(int part_idx, + int stg_idx, + const char** strategy, + mxnet::ext::supportedOps_t* supportedOps, + mxnet::ext::createSelector_t* createSelector, + mxnet::ext::reviewSubgraph_t* reviewSubgraph, + const char** op_name); + +/*! \brief returns status of calling supported ops function from library */ +MX_INT_RET _partCallSupportedOps(mxnet::ext::supportedOps_t supportedOps, + const char* json, + int num_ids, + int* ids, + const char* const* opt_keys, + const char* const* opt_vals, + int num_opts); + +/*! \brief returns status of calling create selector function from library */ +MX_INT_RET _partCallCreateSelector(mxnet::ext::createSelector_t createSelector, + const char* json, + void** selector, + const char* const* opt_keys, + const char* const* opt_vals, + int num_opts); + +/*! \brief returns status of calling select function from library */ +MX_VOID_RET _partCallSelect(void* sel_inst, int nodeID, int* selected); + +/*! \brief returns status of calling select input function from library */ +MX_VOID_RET _partCallSelectInput(void* sel_inst, int nodeID, int input_nodeID, int* selected); + +/*! \brief returns status of calling select output function from library */ +MX_VOID_RET _partCallSelectOutput(void* sel_inst, int nodeID, int output_nodeID, int* selected); + +/*! \brief returns status of calling filter function from library */ +MX_VOID_RET _partCallFilter(void* sel_inst, + int* candidates, + int num_candidates, + int** keep, + int* num_keep); + +/*! \brief returns status of calling reset selector function from library */ +MX_VOID_RET _partCallReset(void* sel_inst); + +/*! 
\brief returns status of calling review subgraph function from library */ +MX_INT_RET _partCallReviewSubgraph(mxnet::ext::reviewSubgraph_t reviewSubgraph, + const char* json, + int subgraph_id, + int* accept, + const char* const* opt_keys, + const char* const* opt_vals, + int num_opts, + char*** attr_keys, + char*** attr_vals, + int* num_attrs, + const char* const* arg_names, + int num_args, + void* const* arg_data, + const int64_t* const* arg_shapes, + const int* arg_dims, + const int* arg_types, + const size_t* arg_IDs, + const char* const* arg_dev_type, + const int* arg_dev_id, + const char* const* aux_names, + int num_aux, + void* const* aux_data, + const int64_t* const* aux_shapes, + const int* aux_dims, + const int* aux_types, + const size_t* aux_IDs, + const char* const* aux_dev_type, + const int* aux_dev_id); + +/*! \brief returns number of graph passes registered in this library */ +MX_INT_RET _passRegSize(); + +/*! \brief returns pass registration at specified index */ +MX_VOID_RET _passRegGet(int pass_idx, mxnet::ext::graphPass_t* graphPass, const char** pass_name); + +/*! \brief returns status of calling graph pass function from library */ +MX_INT_RET _passCallGraphPass(mxnet::ext::graphPass_t graphPass, + const char* json, + char** out_graph, + const char* const* opt_keys, + const char* const* opt_vals, + int num_opts, + const char* pass_name, + const char* const* arg_names, + int num_args, + void* const* arg_data, + const int64_t* const* arg_shapes, + const int* arg_dims, + const int* arg_types, + const size_t* arg_IDs, + const char* const* arg_dev_type, + const int* arg_dev_id, + const char* const* aux_names, + int num_aux, + void* const* aux_data, + const int64_t* const* aux_shapes, + const int* aux_dims, + const int* aux_types, + const size_t* aux_IDs, + const char* const* aux_dev_type, + const int* aux_dev_id, + mxnet::ext::nd_malloc_t nd_malloc, + const void* nd_alloc); + +/*! + * \brief Checks if the MXNet version is supported by the library. + * If supported, initializes the library. + * \param version MXNet version number passed to library and defined as: + * MXNET_VERSION = (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH) + * \return Non-zero value on error i.e. library incompatible with passed MXNet version + */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) mxnet::ext::MXReturnValue __cdecl +__declspec(dllexport) mxnet::ext::MXReturnValue __cdecl #else - mxnet::ext::MXReturnValue +mxnet::ext::MXReturnValue #endif - initialize(int version); + initialize(int version); - MX_INT_RET _msgSize(); +MX_INT_RET _msgSize(); - /*! \brief returns operator registration at specified index */ - MX_VOID_RET _msgGet(int idx, const char** msg); +/*! \brief returns operator registration at specified index */ +MX_VOID_RET _msgGet(int idx, const char** msg); } // extern "C" #endif // MXNET_LIB_API_H_ diff --git a/include/mxnet/libinfo.h b/include/mxnet/libinfo.h index 66511421da02..d7ef85b1efb5 100644 --- a/include/mxnet/libinfo.h +++ b/include/mxnet/libinfo.h @@ -74,14 +74,12 @@ #endif /*! 
\brief Error message for using gpu when MXNET_USE_CUDA==0 */ -#define MXNET_GPU_NOT_ENABLED_ERROR "GPU is not enabled" - +#define MXNET_GPU_NOT_ENABLED_ERROR "GPU is not enabled" #ifndef MXNET_USE_TENSORRT #define MXNET_USE_TENSORRT 0 #endif - #ifndef MXNET_USE_BLAS_ATLAS #define MXNET_USE_BLAS_ATLAS 0 #endif @@ -154,7 +152,6 @@ enum : unsigned { CPU_AVX, CPU_AVX2, - // Multiprocessing / CPU / System OPENMP, SSE, @@ -192,7 +189,6 @@ enum : unsigned { MAX_FEATURES }; - struct EnumNames { static const std::vector names; }; @@ -203,9 +199,10 @@ struct LibInfo { const std::array& getFeatures() { return m_lib_features; } + private: std::array m_lib_features; - static std::unique_ptr m_inst; + static std::unique_ptr m_inst; }; /*! diff --git a/include/mxnet/node/container.h b/include/mxnet/node/container.h index e164f64a9184..12c527cf2e37 100644 --- a/include/mxnet/node/container.h +++ b/include/mxnet/node/container.h @@ -50,14 +50,13 @@ class ArrayNode : public Object { * \tparam Converter a struct that contains converting function * \tparam TIter the content iterator type. */ -template +template class IterAdapter { public: - using difference_type = typename std::iterator_traits::difference_type; - using value_type = typename Converter::ResultType; - using pointer = typename Converter::ResultType*; - using reference = typename Converter::ResultType&; // NOLINT(*) + using difference_type = typename std::iterator_traits::difference_type; + using value_type = typename Converter::ResultType; + using pointer = typename Converter::ResultType*; + using reference = typename Converter::ResultType&; // NOLINT(*) using iterator_category = typename std::iterator_traits::iterator_category; explicit IterAdapter(TIter iter) : iter_(iter) {} @@ -69,10 +68,10 @@ class IterAdapter { return IterAdapter(iter_ + offset); } - template + template typename std::enable_if::value, - typename T::difference_type>::type - inline operator-(const IterAdapter& rhs) const { + typename T::difference_type>::type inline + operator-(const IterAdapter& rhs) const { return iter_ - rhs.iter_; } @@ -98,8 +97,8 @@ class IterAdapter { * operator[] only provide const acces, use Set to mutate the content. * \tparam T The content NodeRef type. */ -template::value>::type > +template ::value>::type> class Array : public ObjectRef { public: /*! @@ -112,14 +111,14 @@ class Array : public ObjectRef { * \brief move constructor * \param other source */ - Array(Array && other) { // NOLINT(*) + Array(Array&& other) { // NOLINT(*) data_ = std::move(other.data_); } /*! * \brief copy constructor * \param other source */ - Array(const Array &other) { // NOLINT(*) + Array(const Array& other) { // NOLINT(*) data_ = std::move(other.data_); } /*! @@ -133,7 +132,7 @@ class Array : public ObjectRef { * \param end end of iterator * \tparam IterType The type of iterator */ - template + template Array(IterType begin, IterType end) { assign(begin, end); } @@ -141,14 +140,14 @@ class Array : public ObjectRef { * \brief constructor from initializer list * \param init The initalizer list */ - Array(std::initializer_list init) { // NOLINT(*) + Array(std::initializer_list init) { // NOLINT(*) assign(init.begin(), init.end()); } /*! * \brief constructor from vector * \param init The vector */ - Array(const std::vector& init) { // NOLINT(*) + Array(const std::vector& init) { // NOLINT(*) assign(init.begin(), init.end()); } /*! @@ -168,7 +167,7 @@ class Array : public ObjectRef { * \param other The source of assignment * \return reference to self. 
*/ - Array& operator=(Array && other) { + Array& operator=(Array&& other) { data_ = std::move(other.data_); return *this; } @@ -177,7 +176,7 @@ class Array : public ObjectRef { * \param other The source of assignment * \return reference to self. */ - Array& operator=(const Array & other) { + Array& operator=(const Array& other) { data_ = other.data_; return *this; } @@ -187,7 +186,7 @@ class Array : public ObjectRef { * \param end end of iterator * \tparam IterType The type of iterator */ - template + template void assign(IterType begin, IterType end) { auto n = make_object(); for (IterType it = begin; it != end; ++it) { @@ -201,12 +200,12 @@ class Array : public ObjectRef { * \return the i-th element. */ inline const T operator[](size_t i) const { - return DowncastNoCheck( - static_cast(data_.get())->data[i]); + return DowncastNoCheck(static_cast(data_.get())->data[i]); } /*! \return The size of the array */ inline size_t size() const { - if (data_.get() == nullptr) return 0; + if (data_.get() == nullptr) + return 0; return static_cast(data_.get())->data.size(); } /*! @@ -218,9 +217,9 @@ class Array : public ObjectRef { * \return Handle to the internal node container(which ganrantees to be unique) */ inline ArrayNode* CopyOnWrite() { - if (data_.get() == nullptr || !data_.unique()) { + if (data_.get() == nullptr || !data_.unique()) { runtime::ObjectPtr n = make_object(); - n->data = static_cast(data_.get())->data; + n->data = static_cast(data_.get())->data; runtime::ObjectPtr(std::move(n)).swap(data_); } return static_cast(data_.get()); @@ -248,7 +247,7 @@ class Array : public ObjectRef { */ inline void Set(size_t i, const T& value) { ArrayNode* n = this->CopyOnWrite(); - n->data[i] = value; + n->data[i] = value; } /*! \return whether array is empty */ inline bool empty() const { @@ -260,10 +259,11 @@ class Array : public ObjectRef { * \tparam F the type of the mutation function. * \note This function performs copy on write optimization. */ - template + template inline void MutateByApply(F fmutate) { ArrayNode* ptr = static_cast(data_.get()); - if (ptr == nullptr) return; + if (ptr == nullptr) + return; if (data_.unique()) { // Copy on write optimization. // Perform inplace update because this is an unique copy. @@ -271,8 +271,8 @@ class Array : public ObjectRef { // It is important to use move here // to make prevent the element's ref count from increasing // so fmutate itself can perform copy-on-write optimization - T old_elem = DowncastNoCheck(std::move(ptr->data[i])); - T new_elem = fmutate(std::move(old_elem)); + T old_elem = DowncastNoCheck(std::move(ptr->data[i])); + T new_elem = fmutate(std::move(old_elem)); ptr->data[i] = std::move(new_elem); } } else { @@ -305,12 +305,10 @@ class Array : public ObjectRef { return DowncastNoCheck(n); } }; - using iterator = IterAdapter::const_iterator>; + using iterator = IterAdapter::const_iterator>; - using reverse_iterator = IterAdapter< - ValueConverter, - std::vector::const_reverse_iterator>; + using reverse_iterator = + IterAdapter::const_reverse_iterator>; /*! 
\return begin iterator */ inline iterator begin() const { diff --git a/include/mxnet/node/node.h b/include/mxnet/node/node.h index 76bf0e67fad0..18a2a35ead22 100644 --- a/include/mxnet/node/node.h +++ b/include/mxnet/node/node.h @@ -46,17 +46,17 @@ namespace mxnet { -using runtime::TypeIndex; using runtime::Object; +using runtime::TypeIndex; // We strictly restrict ObjectPtr to ::mxnet::runtime // as it may conflict with ::nnvm::ObjectPtr // using runtime::ObjectPtr; -using runtime::ObjectRef; -using runtime::GetRef; using runtime::Downcast; -using runtime::ObjectHash; -using runtime::ObjectEqual; +using runtime::GetRef; using runtime::make_object; +using runtime::ObjectEqual; +using runtime::ObjectHash; +using runtime::ObjectRef; } // namespace mxnet diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h index 4e43d87a87c8..2fec1768ea86 100644 --- a/include/mxnet/op_attr_types.h +++ b/include/mxnet/op_attr_types.h @@ -79,7 +79,7 @@ struct OpContext { * \return the mshadow stream * \tparam xpu the device type of the stream */ - template + template inline mshadow::Stream* get_stream() const { return run_ctx.get_stream(); } @@ -150,18 +150,16 @@ class OpStatePtr { /* \brief Create a OpStatePtr with state of type T. * \param args Arguments passed to T's constructor. */ - template + template static OpStatePtr Create(Args&&... args) { OpStatePtr ret; auto state = new T(std::forward(args)...); - auto var = Engine::Get()->NewVariable(); - ret.ptr_.reset( - new OpState(var, state), - [](OpState* p) { - Engine::Get()->DeleteVariable([](RunContext s) {}, Context::CPU(), p->var); - delete reinterpret_cast(p->state); - delete p; - }); + auto var = Engine::Get()->NewVariable(); + ret.ptr_.reset(new OpState(var, state), [](OpState* p) { + Engine::Get()->DeleteVariable([](RunContext s) {}, Context::CPU(), p->var); + delete reinterpret_cast(p->state); + delete p; + }); return ret; } @@ -170,7 +168,7 @@ class OpStatePtr { return ptr_->var; } /* \brief Get state of type T */ - template + template T& get_state() const { return *reinterpret_cast(ptr_->state); } @@ -214,10 +212,10 @@ class OpStatePtr { * * \note Register under "FCreateLayerOp" */ -using FCreateOpState = std::function& in_type)>; +using FCreateOpState = std::function& in_type)>; /*! * \brief Whether the operator always produces the same @@ -232,7 +230,7 @@ using THasDeterministicOutput = bool; /*! * \brief Execution mode of this operator. */ -using FExecType = std::function; +using FExecType = std::function; /*! * \brief Resiger a compute function for stateful operator. * OpStatePtr is a pointer type, it's content is mutable even if @@ -240,11 +238,11 @@ using FExecType = std::function; * * \note Register under "FStatefulCompute" and "FStatefulCompute" */ -using FStatefulCompute = std::function& inputs, - const std::vector& req, - const std::vector& outputs)>; +using FStatefulCompute = std::function& inputs, + const std::vector& req, + const std::vector& outputs)>; /*! * \brief Resiger a compute function for stateful operator using NDArray interface. * OpStatePtr is a pointer type, it's content is mutable even if @@ -252,19 +250,18 @@ using FStatefulCompute = std::function" and "FStatefulComputeEx" */ -using FStatefulComputeEx = std::function& inputs, - const std::vector& req, - const std::vector& outputs)>; +using FStatefulComputeEx = std::function& inputs, + const std::vector& req, + const std::vector& outputs)>; /*! * \brief The resource request from the operator. 
* An operator could register ResourceRequestEx, or ResourceRequest, or neither. * * \note Register under "FResourceRequest" */ -using FResourceRequest = std::function< - std::vector (const NodeAttrs& n)>; +using FResourceRequest = std::function(const NodeAttrs& n)>; /*! * \brief The resource request from the operator. * An operator could register ResourceRequestEx, or ResourceRequest, or neither. @@ -273,38 +270,38 @@ using FResourceRequest = std::function< * * \note Register under "FResourceRequestEx" */ -using FResourceRequestEx = std::function< - std::vector (const NodeAttrs& n, - const int dev_mask, - const DispatchMode dispatch_mode)>; +using FResourceRequestEx = + std::function(const NodeAttrs& n, + const int dev_mask, + const DispatchMode dispatch_mode)>; /*! * \brief Register an operator called as a NDArray function * * \note Register under "FNDArrayFunction" */ -using FNDArrayFunction = std::function& inputs, - std::vector* outputs)>; +using FNDArrayFunction = std::function& inputs, + std::vector* outputs)>; /*! * \brief Register a compute function for simple stateless forward only operator * * \note Register under "FCompute" and "FCompute" */ -using FCompute = std::function& inputs, - const std::vector& req, - const std::vector& outputs)>; +using FCompute = std::function& inputs, + const std::vector& req, + const std::vector& outputs)>; /*! * \brief Register an NDArray compute function for simple stateless forward only operator * \note Register under "FComputeEx" and "FComputeEx" * Dispatched only when inferred dispatch_mode is FDispatchComputeEx */ -using FComputeEx = std::function& inputs, - const std::vector& req, - const std::vector& outputs)>; +using FComputeEx = std::function& inputs, + const std::vector& req, + const std::vector& outputs)>; /*! * \brief Register a storage and dispatch mode inference function based on @@ -312,23 +309,23 @@ using FComputeEx = std::function* in_attrs, - std::vector* out_attrs)>; +using FInferStorageType = std::function* in_attrs, + std::vector* out_attrs)>; /*! * \brief Register a quantized node creation function based on the attrs of the node * \note Register under "FQuantizedOp" for non-quantized operators */ -using FQuantizable = std::function; +using FQuantizable = std::function; /*! * \brief Register a quantized node creation function based on the attrs of the node * \note Register under "FQuantizedOp" for non-quantized operators */ -using FQuantizedOp = std::function; +using FQuantizedOp = std::function; /*! * \brief Register a function to determine if the output of a quantized operator @@ -336,30 +333,29 @@ using FQuantizedOp = std::function; * taking int8 data types while accumulating in int32, e.g. quantized_conv. * \note Register under "FNeedRequantize" for non-quantized operators */ -using FNeedRequantize = std::function; +using FNeedRequantize = std::function; /*! * \brief Register a function to determine if the input of a quantized operator * needs to be quantized. This is usually used for the quantized operators * which can handle fp32 inputs directly. */ -using FAvoidQuantizeInput = std::function; +using FAvoidQuantizeInput = std::function< + bool(const NodeAttrs& attrs, const size_t index, const std::string quantize_granularity)>; /*! * \brief Register a function to determine if the input of a quantized operator * needs to be calibrated. This is usually used for the quantized operators * which need calibration on its input. 
*/ -using FNeedCalibrateInput = std::function (const NodeAttrs& attrs)>; +using FNeedCalibrateInput = std::function(const NodeAttrs& attrs)>; /*! * \brief Register a function to determine if the output of a quantized operator * needs to be calibrated. This is usually used for the quantized operators * which need calibration on its output. */ -using FNeedCalibrateOutput = std::function (const NodeAttrs& attrs)>; +using FNeedCalibrateOutput = std::function(const NodeAttrs& attrs)>; } // namespace mxnet diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index d813c74fa9b6..268460fd7c25 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -66,11 +66,11 @@ class Operator { * need, epecial case like Batch Norm requires. * \sa OpReqType, OpContext */ - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) = 0; + virtual void Forward(const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data, + const std::vector& aux_states) = 0; /*! * \brief Perform a Backward Operation, write gradient to the in_grad. * @@ -99,17 +99,18 @@ class Operator { * \param aux_states Auxiliary states of operator. Normally operator doesn't need * \sa OperatorProperty, OpReqType, OpContext */ - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { + virtual void Backward(const OpContext& ctx, + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data, + const std::vector& req, + const std::vector& in_grad, + const std::vector& aux_states) { LOG(FATAL) << "Backward is not implemented"; } /*! \return [Deprecated] execution type of the operator */ - virtual ExecType exec_type() const final { // NOLINT(*) exec_type has been moved to OperatorProperty + virtual ExecType exec_type() + const final { // NOLINT(*) exec_type has been moved to OperatorProperty return ExecType::kSync; } }; @@ -197,9 +198,9 @@ class OperatorProperty { * \return true if the shape inference is successful, false if there is not enough information. * \throws dmlc::Error if the known arg_shapes are inconsistent. */ - virtual bool InferShape(mxnet::ShapeVector *in_shape, - mxnet::ShapeVector *out_shape, - mxnet::ShapeVector *aux_shape) const = 0; + virtual bool InferShape(mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape, + mxnet::ShapeVector* aux_shape) const = 0; /*! * \brief infer the data types of outputs and unknown input arguments * \param in_type the type of input arguments of the operator @@ -217,25 +218,28 @@ class OperatorProperty { * \return true if the type inference is successful, false if there is not enough information. * \throws dmlc::Error if the known arg_types are inconsistent. 
*/ - virtual bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const { + virtual bool InferType(std::vector* in_type, + std::vector* out_type, + std::vector* aux_type) const { CHECK_LE(in_type->size(), this->ListArguments().size()); int n_in = this->ListArguments().size(); for (unsigned i = 0; i < in_type->size(); ++i) { - CHECK(in_type->at(i) == mshadow::default_type_flag || - in_type->at(i) == -1) << "Unsupported data type " << in_type->at(i); + CHECK(in_type->at(i) == mshadow::default_type_flag || in_type->at(i) == -1) + << "Unsupported data type " << in_type->at(i); } in_type->clear(); - for (int i = 0; i < n_in; ++i ) in_type->push_back(mshadow::default_type_flag); + for (int i = 0; i < n_in; ++i) + in_type->push_back(mshadow::default_type_flag); int n_out = this->ListOutputs().size(); out_type->clear(); - for (int i = 0; i < n_out; ++i ) out_type->push_back(mshadow::default_type_flag); + for (int i = 0; i < n_out; ++i) + out_type->push_back(mshadow::default_type_flag); int n_aux = this->ListAuxiliaryStates().size(); aux_type->clear(); - for (int i = 0; i < n_aux; ++i ) aux_type->push_back(mshadow::default_type_flag); + for (int i = 0; i < n_aux; ++i) + aux_type->push_back(mshadow::default_type_flag); return true; } /*! @@ -254,8 +258,9 @@ class OperatorProperty { * \param in_type dtype of the input ndarrays * \return the created operator */ - virtual Operator* CreateOperatorEx(Context ctx, mxnet::ShapeVector *in_shape, - std::vector *in_type) const { + virtual Operator* CreateOperatorEx(Context ctx, + mxnet::ShapeVector* in_shape, + std::vector* in_type) const { std::vector out_type, aux_type; mxnet::ShapeVector out_shape, aux_shape; out_type.resize(this->ListOutputs().size()); @@ -282,8 +287,7 @@ class OperatorProperty { * \param in_shape The input shape to the operator, corresponds to shapes of in_data. * \return Additional resource request */ - virtual std::vector ForwardResource( - const mxnet::ShapeVector &in_shape) const { + virtual std::vector ForwardResource(const mxnet::ShapeVector& in_shape) const { return std::vector(); } /*! @@ -293,8 +297,7 @@ class OperatorProperty { * \param in_shape The input shape to the operator, corresponds to shapes of in_data. * \return Additional resource request */ - virtual std::vector BackwardResource( - const mxnet::ShapeVector &in_shape) const { + virtual std::vector BackwardResource(const mxnet::ShapeVector& in_shape) const { return std::vector(); } /*! @@ -319,10 +322,9 @@ class OperatorProperty { * \return an integer vector indicating the input requirments * \sa BackwardInputs */ - virtual std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const { + virtual std::vector DeclareBackwardDependency(const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data) const { // By default requires to see all the things. // remember to override this function to get a better performance. std::vector ret = out_grad; @@ -352,8 +354,8 @@ class OperatorProperty { * indicating possible in place operations. */ virtual std::vector > ForwardInplaceOption( - const std::vector &in_data, - const std::vector &out_data) const { + const std::vector& in_data, + const std::vector& out_data) const { return std::vector >(); } /*! @@ -383,10 +385,10 @@ class OperatorProperty { * indicating possible in place operations. 
*/ virtual std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const { + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data, + const std::vector& in_grad) const { return std::vector >(); } /*! @@ -401,10 +403,10 @@ class OperatorProperty { * \return vector of inputs the Backward Operation depends on. * \sa DeclareBackwardDependency */ - template - inline std::vector BackwardInputs(const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const { + template + inline std::vector BackwardInputs(const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data) const { int counter = 0; std::vector out_grad_index(out_grad.size()); std::vector in_data_index(in_data.size()); @@ -423,8 +425,8 @@ class OperatorProperty { all_data.insert(all_data.end(), in_data.begin(), in_data.end()); all_data.insert(all_data.end(), out_data.begin(), out_data.end()); - std::vector ret_index = this->DeclareBackwardDependency( - out_grad_index, in_data_index, out_data_index); + std::vector ret_index = + this->DeclareBackwardDependency(out_grad_index, in_data_index, out_data_index); std::vector ret(ret_index.size()); for (size_t i = 0; i < ret_index.size(); ++i) { @@ -437,7 +439,7 @@ class OperatorProperty { * \param type_name the type string of the OperatorProperty * \return a new constructed OperatorProperty */ - static OperatorProperty *Create(const char* type_name); + static OperatorProperty* Create(const char* type_name); /*! \return execution type of the operator */ virtual ExecType exec_type() const { return ExecType::kSync; @@ -445,13 +447,12 @@ class OperatorProperty { }; /*! \brief typedef the factory function of operator property */ -typedef std::function OperatorPropertyFactory; +typedef std::function OperatorPropertyFactory; /*! * \brief Registry entry for OperatorProperty factory functions. */ struct OperatorPropertyReg - : public dmlc::FunctionRegEntryBase { + : public dmlc::FunctionRegEntryBase { /*! 
* \brief Set key_var_num_args * When this is set, the API caller is required to pass in a @@ -464,7 +465,7 @@ struct OperatorPropertyReg * * \param key the key name to be set */ - inline OperatorPropertyReg& set_key_var_num_args(const std::string &key) { // NOLINT(*) + inline OperatorPropertyReg& set_key_var_num_args(const std::string& key) { // NOLINT(*) this->key_var_num_args = key; return *this; } @@ -472,12 +473,12 @@ struct OperatorPropertyReg * \brief Check if TypeString of the type matches the registered name */ inline OperatorPropertyReg& check_name() { - OperatorProperty *p = this->body(); - std::string type = p->TypeString(); + OperatorProperty* p = this->body(); + std::string type = p->TypeString(); delete p; - CHECK_EQ(this->name, type) - << "Register Name and TypeString mismatch, name=\"" << this->name << "\"," - << " but TypeString=\"" << type <<"\""; + CHECK_EQ(this->name, type) << "Register Name and TypeString mismatch, name=\"" << this->name + << "\"," + << " but TypeString=\"" << type << "\""; return *this; } @@ -499,11 +500,11 @@ struct OperatorPropertyReg * * \endcode */ -#define MXNET_REGISTER_OP_PROPERTY(name, OperatorPropertyType) \ +#define MXNET_REGISTER_OP_PROPERTY(name, OperatorPropertyType) \ DMLC_REGISTRY_REGISTER(::mxnet::OperatorPropertyReg, OperatorPropertyReg, name) \ - .set_body([]() { return new OperatorPropertyType(); }) \ - .set_return_type("NDArray-or-Symbol") \ - .check_name() + .set_body([]() { return new OperatorPropertyType(); }) \ + .set_return_type("NDArray-or-Symbol") \ + .check_name() #endif // DMLC_USE_CXX11 } // namespace mxnet diff --git a/include/mxnet/operator_util.h b/include/mxnet/operator_util.h index 9f1ddc4570c3..c5c274ebede9 100644 --- a/include/mxnet/operator_util.h +++ b/include/mxnet/operator_util.h @@ -30,7 +30,7 @@ #define MXNET_OPERATOR_UTIL_H_ #ifdef _MSC_VER -#pragma warning(disable:4503) // disable warning: decorated name length exceeded. +#pragma warning(disable : 4503) // disable warning: decorated name length exceeded. #endif #include @@ -86,10 +86,7 @@ struct EnvArguments { * \param req The requirement to stroe the ret. * \param ctx Runtime context to execute the function. */ -typedef void (*SourceFunction)(const EnvArguments& env, - TBlob* ret, - OpReqType req, - RunContext ctx); +typedef void (*SourceFunction)(const EnvArguments& env, TBlob* ret, OpReqType req, RunContext ctx); /*! * \brief Shape inference function to get the correct shape. @@ -118,8 +115,7 @@ typedef void (*UnaryFunction)(const TBlob& src, * \param env The Environment arguments. * \return The inferred result shape. */ -typedef mxnet::TShape (*UnaryShapeFunction)(const mxnet::TShape& src, - const EnvArguments& env); +typedef mxnet::TShape (*UnaryShapeFunction)(const mxnet::TShape& src, const EnvArguments& env); /*! * \brief Gradient function that takes output value of function and computes gradient wrt to input. @@ -189,8 +185,8 @@ typedef void (*BinaryFunction)(const TBlob& lhs, * \return The inferred result shape. */ typedef mxnet::TShape (*BinaryShapeFunction)(const mxnet::TShape& lhs, - const mxnet::TShape& rhs, - const EnvArguments& env); + const mxnet::TShape& rhs, + const EnvArguments& env); /*! * \brief Gradient function that takes only output gradient and computes gradient wrt to input. * We support total gradient as a whole to make it easy to combine a few ops. @@ -246,16 +242,10 @@ enum SimpleOpInplaceOption { }; /*! 
\brief options in the registry to set symbolic registration */ -enum SimpleOpScalarOption { - kScalarBeforeArray, - kArrayBeforeScalar -}; +enum SimpleOpScalarOption { kScalarBeforeArray, kArrayBeforeScalar }; /*! \brief options in the registry to set symbolic registration */ -enum SimpleOpRegOption { - kNotRegisterSymbolic, - kRegisterSymbolic -}; +enum SimpleOpRegOption { kNotRegisterSymbolic, kRegisterSymbolic }; /*! \brief registry entry to register simple operators via functions. */ class SimpleOpRegEntry { @@ -278,9 +268,8 @@ class SimpleOpRegEntry { * \param enable_scalar whether to enable scalar argument * \param type_mask the position of the scalar argument. */ - virtual TSelf& set_enable_scalar( - bool enable_scalar, - SimpleOpScalarOption type_mask = kArrayBeforeScalar) = 0; + virtual TSelf& set_enable_scalar(bool enable_scalar, + SimpleOpScalarOption type_mask = kArrayBeforeScalar) = 0; /*! * \brief set whether to enable kwargs * A function cannot have both kwargs and scalar arguments. @@ -294,8 +283,7 @@ class SimpleOpRegEntry { * The resource will be presented in both forward and backward. * \param reqs the request. */ - virtual TSelf& set_resource_request( - const std::vector& reqs) = 0; + virtual TSelf& set_resource_request(const std::vector& reqs) = 0; /*! * \brief set resource request * By default there is no resource request. @@ -326,10 +314,9 @@ class SimpleOpRegEntry { * \param fsource The unary function that peforms the operation. * \param register_symbolic Whether register a symbolic operator as well. */ - virtual TSelf& set_function( - int dev_mask, - SourceFunction fsource, - SimpleOpRegOption register_symbolic = kRegisterSymbolic) = 0; + virtual TSelf& set_function(int dev_mask, + SourceFunction fsource, + SimpleOpRegOption register_symbolic = kRegisterSymbolic) = 0; /*! * \brief set function of the function to be funary * \param dev_mask The device mask of the function can act on. @@ -337,11 +324,10 @@ class SimpleOpRegEntry { * \param inplace_in_out Whether do inplace optimization on in and out. * \param register_symbolic Whether register a symbolic operator as well. */ - virtual TSelf& set_function( - int dev_mask, - UnaryFunction funary, - SimpleOpInplaceOption inplace_in_out, - SimpleOpRegOption register_symbolic = kRegisterSymbolic) = 0; + virtual TSelf& set_function(int dev_mask, + UnaryFunction funary, + SimpleOpInplaceOption inplace_in_out, + SimpleOpRegOption register_symbolic = kRegisterSymbolic) = 0; /*! * \brief set function of the function to be funary * \param dev_mask The device mask of the function can act on. @@ -349,11 +335,10 @@ class SimpleOpRegEntry { * \param inplace_lhs_out Whether do inplace optimization on lhs and out. * \param register_symbolic Whether register a symbolic operator as well. */ - virtual TSelf& set_function( - int dev_mask, - BinaryFunction fbinary, - SimpleOpInplaceOption inplace_lhs_out, - SimpleOpRegOption register_symbolic = kRegisterSymbolic) = 0; + virtual TSelf& set_function(int dev_mask, + BinaryFunction fbinary, + SimpleOpInplaceOption inplace_lhs_out, + SimpleOpRegOption register_symbolic = kRegisterSymbolic) = 0; /*! * \brief set gradient of the function of this function. * \param dev_mask The device mask of the function can act on. @@ -404,14 +389,14 @@ class SimpleOpRegEntry { * \param description The description of the function. * \return reference to self. */ - virtual TSelf& describe(const std::string &description) = 0; + virtual TSelf& describe(const std::string& description) = 0; /*! 
* \brief Describe the function. * \param args argument information. * Add additional arguments to the function. * \return reference to self. */ - virtual TSelf& add_arguments(const std::vector &args) = 0; + virtual TSelf& add_arguments(const std::vector& args) = 0; /*! \brief virtual destructor */ virtual ~SimpleOpRegEntry() {} }; @@ -424,13 +409,13 @@ class SimpleOpRegistry { * \param name name of the function * \return ref to the registered entry, used to set properties */ - SimpleOpRegEntry &__REGISTER_OR_FIND__(char const* name); + SimpleOpRegEntry& __REGISTER_OR_FIND__(char const* name); /*! * \brief Find the entry with corresponding name. * \param name name of the function * \return the corresponding function, can be nullptr */ - inline static const SimpleOpRegEntry *Find(const std::string &name) { + inline static const SimpleOpRegEntry* Find(const std::string& name) { return Get()->fmap_.at(name); } /*! \return global singleton of the registry */ @@ -451,29 +436,28 @@ class SimpleOpRegistry { * \tparam OType output type * \tparam Exp expression type */ -#define ASSIGN_DISPATCH(out, req, exp) \ - { \ - switch (req) { \ - case kNullOp: \ - break; \ - case kWriteTo: \ - case kWriteInplace: \ - (out) = (exp); \ - break; \ - case kAddTo: \ - (out) += (exp); \ - break; \ - default: \ - LOG(FATAL) << "not reached"; \ - } \ +#define ASSIGN_DISPATCH(out, req, exp) \ + { \ + switch (req) { \ + case kNullOp: \ + break; \ + case kWriteTo: \ + case kWriteInplace: \ + (out) = (exp); \ + break; \ + case kAddTo: \ + (out) += (exp); \ + break; \ + default: \ + LOG(FATAL) << "not reached"; \ + } \ } /*! -* \brief Maximum ndim supported for special operators like broadcasting with non contiguous lhs/rhs -*/ + * \brief Maximum ndim supported for special operators like broadcasting with non contiguous lhs/rhs + */ #define MXNET_SPECIAL_MAX_NDIM 5 - //-------------------------------------------------------------- // The following part are API Registration of Simple Operators //-------------------------------------------------------------- @@ -494,9 +478,8 @@ class SimpleOpRegistry { * * \endcode */ -#define MXNET_REGISTER_SIMPLE_OP(Name, DEV) \ - static ::mxnet::op::SimpleOpRegEntry & \ - __make_ ## SimpleOpRegEntry ## _ ## Name ## __ ## DEV ##__ = \ +#define MXNET_REGISTER_SIMPLE_OP(Name, DEV) \ + static ::mxnet::op::SimpleOpRegEntry& __make_##SimpleOpRegEntry##_##Name##__##DEV##__ = \ ::mxnet::op::SimpleOpRegistry::Get()->__REGISTER_OR_FIND__(#Name) } // namespace op diff --git a/include/mxnet/random_generator.h b/include/mxnet/random_generator.h index 8a717451c23b..4d6f8c70a1c1 100644 --- a/include/mxnet/random_generator.h +++ b/include/mxnet/random_generator.h @@ -37,10 +37,10 @@ namespace mxnet { namespace common { namespace random { -template +template class RandGenerator; -template +template class RandGenerator { public: // at least how many random numbers should be generated by one CPU thread. 
@@ -52,15 +52,17 @@ class RandGenerator { // TODO(alexzai): move impl class to separate file - tracked in MXNET-948 class Impl { public: - typedef typename std::conditional::value, - DType, double>::type FType; - explicit Impl(RandGenerator *gen, int state_idx) + typedef + typename std::conditional::value, DType, double>::type FType; + explicit Impl(RandGenerator* gen, int state_idx) : engine_(gen->states_ + state_idx) {} - Impl(const Impl &) = delete; - Impl &operator=(const Impl &) = delete; + Impl(const Impl&) = delete; + Impl& operator=(const Impl&) = delete; - MSHADOW_XINLINE int rand() { return engine_->operator()(); } + MSHADOW_XINLINE int rand() { + return engine_->operator()(); + } MSHADOW_XINLINE int64_t rand_int64() { return static_cast(engine_->operator()() << 31) + engine_->operator()(); @@ -68,8 +70,8 @@ class RandGenerator { MSHADOW_XINLINE FType uniform() { typedef typename std::conditional::value, - std::uniform_int_distribution, - std::uniform_real_distribution>::type GType; + std::uniform_int_distribution, + std::uniform_real_distribution>::type GType; GType dist_uniform; return dist_uniform(*engine_); } @@ -80,19 +82,20 @@ class RandGenerator { } private: - std::mt19937 *engine_; + std::mt19937* engine_; }; // class RandGenerator::Impl - static void AllocState(RandGenerator *inst) { + static void AllocState(RandGenerator* inst) { inst->states_ = new std::mt19937[kNumRandomStates]; } - static void FreeState(RandGenerator *inst) { + static void FreeState(RandGenerator* inst) { delete[] inst->states_; } - MSHADOW_XINLINE void Seed(mshadow::Stream *, uint32_t seed) { - for (int i = 0; i < kNumRandomStates; ++i) (states_ + i)->seed(seed + i); + MSHADOW_XINLINE void Seed(mshadow::Stream*, uint32_t seed) { + for (int i = 0; i < kNumRandomStates; ++i) + (states_ + i)->seed(seed + i); } // export global random states, used by c++ custom operator @@ -101,18 +104,18 @@ class RandGenerator { } private: - std::mt19937 *states_; + std::mt19937* states_; }; // class RandGenerator -template +template const int RandGenerator::kMinNumRandomPerThread = 64; -template +template const int RandGenerator::kNumRandomStates = 1024; #if MXNET_USE_CUDA -template +template class RandGenerator { public: // at least how many random numbers should be generated by one GPU thread. @@ -127,14 +130,12 @@ class RandGenerator { // TODO(alexzai): move impl class to separate file - tracked in MXNET-948 class Impl { public: - Impl &operator=(const Impl &) = delete; - Impl(const Impl &) = delete; + Impl& operator=(const Impl&) = delete; + Impl(const Impl&) = delete; // Copy state to local memory for efficiency. 
- __device__ explicit Impl(RandGenerator *gen, int state_idx) - : global_gen_(gen), - global_state_idx_(state_idx), - state_(*(gen->states_ + state_idx)) {} + __device__ explicit Impl(RandGenerator* gen, int state_idx) + : global_gen_(gen), global_state_idx_(state_idx), state_(*(gen->states_ + state_idx)) {} __device__ ~Impl() { // store the curand state back into global memory @@ -158,25 +159,25 @@ class RandGenerator { } private: - RandGenerator *global_gen_; + RandGenerator* global_gen_; int global_state_idx_; curandStatePhilox4_32_10_t state_; }; // class RandGenerator::Impl - static void AllocState(RandGenerator *inst); + static void AllocState(RandGenerator* inst); - static void FreeState(RandGenerator *inst); + static void FreeState(RandGenerator* inst); - void Seed(mshadow::Stream *s, uint32_t seed); + void Seed(mshadow::Stream* s, uint32_t seed); // export global random states, used by c++ custom operator void* GetStates(); private: - curandStatePhilox4_32_10_t *states_; + curandStatePhilox4_32_10_t* states_; }; // class RandGenerator -template<> +template <> class RandGenerator { public: // uniform number generation in Cuda made consistent with stl (include 0 but exclude 1) @@ -186,14 +187,12 @@ class RandGenerator { // TODO(alexzai): move impl class to separate file - tracked in MXNET-948 class Impl { public: - Impl &operator=(const Impl &) = delete; - Impl(const Impl &) = delete; + Impl& operator=(const Impl&) = delete; + Impl(const Impl&) = delete; // Copy state to local memory for efficiency. - __device__ explicit Impl(RandGenerator *gen, int state_idx) - : global_gen_(gen), - global_state_idx_(state_idx), - state_(*(gen->states_ + state_idx)) {} + __device__ explicit Impl(RandGenerator* gen, int state_idx) + : global_gen_(gen), global_state_idx_(state_idx), state_(*(gen->states_ + state_idx)) {} __device__ ~Impl() { // store the curand state back into global memory @@ -217,13 +216,13 @@ class RandGenerator { } private: - RandGenerator *global_gen_; + RandGenerator* global_gen_; int global_state_idx_; curandStatePhilox4_32_10_t state_; }; // class RandGenerator::Impl private: - curandStatePhilox4_32_10_t *states_; + curandStatePhilox4_32_10_t* states_; }; // class RandGenerator #endif // MXNET_USE_CUDA diff --git a/include/mxnet/resource.h b/include/mxnet/resource.h index b98abe1c997f..b856002cb76f 100644 --- a/include/mxnet/resource.h +++ b/include/mxnet/resource.h @@ -74,15 +74,12 @@ inline std::string __extract_fname(const std::string& path) { } // anonymous namespace #if (defined(__GNUC__) || defined(__GNUG__)) && !defined(__clang__) -#define MXNET_RESOURCE_DEFAULT_NAME_FARG(tag) \ - std::string(tag) \ - + " (" + __extract_fname(__builtin_FILE()) \ - + " +" + std::to_string(__builtin_LINE()) + ")" +#define MXNET_RESOURCE_DEFAULT_NAME_FARG(tag) \ + std::string(tag) + " (" + __extract_fname(__builtin_FILE()) + " +" + \ + std::to_string(__builtin_LINE()) + ")" #else // !__GNUC__ || __clang__ #define MXNET_RESOURCE_DEFAULT_NAME_FARG(tag) \ - std::string(tag) \ - + " (" + __extract_fname(__FILE__) \ - + " +" + std::to_string(__LINE__) + ")" + std::string(tag) + " (" + __extract_fname(__FILE__) + " +" + std::to_string(__LINE__) + ")" #endif // __GNUC__ && !__clang__ /*! @@ -101,7 +98,7 @@ struct Resource { * \brief pointer to the resource, do not use directly, * access using member functions */ - void *ptr_; + void* ptr_; /*! \brief default constructor */ Resource() : id(0) {} /*! @@ -110,12 +107,10 @@ struct Resource { * \return the mshadow random number generator requested. 
* \tparam xpu the device type of random number generator. */ - template - inline mshadow::Random* get_random( - mshadow::Stream *stream) const { + template + inline mshadow::Random* get_random(mshadow::Stream* stream) const { CHECK_EQ(req.type, ResourceRequest::kRandom); - mshadow::Random *ret = - static_cast*>(ptr_); + mshadow::Random* ret = static_cast*>(ptr_); ret->set_stream(stream); return ret; } @@ -126,7 +121,7 @@ struct Resource { * \tparam DType the return type. * \return the parallel random number generator. for gpu, it is allocated on global memory. */ - template + template inline common::random::RandGenerator* get_parallel_random() const { CHECK_EQ(req.type, ResourceRequest::kParallelRandom); return static_cast*>(ptr_); @@ -149,10 +144,11 @@ struct Resource { * \tparam xpu the device type of random number generator. * \tparam ndim the number of dimension of the tensor requested. */ - template + template inline mshadow::Tensor get_space( - mshadow::Shape shape, mshadow::Stream *stream, - const std::string &name = MXNET_RESOURCE_DEFAULT_NAME_FARG("temp_space")) const { + mshadow::Shape shape, + mshadow::Stream* stream, + const std::string& name = MXNET_RESOURCE_DEFAULT_NAME_FARG("temp_space")) const { return get_space_typed(shape, stream, name); } /*! @@ -163,9 +159,8 @@ struct Resource { * \return the mshadow tensor requested. * \tparam ndim the number of dimension of the tensor requested. */ - template - inline mshadow::Tensor get_host_space( - mshadow::Shape shape) const { + template + inline mshadow::Tensor get_host_space(mshadow::Shape shape) const { return get_host_space_typed(shape); } /*! @@ -179,15 +174,17 @@ struct Resource { * \tparam xpu the device type of random number generator. * \tparam ndim the number of dimension of the tensor requested. */ - template + template inline mshadow::Tensor get_space_typed( - mshadow::Shape shape, mshadow::Stream *stream, - const std::string &name = MXNET_RESOURCE_DEFAULT_NAME_FARG("temp_space")) const { + mshadow::Shape shape, + mshadow::Stream* stream, + const std::string& name = MXNET_RESOURCE_DEFAULT_NAME_FARG("temp_space")) const { CHECK_EQ(req.type, ResourceRequest::kTempSpace); return mshadow::Tensor( - reinterpret_cast(get_space_internal( - shape.Size() * sizeof(DType), name)), - shape, shape[ndim - 1], stream); + reinterpret_cast(get_space_internal(shape.Size() * sizeof(DType), name)), + shape, + shape[ndim - 1], + stream); } #if MXNET_USE_CUDNN == 1 /*! @@ -200,10 +197,10 @@ struct Resource { * \return the mshadow tensor requested. */ void get_cudnn_dropout_desc( - cudnnDropoutDescriptor_t *dropout_desc, - mshadow::Stream *stream, + cudnnDropoutDescriptor_t* dropout_desc, + mshadow::Stream* stream, const float dropout, - const std::string &name = MXNET_RESOURCE_DEFAULT_NAME_FARG("cudnn_dropout_state")) const; + const std::string& name = MXNET_RESOURCE_DEFAULT_NAME_FARG("cudnn_dropout_state")) const; #endif // MXNET_USE_CUDNN == 1 /*! @@ -215,12 +212,13 @@ struct Resource { * \tparam ndim the number of dimnesion of tensor requested * \tparam DType request data type */ - template - inline mshadow::Tensor get_host_space_typed( - mshadow::Shape shape) const { - return mshadow::Tensor( + template + inline mshadow::Tensor get_host_space_typed(mshadow::Shape shape) const { + return mshadow::Tensor( reinterpret_cast(get_host_space_internal(shape.Size() * sizeof(DType))), - shape, shape[ndim - 1], nullptr); + shape, + shape[ndim - 1], + nullptr); } /*! * \brief internal function to get space from resources. 
@@ -228,13 +226,13 @@ struct Resource { * \param name the Name of the operator requesting the resource. * \return The allocated space. */ - void* get_space_internal(size_t size, const std::string &name) const; + void* get_space_internal(size_t size, const std::string& name) const; /*! * \brief internal function to get cpu space from resources. * \param size The size of space. * \return The allocated space */ - void *get_host_space_internal(size_t size) const; + void* get_host_space_internal(size_t size) const; }; /*! \brief Global resource manager */ @@ -248,7 +246,7 @@ class ResourceManager { * \note The returned resource's ownership is * still hold by the manager singleton. */ - virtual Resource Request(Context ctx, const ResourceRequest &req) = 0; + virtual Resource Request(Context ctx, const ResourceRequest& req) = 0; /*! * \brief Seed all the allocated random number generators. * \param seed the seed to the random number generators on all devices. @@ -264,7 +262,7 @@ class ResourceManager { /*! * \return Resource manager singleton. */ - static ResourceManager *Get(); + static ResourceManager* Get(); }; } // namespace mxnet #endif // MXNET_RESOURCE_H_ diff --git a/include/mxnet/rtc.h b/include/mxnet/rtc.h index 56717f4a34c7..a87615143bc0 100644 --- a/include/mxnet/rtc.h +++ b/include/mxnet/rtc.h @@ -83,12 +83,19 @@ class CudaModule { class Kernel { public: /*! \brief Launch the kernel */ - void Launch(const Context& ctx, const std::vector& args, - uint32_t grid_dim_x, uint32_t grid_dim_y, uint32_t grid_dim_z, - uint32_t block_dim_x, uint32_t block_dim_y, uint32_t block_dim_z, + void Launch(const Context& ctx, + const std::vector& args, + uint32_t grid_dim_x, + uint32_t grid_dim_y, + uint32_t grid_dim_z, + uint32_t block_dim_x, + uint32_t block_dim_y, + uint32_t block_dim_z, uint32_t shared_mem); /*! \brief kernel interface signature */ - const std::vector& signature() { return signature_; } + const std::vector& signature() { + return signature_; + } private: friend class CudaModule; @@ -125,8 +132,7 @@ class CudaModule { * \param signature kernel signature * \return shared pointer to cuda kernel */ - std::shared_ptr GetKernel(const std::string& name, - const std::vector& signature); + std::shared_ptr GetKernel(const std::string& name, const std::vector& signature); }; } // namespace rtc diff --git a/include/mxnet/runtime/c_runtime_api.h b/include/mxnet/runtime/c_runtime_api.h index 6a2948225ecc..446bd40b682c 100644 --- a/include/mxnet/runtime/c_runtime_api.h +++ b/include/mxnet/runtime/c_runtime_api.h @@ -34,7 +34,6 @@ extern "C" { #include #include - /*! * \brief The type code in MXNetType * \note MXNetType is used in two places. @@ -43,25 +42,25 @@ typedef enum { // The type code of other types are compatible with DLPack. // The next few fields are extension types // that is used by MXNet API calls. - kHandle = 3U, - kNull = 4U, - kMXNetType = 5U, - kMXNetContext = 6U, - kObjectHandle = 7U, - kStr = 8U, - kBytes = 9U, - kPyArg = 10U, + kHandle = 3U, + kNull = 4U, + kMXNetType = 5U, + kMXNetContext = 6U, + kObjectHandle = 7U, + kStr = 8U, + kBytes = 9U, + kPyArg = 10U, kNDArrayHandle = 11U, // Extension codes for other frameworks to integrate MXNet PackedFunc. // To make sure each framework's id do not conflict, use first and // last sections to mark ranges. // Open an issue at the repo if you need a section of code. - kExtBegin = 15U, + kExtBegin = 15U, kNNVMFirst = 16U, - kNNVMLast = 20U, + kNNVMLast = 20U, // The following section of code is used for non-reserved types. 
kExtReserveEnd = 64U, - kExtEnd = 128U, + kExtEnd = 128U, // The rest of the space is used for custom, user-supplied datatypes kCustomBegin = 129U, } MXNetTypeCode; @@ -144,8 +143,7 @@ MXNET_DLL int MXNetFuncGetGlobal(const char* name, MXNetFunctionHandle* out); * \param out_array The array of function names. * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNetFuncListGlobalNames(int* out_size, - const char*** out_array); +MXNET_DLL int MXNetFuncListGlobalNames(int* out_size, const char*** out_array); /*! * \brief Free the object. @@ -157,7 +155,6 @@ MXNET_DLL int MXNetFuncListGlobalNames(int* out_size, */ MXNET_DLL int MXNetObjectFree(MXNetObjectHandle obj); - /*! * \brief Get the type_index from an object. * diff --git a/include/mxnet/runtime/container.h b/include/mxnet/runtime/container.h index fc1d4a173669..56a0ef9d601c 100644 --- a/include/mxnet/runtime/container.h +++ b/include/mxnet/runtime/container.h @@ -105,8 +105,7 @@ class InplaceArrayBase { * \brief Destroy the Inplace Array Base object */ ~InplaceArrayBase() { - if (!(std::is_standard_layout::value && - std::is_trivial::value)) { + if (!(std::is_standard_layout::value && std::is_trivial::value)) { size_t size = Self()->GetSize(); for (size_t i = 0; i < size; ++i) { ElemType* fp = reinterpret_cast(AddressOf(i)); @@ -150,14 +149,14 @@ class InplaceArrayBase { * \return Raw pointer to the element. */ void* AddressOf(size_t idx) const { - static_assert(alignof(ArrayType) % alignof(ElemType) == 0 && - sizeof(ArrayType) % alignof(ElemType) == 0, - "The size and alignment of ArrayType should respect " - "ElemType's alignment."); + static_assert( + alignof(ArrayType) % alignof(ElemType) == 0 && sizeof(ArrayType) % alignof(ElemType) == 0, + "The size and alignment of ArrayType should respect " + "ElemType's alignment."); size_t kDataStart = sizeof(ArrayType); - ArrayType* self = Self(); - char* data_start = reinterpret_cast(self) + kDataStart; + ArrayType* self = Self(); + char* data_start = reinterpret_cast(self) + kDataStart; return data_start + idx * sizeof(ElemType); } }; @@ -171,7 +170,7 @@ class ADTObj : public Object, public InplaceArrayBase { uint32_t size{0}; // The fields of the structure follows directly in memory. - static constexpr const char* _type_key = "MXNet.ADT"; + static constexpr const char* _type_key = "MXNet.ADT"; static constexpr const uint32_t _type_index = TypeIndex::kMXNetADT; MXNET_DECLARE_FINAL_OBJECT_INFO(ADTObj, Object) @@ -179,7 +178,9 @@ class ADTObj : public Object, public InplaceArrayBase { /*! * \return The number of elements in the array. */ - size_t GetSize() const { return size; } + size_t GetSize() const { + return size; + } /*! * \brief Initialize the elements in the array. @@ -191,8 +192,8 @@ class ADTObj : public Object, public InplaceArrayBase { template void Init(Iterator begin, Iterator end) { size_t num_elems = std::distance(begin, end); - this->size = 0; - auto it = begin; + this->size = 0; + auto it = begin; for (size_t i = 0; i < num_elems; ++i) { InplaceArrayBase::EmplaceInit(i, *it++); // Only increment size after the initialization succeeds @@ -213,8 +214,7 @@ class ADT : public ObjectRef { * \param fields The fields of the ADT object. * \return The constructed ADT object reference. */ - ADT(uint32_t tag, std::vector fields) - : ADT(tag, fields.begin(), fields.end()){}; + ADT(uint32_t tag, std::vector fields) : ADT(tag, fields.begin(), fields.end()){}; /*! * \brief construct an ADT object reference. 
@@ -226,8 +226,8 @@ class ADT : public ObjectRef { template ADT(uint32_t tag, Iterator begin, Iterator end) { size_t num_elems = std::distance(begin, end); - auto ptr = make_inplace_array_object(num_elems); - ptr->tag = tag; + auto ptr = make_inplace_array_object(num_elems); + ptr->tag = tag; ptr->Init(begin, end); data_ = std::move(ptr); } @@ -238,8 +238,7 @@ class ADT : public ObjectRef { * \param init The initializer list of fields. * \return The constructed ADT object reference. */ - ADT(uint32_t tag, std::initializer_list init) - : ADT(tag, init.begin(), init.end()){}; + ADT(uint32_t tag, std::initializer_list init) : ADT(tag, init.begin(), init.end()){}; /*! * \brief Access element at index. @@ -254,12 +253,16 @@ class ADT : public ObjectRef { /*! * \brief Return the ADT tag. */ - size_t tag() const { return operator->()->tag; } + size_t tag() const { + return operator->()->tag; + } /*! * \brief Return the number of fields. */ - size_t size() const { return operator->()->size; } + size_t size() const { + return operator->()->size; + } /*! * \brief Construct a tuple object. diff --git a/include/mxnet/runtime/container_ext.h b/include/mxnet/runtime/container_ext.h index acbc02af8fe5..d9f513151fd8 100644 --- a/include/mxnet/runtime/container_ext.h +++ b/include/mxnet/runtime/container_ext.h @@ -83,67 +83,93 @@ class MapObj : public Object { static_assert(sizeof(KVType) == 16 || sizeof(KVType) == 8, "sizeof(KVType) incorrect"); static constexpr const uint32_t _type_index = runtime::TypeIndex::kMXNetMap; - static constexpr const char* _type_key = "MXNet.Map"; + static constexpr const char* _type_key = "MXNet.Map"; MXNET_DECLARE_FINAL_OBJECT_INFO(MapObj, Object); /*! * \brief Number of elements in the MapObj * \return The result */ - size_t size() const { return data_.size(); } + size_t size() const { + return data_.size(); + } /*! * \brief Count the number of times a key exists in the hash map * \param key The indexing key * \return The result, 0 or 1 */ - size_t count(const key_type& key) const { return data_.count(key); } + size_t count(const key_type& key) const { + return data_.count(key); + } /*! * \brief Index value associated with a key, throw exception if the key does not exist * \param key The indexing key * \return The const reference to the value */ - const mapped_type& at(const key_type& key) const { return data_.at(key); } + const mapped_type& at(const key_type& key) const { + return data_.at(key); + } /*! * \brief Index value associated with a key, throw exception if the key does not exist * \param key The indexing key * \return The mutable reference to the value */ - mapped_type& at(const key_type& key) { return data_.at(key); } + mapped_type& at(const key_type& key) { + return data_.at(key); + } /*! \return begin iterator */ - iterator begin() { return data_.begin(); } + iterator begin() { + return data_.begin(); + } /*! \return const begin iterator */ - const_iterator begin() const { return data_.begin(); } + const_iterator begin() const { + return data_.begin(); + } /*! \return end iterator */ - iterator end() { return data_.end(); } + iterator end() { + return data_.end(); + } /*! \return end iterator */ - const_iterator end() const { return data_.end(); } + const_iterator end() const { + return data_.end(); + } /*! 
* \brief Index value associated with a key * \param key The indexing key * \return The iterator of the entry associated with the key, end iterator if not exists */ - const_iterator find(const key_type& key) const { return data_.find(key); } + const_iterator find(const key_type& key) const { + return data_.find(key); + } /*! * \brief Index value associated with a key * \param key The indexing key * \return The iterator of the entry associated with the key, end iterator if not exists */ - iterator find(const key_type& key) { return data_.find(key); } + iterator find(const key_type& key) { + return data_.find(key); + } /*! * \brief Erase the entry associated with the iterator * \param position The iterator */ - void erase(const iterator& position) { data_.erase(position); } + void erase(const iterator& position) { + data_.erase(position); + } /*! * \brief Erase the entry associated with the key, do nothing if not exists * \param key The indexing key */ - void erase(const key_type& key) { data_.erase(key); } + void erase(const key_type& key) { + data_.erase(key); + } /*! * \brief Create an empty container * \return The object created */ - static ObjectPtr Empty() { return make_object(); } + static ObjectPtr Empty() { + return make_object(); + } protected: /*! @@ -156,7 +182,7 @@ template static ObjectPtr CreateFromRange(IterType first, IterType last) { ObjectPtr p = make_object(); - p->data_ = ContainerType(first, last); + p->data_ = ContainerType(first, last); return p; } /*! @@ -165,7 +191,7 @@ * \param map The pointer to the map, can be changed if re-hashing happens */ static void InsertMaybeReHash(const KVType& kv, ObjectPtr* map) { - MapObj* map_node = static_cast(map->get()); + MapObj* map_node = static_cast(map->get()); map_node->data_[kv.first] = kv.second; } /*! @@ -175,7 +201,7 @@ */ static ObjectPtr CopyFrom(MapObj* from) { ObjectPtr p = make_object(); - p->data_ = ContainerType(from->data_.begin(), from->data_.end()); + p->data_ = ContainerType(from->data_.begin(), from->data_.end()); return p; } /*! \brief The real container storing data */ @@ -193,23 +219,28 @@ * \tparam K The key NodeRef type. * \tparam V The value NodeRef type. */ -template ::value>::type, typename = typename std::enable_if::value>::type> class Map : public ObjectRef { public: - using key_type = K; + using key_type = K; using mapped_type = V; class iterator; /*! * \brief default constructor */ - Map() { data_ = MapObj::Empty(); } + Map() { + data_ = MapObj::Empty(); + } /*! * \brief move constructor * \param other source */ - Map(Map&& other) { data_ = std::move(other.data_); } + Map(Map&& other) { + data_ = std::move(other.data_); + } /*! * \brief copy constructor * \param other source @@ -268,13 +299,17 @@ class Map : public ObjectRef { * \param key The key * \return the corresponding element. */ - const V at(const K& key) const { return DowncastNoCheck(GetMapObj()->at(key)); } + const V at(const K& key) const { + return DowncastNoCheck(GetMapObj()->at(key)); + } /*! * \brief Read element from map. * \param key The key * \return the corresponding element. */ - const V operator[](const K& key) const { return this->at(key); } + const V operator[](const K& key) const { + return this->at(key); + } /*! \return The size of the array */ size_t size() const { MapObj* n = GetMapObj(); @@ -286,7 +321,9 @@ return n == nullptr ? 0 : GetMapObj()->count(key); } /*! 
\return whether array is empty */ - bool empty() const { return size() == 0; } + bool empty() const { + return size() == 0; + } /*! * \brief set the Map. * \param key The index key. @@ -297,13 +334,21 @@ class Map : public ObjectRef { MapObj::InsertMaybeReHash(MapObj::KVType(key, value), &data_); } /*! \return begin iterator */ - iterator begin() const { return iterator(GetMapObj()->begin()); } + iterator begin() const { + return iterator(GetMapObj()->begin()); + } /*! \return end iterator */ - iterator end() const { return iterator(GetMapObj()->end()); } + iterator end() const { + return iterator(GetMapObj()->end()); + } /*! \return find the key and returns the associated iterator */ - iterator find(const K& key) const { return iterator(GetMapObj()->find(key)); } + iterator find(const K& key) const { + return iterator(GetMapObj()->find(key)); + } - void erase(const K& key) { CopyOnWrite()->erase(key); } + void erase(const K& key) { + CopyOnWrite()->erase(key); + } /*! * \brief copy on write semantics @@ -328,17 +373,21 @@ class Map : public ObjectRef { class iterator { public: using iterator_category = std::bidirectional_iterator_tag; - using difference_type = int64_t; - using value_type = const std::pair; - using pointer = value_type*; - using reference = value_type; + using difference_type = int64_t; + using value_type = const std::pair; + using pointer = value_type*; + using reference = value_type; iterator() : itr() {} /*! \brief Compare iterators */ - bool operator==(const iterator& other) const { return itr == other.itr; } + bool operator==(const iterator& other) const { + return itr == other.itr; + } /*! \brief Compare iterators */ - bool operator!=(const iterator& other) const { return itr != other.itr; } + bool operator!=(const iterator& other) const { + return itr != other.itr; + } /*! \brief De-referencing iterators is not allowed */ pointer operator->() const = delete; /*! \brief De-reference iterators */ @@ -370,7 +419,9 @@ class Map : public ObjectRef { private: /*! \brief Return data_ as type of pointer of MapObj */ - MapObj* GetMapObj() const { return static_cast(data_.get()); } + MapObj* GetMapObj() const { + return static_cast(data_.get()); + } }; /*! * \brief Merge two Maps. * \param lhs the first Map to merge. * \param rhs the second Map to merge. * \return The merged Map. Original Maps are kept unchanged. */ -template ::value>::type, typename = typename std::enable_if::value>::type> inline Map Merge(Map lhs, const Map& rhs) { @@ -399,7 +451,7 @@ class StringObj : public Object { uint64_t size; static constexpr const uint32_t _type_index = TypeIndex::kMXNetString; - static constexpr const char* _type_key = "MXNet.String"; + static constexpr const char* _type_key = "MXNet.String"; MXNET_DECLARE_FINAL_OBJECT_INFO(StringObj, Object); private: @@ -515,7 +567,9 @@ class String : public ObjectRef { * * \return const char* */ - const char* c_str() const { return get()->data; } + const char* c_str() const { + return get()->data; + } /*! * \brief Return the length of the string * @@ -532,33 +586,41 @@ * * \return size_t string length */ - size_t length() const { return size(); } + size_t length() const { + return size(); + } /*! * \brief Return if the string is empty * * \return true if empty, false otherwise. */ - bool empty() const { return size() == 0; } + bool empty() const { + return size() == 0; + } /*! 
* \brief Return the data pointer * * \return const char* data pointer */ - const char* data() const { return get()->data; } + const char* data() const { + return get()->data; + } /*! * \brief Convert String to an std::string object * * \return std::string */ - operator std::string() const { return std::string{get()->data, size()}; } + operator std::string() const { + return std::string{get()->data, size()}; + } /*! - * \brief Check if a MXNetArgValue can be converted to String, i.e. it can be std::string or String - * \param val The value to be checked - * \return A boolean indicating if val can be converted to String + * \brief Check if a MXNetArgValue can be converted to String, i.e. it can be std::string or + * String \param val The value to be checked \return A boolean indicating if val can be converted + * to String */ inline static bool CanConvertFrom(const MXNetArgValue& val); @@ -636,10 +698,10 @@ class StringObj::FromStd : public StringObj { }; inline String::String(std::string other) { - auto ptr = make_object(std::move(other)); + auto ptr = make_object(std::move(other)); ptr->size = ptr->data_container.size(); ptr->data = ptr->data_container.data(); - data_ = std::move(ptr); + data_ = std::move(ptr); } inline String& String::operator=(std::string other) { @@ -648,7 +710,9 @@ inline String& String::operator=(std::string other) { return *this; } -inline String& String::operator=(const char* other) { return operator=(std::string(other)); } +inline String& String::operator=(const char* other) { + return operator=(std::string(other)); +} inline String operator+(const String& lhs, const String& rhs) { size_t lhs_size = lhs.size(); @@ -681,70 +745,130 @@ inline String operator+(const String& lhs, const char* rhs) { } // Overload < operator -inline bool operator<(const String& lhs, const std::string& rhs) { return lhs.compare(rhs) < 0; } +inline bool operator<(const String& lhs, const std::string& rhs) { + return lhs.compare(rhs) < 0; +} -inline bool operator<(const std::string& lhs, const String& rhs) { return rhs.compare(lhs) > 0; } +inline bool operator<(const std::string& lhs, const String& rhs) { + return rhs.compare(lhs) > 0; +} -inline bool operator<(const String& lhs, const String& rhs) { return lhs.compare(rhs) < 0; } +inline bool operator<(const String& lhs, const String& rhs) { + return lhs.compare(rhs) < 0; +} -inline bool operator<(const String& lhs, const char* rhs) { return lhs.compare(rhs) < 0; } +inline bool operator<(const String& lhs, const char* rhs) { + return lhs.compare(rhs) < 0; +} -inline bool operator<(const char* lhs, const String& rhs) { return rhs.compare(lhs) > 0; } +inline bool operator<(const char* lhs, const String& rhs) { + return rhs.compare(lhs) > 0; +} // Overload > operator -inline bool operator>(const String& lhs, const std::string& rhs) { return lhs.compare(rhs) > 0; } +inline bool operator>(const String& lhs, const std::string& rhs) { + return lhs.compare(rhs) > 0; +} -inline bool operator>(const std::string& lhs, const String& rhs) { return rhs.compare(lhs) < 0; } +inline bool operator>(const std::string& lhs, const String& rhs) { + return rhs.compare(lhs) < 0; +} -inline bool operator>(const String& lhs, const String& rhs) { return lhs.compare(rhs) > 0; } +inline bool operator>(const String& lhs, const String& rhs) { + return lhs.compare(rhs) > 0; +} -inline bool operator>(const String& lhs, const char* rhs) { return lhs.compare(rhs) > 0; } +inline bool operator>(const String& lhs, const char* rhs) { + return lhs.compare(rhs) > 0; +} -inline 
bool operator>(const char* lhs, const String& rhs) { return rhs.compare(lhs) < 0; } +inline bool operator>(const char* lhs, const String& rhs) { + return rhs.compare(lhs) < 0; +} // Overload <= operator -inline bool operator<=(const String& lhs, const std::string& rhs) { return lhs.compare(rhs) <= 0; } +inline bool operator<=(const String& lhs, const std::string& rhs) { + return lhs.compare(rhs) <= 0; +} -inline bool operator<=(const std::string& lhs, const String& rhs) { return rhs.compare(lhs) >= 0; } +inline bool operator<=(const std::string& lhs, const String& rhs) { + return rhs.compare(lhs) >= 0; +} -inline bool operator<=(const String& lhs, const String& rhs) { return lhs.compare(rhs) <= 0; } +inline bool operator<=(const String& lhs, const String& rhs) { + return lhs.compare(rhs) <= 0; +} -inline bool operator<=(const String& lhs, const char* rhs) { return lhs.compare(rhs) <= 0; } +inline bool operator<=(const String& lhs, const char* rhs) { + return lhs.compare(rhs) <= 0; +} -inline bool operator<=(const char* lhs, const String& rhs) { return rhs.compare(lhs) >= 0; } +inline bool operator<=(const char* lhs, const String& rhs) { + return rhs.compare(lhs) >= 0; +} // Overload >= operator -inline bool operator>=(const String& lhs, const std::string& rhs) { return lhs.compare(rhs) >= 0; } +inline bool operator>=(const String& lhs, const std::string& rhs) { + return lhs.compare(rhs) >= 0; +} -inline bool operator>=(const std::string& lhs, const String& rhs) { return rhs.compare(lhs) <= 0; } +inline bool operator>=(const std::string& lhs, const String& rhs) { + return rhs.compare(lhs) <= 0; +} -inline bool operator>=(const String& lhs, const String& rhs) { return lhs.compare(rhs) >= 0; } +inline bool operator>=(const String& lhs, const String& rhs) { + return lhs.compare(rhs) >= 0; +} -inline bool operator>=(const String& lhs, const char* rhs) { return lhs.compare(rhs) >= 0; } +inline bool operator>=(const String& lhs, const char* rhs) { + return lhs.compare(rhs) >= 0; +} -inline bool operator>=(const char* lhs, const String& rhs) { return rhs.compare(rhs) <= 0; } +inline bool operator>=(const char* lhs, const String& rhs) { + return rhs.compare(lhs) <= 0; +} // Overload == operator -inline bool operator==(const String& lhs, const std::string& rhs) { return lhs.compare(rhs) == 0; } +inline bool operator==(const String& lhs, const std::string& rhs) { + return lhs.compare(rhs) == 0; +} -inline bool operator==(const std::string& lhs, const String& rhs) { return rhs.compare(lhs) == 0; } +inline bool operator==(const std::string& lhs, const String& rhs) { + return rhs.compare(lhs) == 0; +} -inline bool operator==(const String& lhs, const String& rhs) { return lhs.compare(rhs) == 0; } +inline bool operator==(const String& lhs, const String& rhs) { + return lhs.compare(rhs) == 0; +} -inline bool operator==(const String& lhs, const char* rhs) { return lhs.compare(rhs) == 0; } +inline bool operator==(const String& lhs, const char* rhs) { + return lhs.compare(rhs) == 0; +} -inline bool operator==(const char* lhs, const String& rhs) { return rhs.compare(lhs) == 0; } +inline bool operator==(const char* lhs, const String& rhs) { + return rhs.compare(lhs) == 0; +} // Overload != operator -inline bool operator!=(const String& lhs, const std::string& rhs) { return lhs.compare(rhs) != 0; } +inline bool operator!=(const String& lhs, const std::string& rhs) { + return lhs.compare(rhs) != 0; +} -inline bool operator!=(const std::string& lhs, const String& rhs) { return rhs.compare(lhs) != 0; } +inline bool 
operator!=(const std::string& lhs, const String& rhs) { + return rhs.compare(lhs) != 0; +} -inline bool operator!=(const String& lhs, const String& rhs) { return lhs.compare(rhs) != 0; } +inline bool operator!=(const String& lhs, const String& rhs) { + return lhs.compare(rhs) != 0; +} -inline bool operator!=(const String& lhs, const char* rhs) { return lhs.compare(rhs) != 0; } +inline bool operator!=(const String& lhs, const char* rhs) { + return lhs.compare(rhs) != 0; +} -inline bool operator!=(const char* lhs, const String& rhs) { return rhs.compare(lhs) != 0; } +inline bool operator!=(const char* lhs, const String& rhs) { + return rhs.compare(lhs) != 0; +} inline std::ostream& operator<<(std::ostream& out, const String& input) { out.write(input.data(), input.size()); @@ -752,11 +876,14 @@ inline std::ostream& operator<<(std::ostream& out, const String& input) { } inline int String::memncmp(const char* lhs, const char* rhs, size_t lhs_count, size_t rhs_count) { - if (lhs == rhs && lhs_count == rhs_count) return 0; + if (lhs == rhs && lhs_count == rhs_count) + return 0; for (size_t i = 0; i < lhs_count && i < rhs_count; ++i) { - if (lhs[i] < rhs[i]) return -1; - if (lhs[i] > rhs[i]) return 1; + if (lhs[i] < rhs[i]) + return -1; + if (lhs[i] > rhs[i]) + return 1; } if (lhs_count < rhs_count) { return -1; diff --git a/include/mxnet/runtime/data_type.h b/include/mxnet/runtime/data_type.h index 01d776322e68..78c41bead76d 100644 --- a/include/mxnet/runtime/data_type.h +++ b/include/mxnet/runtime/data_type.h @@ -29,7 +29,6 @@ #include #include - namespace mxnet { namespace runtime { /*! @@ -42,9 +41,9 @@ class MXNetDataType { public: /*! \brief Type code for the MXNetDataType. */ enum TypeCode { - kInt = kDLInt, - kUInt = kDLUInt, - kFloat = kDLFloat, + kInt = kDLInt, + kUInt = kDLUInt, + kFloat = kDLFloat, kHandle = MXNetTypeCode::kHandle, }; /*! \brief default constructor */ @@ -53,8 +52,7 @@ class MXNetDataType { * \brief Constructor * \param dtype The DLDataType */ - explicit MXNetDataType(DLDataType dtype) - : data_(dtype) {} + explicit MXNetDataType(DLDataType dtype) : data_(dtype) {} /*! * \brief Constructor * \param code The type code. @@ -62,8 +60,8 @@ class MXNetDataType { * \param lanes The number of lanes. */ MXNetDataType(int code, int bits, int lanes) { - data_.code = static_cast(code); - data_.bits = static_cast(bits); + data_.code = static_cast(code); + data_.bits = static_cast(bits); data_.lanes = static_cast(lanes); } /*! \return The type code. */ @@ -139,10 +137,8 @@ class MXNetDataType { * \return The comparison resilt. */ bool operator==(const MXNetDataType& other) const { - return - data_.code == other.data_.code && - data_.bits == other.data_.bits && - data_.lanes == other.data_.lanes; + return data_.code == other.data_.code && data_.bits == other.data_.bits && + data_.lanes == other.data_.lanes; } /*! * \brief NotEqual comparator. @@ -156,7 +152,7 @@ class MXNetDataType { * \brief Converter to DLDataType * \return the result. 
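 * For example (illustrative, inferred from the three-argument constructor
 * above), a value built with MXNetDataType(kDLFloat, 32, 1) converts to a
 * DLDataType whose code is kDLFloat, bits is 32 and lanes is 1.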
*/ - operator DLDataType () const { + operator DLDataType() const { return data_; } diff --git a/include/mxnet/runtime/ffi_helper.h b/include/mxnet/runtime/ffi_helper.h index cfc79a6c4f47..83896dd8bbe2 100644 --- a/include/mxnet/runtime/ffi_helper.h +++ b/include/mxnet/runtime/ffi_helper.h @@ -37,7 +37,7 @@ namespace runtime { class EllipsisObj : public Object { public: static constexpr const uint32_t _type_index = TypeIndex::kEllipsis; - static constexpr const char* _type_key = "MXNet.Ellipsis"; + static constexpr const char* _type_key = "MXNet.Ellipsis"; MXNET_DECLARE_FINAL_OBJECT_INFO(EllipsisObj, Object) }; @@ -53,23 +53,23 @@ class SliceObj : public Object { int64_t step; static constexpr const uint32_t _type_index = TypeIndex::kSlice; - static constexpr const char* _type_key = "MXNet.Slice"; + static constexpr const char* _type_key = "MXNet.Slice"; MXNET_DECLARE_FINAL_OBJECT_INFO(SliceObj, Object) }; class Slice : public ObjectRef { public: - explicit inline Slice(int64_t start, int64_t stop, int64_t step, + explicit inline Slice(int64_t start, + int64_t stop, + int64_t step, ObjectPtr&& data = make_object()) { data->start = start; - data->stop = stop; - data->step = step; - data_ = std::move(data); + data->stop = stop; + data->step = step; + data_ = std::move(data); } - explicit inline Slice(int64_t stop) - : Slice(kNoneValue, stop, kNoneValue) { - } + explicit inline Slice(int64_t stop) : Slice(kNoneValue, stop, kNoneValue) {} // constant to represent None. static constexpr int64_t kNoneValue = std::numeric_limits::min(); @@ -81,38 +81,36 @@ int64_t inline SliceNoneValue() { return Slice::kNoneValue; } -class IntegerObj: public Object { +class IntegerObj : public Object { public: int64_t value; static constexpr const uint32_t _type_index = TypeIndex::kInteger; - static constexpr const char* _type_key = "MXNet.Integer"; + static constexpr const char* _type_key = "MXNet.Integer"; MXNET_DECLARE_FINAL_OBJECT_INFO(IntegerObj, Object) }; -class Integer: public ObjectRef { +class Integer : public ObjectRef { public: - explicit Integer(int64_t value, - ObjectPtr&& data = make_object()) { + explicit Integer(int64_t value, ObjectPtr&& data = make_object()) { data->value = value; - data_ = std::move(data); + data_ = std::move(data); } MXNET_DEFINE_OBJECT_REF_METHODS(Integer, ObjectRef, IntegerObj) }; -class FloatObj: public Object { +class FloatObj : public Object { public: double value; static constexpr const uint32_t _type_index = TypeIndex::kFloat; - static constexpr const char* _type_key = "MXNet.Float"; + static constexpr const char* _type_key = "MXNet.Float"; MXNET_DECLARE_FINAL_OBJECT_INFO(FloatObj, Object) }; -class Float: public ObjectRef { +class Float : public ObjectRef { public: - explicit Float(double value, - ObjectPtr&& data = make_object()) { + explicit Float(double value, ObjectPtr&& data = make_object()) { data->value = value; - data_ = std::move(data); + data_ = std::move(data); } MXNET_DEFINE_OBJECT_REF_METHODS(Float, ObjectRef, FloatObj) }; diff --git a/include/mxnet/runtime/memory.h b/include/mxnet/runtime/memory.h index ea4b5a409d1e..057c7c3d3689 100644 --- a/include/mxnet/runtime/memory.h +++ b/include/mxnet/runtime/memory.h @@ -37,7 +37,7 @@ namespace runtime { * \tparam T the node type. * \return The ObjectPtr to the allocated object. */ -template +template inline ObjectPtr make_object(Args&&... args); // Detail implementations after this @@ -56,7 +56,7 @@ inline ObjectPtr make_object(Args&&... args); * * \tparam Derived The derived class. 
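 * (CRTP note: Derived is expected to supply the Handler and ArrayHandler
 * policy types that make_object and make_inplace_array below dispatch
 * through.)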
*/ -template +template class ObjAllocatorBase { public: /*! @@ -65,15 +65,13 @@ class ObjAllocatorBase { * \tparam Args The constructor signature. * \param args The arguments. */ - template + template inline ObjectPtr make_object(Args&&... args) { using Handler = typename Derived::template Handler; - static_assert(std::is_base_of::value, - "make can only be used to create Object"); - T* ptr = Handler::New(static_cast(this), - std::forward(args)...); + static_assert(std::is_base_of::value, "make can only be used to create Object"); + T* ptr = Handler::New(static_cast(this), std::forward(args)...); ptr->type_index_ = T::RuntimeTypeIndex(); - ptr->deleter_ = Handler::Deleter(); + ptr->deleter_ = Handler::Deleter(); return ObjectPtr(ptr); } @@ -84,30 +82,28 @@ class ObjAllocatorBase { * \param num_elems The number of array elements. * \param args The arguments. */ - template + template inline ObjectPtr make_inplace_array(size_t num_elems, Args&&... args) { using Handler = typename Derived::template ArrayHandler; static_assert(std::is_base_of::value, "make_inplace_array can only be used to create Object"); - ArrayType* ptr = Handler::New(static_cast(this), - num_elems, - std::forward(args)...); + ArrayType* ptr = + Handler::New(static_cast(this), num_elems, std::forward(args)...); ptr->type_index_ = ArrayType::RuntimeTypeIndex(); - ptr->deleter_ = Handler::Deleter(); + ptr->deleter_ = Handler::Deleter(); return ObjectPtr(ptr); } }; // Simple allocator that uses new/delete. -class SimpleObjAllocator : - public ObjAllocatorBase { +class SimpleObjAllocator : public ObjAllocatorBase { public: - template + template class Handler { public: using StorageType = typename std::aligned_storage::type; - template + template static T* New(SimpleObjAllocator*, Args&&... args) { // NOTE: the first argument is not needed for SimpleObjAllocator // It is reserved for special allocators that needs to recycle @@ -147,16 +143,16 @@ class SimpleObjAllocator : }; // Array handler that uses new/delete. - template + template class ArrayHandler { public: using StorageType = typename std::aligned_storage::type; // for now only support elements that aligns with array header. static_assert(alignof(ArrayType) % alignof(ElemType) == 0 && - sizeof(ArrayType) % alignof(ElemType) == 0, + sizeof(ArrayType) % alignof(ElemType) == 0, "element alignment constraint"); - template + template static ArrayType* New(SimpleObjAllocator*, size_t num_elems, Args&&... args) { // NOTE: the first argument is not needed for ArrayObjAllocator // It is reserved for special allocators that needs to recycle @@ -170,10 +166,10 @@ class SimpleObjAllocator : // class with non-virtual destructor. // We are fine here as we captured the right deleter during construction. // This is also the right way to get storage type for an object pool. - size_t unit = sizeof(StorageType); - size_t requested_size = num_elems * sizeof(ElemType) + sizeof(ArrayType); + size_t unit = sizeof(StorageType); + size_t requested_size = num_elems * sizeof(ElemType) + sizeof(ArrayType); size_t num_storage_slots = (requested_size + unit - 1) / unit; - StorageType* data = new StorageType[num_storage_slots]; + StorageType* data = new StorageType[num_storage_slots]; new (data) ArrayType(std::forward(args)...); return reinterpret_cast(data); } @@ -194,20 +190,20 @@ class SimpleObjAllocator : // call a virtual destructor(which may not be available and is not required). 
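      // The storage was obtained with placement new over a StorageType buffer
      // in ArrayHandler::New, so the explicit destructor call plus the
      // delete[] below releases exactly that allocation.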
tptr->ArrayType::~ArrayType(); StorageType* p = reinterpret_cast(tptr); - delete []p; + delete[] p; } }; }; -template +template inline ObjectPtr make_object(Args&&... args) { return SimpleObjAllocator().make_object(std::forward(args)...); } -template +template inline ObjectPtr make_inplace_array_object(size_t num_elems, Args&&... args) { - return SimpleObjAllocator().make_inplace_array( - num_elems, std::forward(args)...); + return SimpleObjAllocator().make_inplace_array(num_elems, + std::forward(args)...); } } // namespace runtime diff --git a/include/mxnet/runtime/ndarray.h b/include/mxnet/runtime/ndarray.h index 317c3239092d..666fc12a6787 100644 --- a/include/mxnet/runtime/ndarray.h +++ b/include/mxnet/runtime/ndarray.h @@ -34,7 +34,7 @@ namespace runtime { * For TVM NDArray itself, code = 0. * All subclasses of NDArray should override code > 0. */ -template +template struct array_type_info { /*! \brief the value of the traits */ static const int code = -1; diff --git a/include/mxnet/runtime/ndarray_handle.h b/include/mxnet/runtime/ndarray_handle.h index 22ebc2c09048..d8d2819b966f 100644 --- a/include/mxnet/runtime/ndarray_handle.h +++ b/include/mxnet/runtime/ndarray_handle.h @@ -41,8 +41,8 @@ class NDArrayHandle : public ObjectRef { public: explicit NDArrayHandle(NDArray* value) { runtime::ObjectPtr node = make_object(); - node->value = *value; - data_ = std::move(node); + node->value = *value; + data_ = std::move(node); } inline NDArray* getArray() const { return static_cast(&(static_cast(data_.get())->value)); diff --git a/include/mxnet/runtime/object.h b/include/mxnet/runtime/object.h index 0b679c7fefd8..d8ec1ee2fd50 100644 --- a/include/mxnet/runtime/object.h +++ b/include/mxnet/runtime/object.h @@ -48,18 +48,18 @@ namespace mxnet { namespace runtime { /*! \brief list of the type index. */ -enum TypeIndex { +enum TypeIndex { /*! \brief Root object type. */ - kRoot = 0, - kMXNetTensor = 1, + kRoot = 0, + kMXNetTensor = 1, kMXNetClosure = 2, - kMXNetADT = 3, - kMXNetMap = 4, - kMXNetString = 5, - kEllipsis = 6, - kSlice = 7, - kInteger = 8, - kFloat = 9, + kMXNetADT = 3, + kMXNetMap = 4, + kMXNetString = 5, + kEllipsis = 6, + kSlice = 7, + kInteger = 8, + kFloat = 9, kStaticIndexEnd, /*! \brief Type index is allocated during runtime. */ kDynamic = kStaticIndexEnd @@ -93,8 +93,8 @@ enum TypeIndex { * Recommendation: set to estimate number of children needed. * - _type_child_slots_can_overflow: * Whether we can add additional child classes even if the number of child classes - * exceeds the _type_child_slots. A fallback mechanism to check global type table will be used. - * Recommendation: set to false for optimal runtime speed if we know exact number of children. + * exceeds the _type_child_slots. A fallback mechanism to check global type table will be + * used. Recommendation: set to false for optimal runtime speed if we know exact number of children. * * Two macros are used to declare helper functions in the object: * - Use MXNET_DECLARE_BASE_OBJECT_INFO for object classes that can be sub-classed. @@ -177,7 +177,7 @@ class Object { * \tparam TargetType The target type to be checked. * \return Whether the target type is true. */ - template + template inline bool IsInstance() const; /*! 
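To keep the object-system hunks above and below reviewable, here is an illustrative sketch (not part of the patch) of how a final object type plugs into this machinery. MyObj, its payload field, and ObjectSketch are invented names, and the template argument lists, which the reflowed context lines drop, are reconstructed assumptions:

    #include <mxnet/runtime/memory.h>
    #include <mxnet/runtime/object.h>

    using namespace mxnet::runtime;

    // A final object type with a dynamically allocated type index; it inherits
    // _type_index = TypeIndex::kDynamic from Object, per the note above about
    // the default value for sub-classes.
    class MyObj : public Object {
     public:
      int payload{0};

      static constexpr const char* _type_key = "MXNet.MyObj";
      MXNET_DECLARE_FINAL_OBJECT_INFO(MyObj, Object)
    };
    // In a .cc file one would also add: MXNET_REGISTER_OBJECT_TYPE(MyObj);

    void ObjectSketch() {
      // SimpleObjAllocator news the storage; make_object then stamps in
      // type_index_ and deleter_ before wrapping the pointer in an ObjectPtr.
      ObjectPtr<MyObj> p = make_object<MyObj>();
      p->payload = 42;
      bool ok = p->IsInstance<MyObj>();  // resolved via the runtime type index
      (void)ok;
    }

The deleter_ captured at allocation time is what later lets reference-count release destroy the object without requiring a virtual destructor.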
@@ -215,8 +215,8 @@ class Object { } // Default object type properties for sub-classes - static constexpr bool _type_final = false; - static constexpr uint32_t _type_child_slots = 0; + static constexpr bool _type_final = false; + static constexpr uint32_t _type_child_slots = 0; static constexpr bool _type_child_slots_can_overflow = true; // NOTE: the following field is not type index of Object // but was intended to be used by sub-classes as default value. @@ -234,10 +234,10 @@ class Object { } Object(Object&& other) { // NOLINT(*) } - Object& operator=(const Object& other) { //NOLINT(*) + Object& operator=(const Object& other) { // NOLINT(*) return *this; } - Object& operator=(Object&& other) { //NOLINT(*) + Object& operator=(Object&& other) { // NOLINT(*) return *this; } @@ -255,7 +255,7 @@ class Object { FDeleter deleter_ = nullptr; // Invariant checks. static_assert(sizeof(int32_t) == sizeof(RefCounterType) && - alignof(int32_t) == sizeof(RefCounterType), + alignof(int32_t) == sizeof(RefCounterType), "RefCounter ABI check."); /*! @@ -275,12 +275,11 @@ class Object { * \param type_child_slots_can_overflow Whether to allow child to overflow the slots. * \return The allocated type index. */ - MXNET_DLL static uint32_t GetOrAllocRuntimeTypeIndex( - const std::string& key, - uint32_t static_tindex, - uint32_t parent_tindex, - uint32_t type_child_slots, - bool type_child_slots_can_overflow); + MXNET_DLL static uint32_t GetOrAllocRuntimeTypeIndex(const std::string& key, + uint32_t static_tindex, + uint32_t parent_tindex, + uint32_t type_child_slots, + bool type_child_slots_can_overflow); // reference counter related operations /*! \brief developer function, increases reference counter. */ @@ -304,9 +303,9 @@ class Object { */ MXNET_DLL bool DerivedFrom(uint32_t parent_tindex) const; // friend classes - template + template friend class ObjAllocatorBase; - template + template friend class ObjectPtr; friend class MXNetRetValue; friend class ObjectInternal; @@ -483,9 +482,9 @@ class ObjectPtr { friend class Object; friend class ObjectRef; friend struct ObjectHash; - template + template friend class ObjectPtr; - template + template friend class ObjAllocatorBase; friend class MXNetPODValue_; friend class MXNetArgsSetter; @@ -584,7 +583,7 @@ class ObjectRef { * \tparam T The target reference type. * \return The casted result. */ - template + template static T DowncastNoCheck(ObjectRef ref) { return T(std::move(ref.data_)); } @@ -594,7 +593,7 @@ class ObjectRef { * \tparam ObjectType The corresponding object type. * \return the corresponding type. */ - template + template static ObjectPtr GetDataPtr(const ObjectRef& ref) { return ObjectPtr(ref.data_.data_); } @@ -623,56 +622,53 @@ struct ObjectHash { return operator()(a.data_); } - template + template size_t operator()(const ObjectPtr& a) const { return std::hash()(a.get()); } }; - /*! \brief ObjectRef equal functor */ struct ObjectEqual { bool operator()(const ObjectRef& a, const ObjectRef& b) const { return a.same_as(b); } - template + template size_t operator()(const ObjectPtr& a, const ObjectPtr& b) const { return a == b; } }; - /*! * \brief helper macro to declare a base object type that can be inheritated. * \param TypeName The name of the current type. * \param ParentType The name of the ParentType */ -#define MXNET_DECLARE_BASE_OBJECT_INFO(TypeName, ParentType) \ - static uint32_t RuntimeTypeIndex() { \ - return TypeName::_type_index != ::mxnet::runtime::TypeIndex::kDynamic ? 
\ - TypeName::_type_index : _GetOrAllocRuntimeTypeIndex(); \ - } \ - static uint32_t _GetOrAllocRuntimeTypeIndex() { \ - static uint32_t tidx = GetOrAllocRuntimeTypeIndex( \ - TypeName::_type_key, \ - TypeName::_type_index, \ - ParentType::_GetOrAllocRuntimeTypeIndex(), \ - TypeName::_type_child_slots, \ - TypeName::_type_child_slots_can_overflow); \ - return tidx; \ +#define MXNET_DECLARE_BASE_OBJECT_INFO(TypeName, ParentType) \ + static uint32_t RuntimeTypeIndex() { \ + return TypeName::_type_index != ::mxnet::runtime::TypeIndex::kDynamic ? \ + TypeName::_type_index : \ + _GetOrAllocRuntimeTypeIndex(); \ + } \ + static uint32_t _GetOrAllocRuntimeTypeIndex() { \ + static uint32_t tidx = GetOrAllocRuntimeTypeIndex(TypeName::_type_key, \ + TypeName::_type_index, \ + ParentType::_GetOrAllocRuntimeTypeIndex(), \ + TypeName::_type_child_slots, \ + TypeName::_type_child_slots_can_overflow); \ + return tidx; \ } /*! * \brief helper macro to declare type information in a final class. - * \param TypeName The name of the current type. - * \param ParentType The name of the ParentType - */ -#define MXNET_DECLARE_FINAL_OBJECT_INFO(TypeName, ParentType) \ - static const constexpr bool _type_final = true; \ - static const constexpr int _type_child_slots = 0; \ - MXNET_DECLARE_BASE_OBJECT_INFO(TypeName, ParentType) \ - + * \param TypeName The name of the current type. + * \param ParentType The name of the ParentType + */ +#define MXNET_DECLARE_FINAL_OBJECT_INFO(TypeName, ParentType) \ + static const constexpr bool _type_final = true; \ + static const constexpr int _type_child_slots = 0; \ + MXNET_DECLARE_BASE_OBJECT_INFO(TypeName, ParentType) /*! * \brief Helper macro to register the object type to runtime. @@ -680,45 +676,49 @@ struct ObjectEqual { * * Use this macro in the cc file for each terminal class. 
*/ -#define MXNET_REGISTER_OBJECT_TYPE(TypeName) \ - static DMLC_ATTRIBUTE_UNUSED uint32_t __make_Object_tidx ## _ ## TypeName ## __ = \ +#define MXNET_REGISTER_OBJECT_TYPE(TypeName) \ + static DMLC_ATTRIBUTE_UNUSED uint32_t __make_Object_tidx##_##TypeName##__ = \ TypeName::_GetOrAllocRuntimeTypeIndex() #define MXNET_DEFINE_DEFAULT_COPY_MOVE_AND_ASSIGN(TypeName) \ TypeName(const TypeName& other) = default; \ - TypeName(TypeName&& other) = default; \ + TypeName(TypeName&& other) = default; \ TypeName& operator=(const TypeName& other) = default; \ TypeName& operator=(TypeName&& other) = default; -#define MXNET_DEFINE_OBJECT_REF_METHODS(TypeName, ParentType, ObjectName) \ - TypeName() {} \ - explicit TypeName( \ - ::mxnet::runtime::ObjectPtr<::mxnet::runtime::Object> n) \ - : ParentType(n) {} \ - const ObjectName* operator->() const { \ - return static_cast(data_.get()); \ - } \ - operator bool() const { return data_ != nullptr; } \ +#define MXNET_DEFINE_OBJECT_REF_METHODS(TypeName, ParentType, ObjectName) \ + TypeName() {} \ + explicit TypeName(::mxnet::runtime::ObjectPtr<::mxnet::runtime::Object> n) : ParentType(n) {} \ + const ObjectName* operator->() const { \ + return static_cast(data_.get()); \ + } \ + operator bool() const { \ + return data_ != nullptr; \ + } \ using ContainerType = ObjectName; -#define MXNET_DEFINE_OBJECT_REF_METHODS_MUT(TypeName, ParentType, ObjectName) \ - TypeName() {} \ - explicit TypeName( \ - ::mxnet::runtime::ObjectPtr<::mxnet::runtime::Object> n) \ - : ParentType(n) {} \ - ObjectName* operator->() { \ - return static_cast(data_.get()); \ - } \ - operator bool() const { return data_ != nullptr; } \ +#define MXNET_DEFINE_OBJECT_REF_METHODS_MUT(TypeName, ParentType, ObjectName) \ + TypeName() {} \ + explicit TypeName(::mxnet::runtime::ObjectPtr<::mxnet::runtime::Object> n) : ParentType(n) {} \ + ObjectName* operator->() { \ + return static_cast(data_.get()); \ + } \ + operator bool() const { \ + return data_ != nullptr; \ + } \ using ContainerType = ObjectName; -#define MXNET_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(TypeName, ParentType, ObjectName) \ - explicit TypeName(::mxnet::runtime::ObjectPtr<::mxnet::runtime::Object> n) : ParentType(n) {} \ - MXNET_DEFINE_DEFAULT_COPY_MOVE_AND_ASSIGN(TypeName); \ - const ObjectName* operator->() const { return static_cast(data_.get()); } \ - const ObjectName* get() const { return operator->(); } \ - static constexpr bool _type_is_nullable = false; \ - using ContainerType = ObjectName; +#define MXNET_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(TypeName, ParentType, ObjectName) \ + explicit TypeName(::mxnet::runtime::ObjectPtr<::mxnet::runtime::Object> n) : ParentType(n) {} \ + MXNET_DEFINE_DEFAULT_COPY_MOVE_AND_ASSIGN(TypeName); \ + const ObjectName* operator->() const { \ + return static_cast(data_.get()); \ + } \ + const ObjectName* get() const { \ + return operator->(); \ + } \ + static constexpr bool _type_is_nullable = false; \ + using ContainerType = ObjectName; // Implementations details below // Object reference counting. @@ -761,14 +761,15 @@ inline int Object::use_count() const { #endif // MXNET_OBJECT_ATOMIC_REF_COUNTER -template +template inline bool Object::IsInstance() const { const Object* self = this; // NOTE: the following code can be optimized by // compiler dead-code elimination for already known constants. if (self != nullptr) { // Everything is a subclass of object. 
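    // (so a TargetType of Object itself short-circuits to true immediately)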
- if (std::is_same::value) return true; + if (std::is_same::value) + return true; if (TargetType::_type_final) { // if the target type is a final type // then we only need to check the equivalence. @@ -780,13 +781,17 @@ inline bool Object::IsInstance() const { // The condition will be optimized by constant-folding. if (TargetType::_type_child_slots != 0) { uint32_t end = begin + TargetType::_type_child_slots; - if (self->type_index_ >= begin && self->type_index_ < end) return true; + if (self->type_index_ >= begin && self->type_index_ < end) + return true; } else { - if (self->type_index_ == begin) return true; + if (self->type_index_ == begin) + return true; } - if (!TargetType::_type_child_slots_can_overflow) return false; + if (!TargetType::_type_child_slots_can_overflow) + return false; // Invariance: parent index is always smaller than the child. - if (self->type_index_ < TargetType::RuntimeTypeIndex()) return false; + if (self->type_index_ < TargetType::RuntimeTypeIndex()) + return false; // The rare slower-path, check type hierachy. return self->DerivedFrom(TargetType::RuntimeTypeIndex()); } @@ -795,11 +800,9 @@ inline bool Object::IsInstance() const { } } - template inline const ObjectType* ObjectRef::as() const { - if (data_ != nullptr && - data_->IsInstance()) { + if (data_ != nullptr && data_->IsInstance()) { return static_cast(data_.get()); } else { return nullptr; @@ -827,8 +830,8 @@ template inline SubRef Downcast(BaseRef ref) { if (ref.defined()) { CHECK(ref->template IsInstance()) - << "Downcast from " << ref->GetTypeKey() << " to " - << SubRef::ContainerType::_type_key << " failed."; + << "Downcast from " << ref->GetTypeKey() << " to " << SubRef::ContainerType::_type_key + << " failed."; } else { CHECK(SubRef::_type_is_nullable) << "Downcast from nullptr to not nullable reference of " << SubRef::ContainerType::_type_key; @@ -838,7 +841,7 @@ inline SubRef Downcast(BaseRef ref) { } // namespace runtime -template +template using NodePtr = runtime::ObjectPtr; } // namespace mxnet diff --git a/include/mxnet/runtime/packed_func.h b/include/mxnet/runtime/packed_func.h index 40ad7bb31ba6..1b5035afd690 100644 --- a/include/mxnet/runtime/packed_func.h +++ b/include/mxnet/runtime/packed_func.h @@ -97,7 +97,7 @@ class PackedFunc { * } * \endcode */ - using FType = std::function; + using FType = std::function; /*! \brief default constructor */ PackedFunc() {} /*! \brief constructor from null */ @@ -121,8 +121,8 @@ class PackedFunc { * } * \endcode */ - template - inline MXNetRetValue operator()(Args&& ...args) const; + template + inline MXNetRetValue operator()(Args&&... args) const; /*! * \brief Call the function in packed format. * \param args The arguments @@ -148,7 +148,7 @@ class PackedFunc { /*! * \brief Please refer to \ref TypedPackedFuncAnchor "TypedPackedFunc" */ -template +template class TypedPackedFunc; /*! @@ -183,7 +183,7 @@ class TypedPackedFunc; * \tparam R The return value of the function. * \tparam Args The argument signature of the function. */ -template +template class TypedPackedFunc { public: /*! \brief short hand for this function type */ @@ -235,11 +235,10 @@ class TypedPackedFunc { * \param typed_lambda typed lambda function. * \tparam FLambda the type of the lambda function. */ - template - >::value>::type> + template >::value>::type> TypedPackedFunc(const FLambda& typed_lambda) { // NOLINT(*) this->AssignTypedLambda(typed_lambda); } @@ -259,11 +258,10 @@ class TypedPackedFunc { * \tparam FLambda the type of the lambda function. * \returns reference to self. 
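   * (Like the converting constructor above, this re-wraps the typed lambda as
   * a PackedFunc through AssignTypedLambda.)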
*/ - template - >::value>::type> + template >::value>::type> TSelf& operator=(FLambda typed_lambda) { // NOLINT(*) this->AssignTypedLambda(typed_lambda); return *this; @@ -282,7 +280,7 @@ class TypedPackedFunc { * \param args The arguments * \returns The return value. */ - inline R operator()(Args ...args) const; + inline R operator()(Args... args) const; /*! * \brief convert to PackedFunc * \return the internal PackedFunc @@ -316,7 +314,7 @@ class TypedPackedFunc { * \tparam FLambda The lambda function type. * \note We capture the lambda when possible for maximum efficiency. */ - template + template inline void AssignTypedLambda(FLambda flambda); }; @@ -332,12 +330,8 @@ class MXNetArgs { * \param type_codes The argument type codes * \param num_args number of arguments. */ - MXNetArgs(const MXNetValue* values, - const int* type_codes, - int num_args) - : values(values), - type_codes(type_codes), - num_args(num_args) { } + MXNetArgs(const MXNetValue* values, const int* type_codes, int num_args) + : values(values), type_codes(type_codes), num_args(num_args) {} /*! \return size of the arguments */ inline int size() const; /*! @@ -363,9 +357,8 @@ inline const char* TypeCode2Str(int type_code); // inline TVMType String2TVMType(std::string s); // macro to check type code. -#define MXNET_CHECK_TYPE_CODE(CODE, T) \ - CHECK_EQ(CODE, T) << " expected " \ - << TypeCode2Str(T) << " but get " << TypeCode2Str(CODE) \ +#define MXNET_CHECK_TYPE_CODE(CODE, T) \ + CHECK_EQ(CODE, T) << " expected " << TypeCode2Str(T) << " but get " << TypeCode2Str(CODE) /*! * \brief Type traits to mark if a class is tvm extension type. @@ -378,7 +371,7 @@ inline const char* TypeCode2Str(int type_code); * * \tparam T the typename */ -template +template struct extension_type_info { static const int code = 0; }; @@ -391,7 +384,8 @@ template struct ObjectTypeChecker { static bool Check(const Object* ptr) { using ContainerType = typename T::ContainerType; - if (ptr == nullptr) return T::_type_is_nullable; + if (ptr == nullptr) + return T::_type_is_nullable; return ptr->IsInstance(); } static std::string TypeName() { @@ -426,8 +420,7 @@ class MXNetPODValue_ { } operator int() const { MXNET_CHECK_TYPE_CODE(type_code_, kDLInt); - CHECK_LE(value_.v_int64, - std::numeric_limits::max()); + CHECK_LE(value_.v_int64, std::numeric_limits::max()); return static_cast(value_.v_int64); } operator bool() const { @@ -435,7 +428,8 @@ class MXNetPODValue_ { return value_.v_int64 != 0; } operator void*() const { - if (type_code_ == kNull) return nullptr; + if (type_code_ == kNull) + return nullptr; MXNET_CHECK_TYPE_CODE(type_code_, kHandle); return value_.v_handle; } @@ -444,12 +438,10 @@ class MXNetPODValue_ { return ObjectRef(ObjectPtr(nullptr)); } MXNET_CHECK_TYPE_CODE(type_code_, kObjectHandle); - return ObjectRef( - ObjectPtr(static_cast(value_.v_handle))); + return ObjectRef(ObjectPtr(static_cast(value_.v_handle))); } - template::value>::type> + template ::value>::type> inline bool IsObjectRef() const; template inline TObjectRef AsObjectRef() const; @@ -462,7 +454,7 @@ class MXNetPODValue_ { * \tparam T the data type. * \return The pointer type. 
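   * (It reinterprets the stored value_.v_handle, so it is only meaningful when
   * the current type code designates a pointer-backed payload such as kStr.)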
*/ - template + template T* ptr() const { return static_cast(value_.v_handle); } @@ -471,8 +463,7 @@ class MXNetPODValue_ { friend class MXNetArgsSetter; friend class MXNetRetValue; MXNetPODValue_() : type_code_(kNull) {} - MXNetPODValue_(MXNetValue value, int type_code) - : value_(value), type_code_(type_code) {} + MXNetPODValue_(MXNetValue value, int type_code) : value_(value), type_code_(type_code) {} /*! \brief The value */ MXNetValue value_; @@ -495,9 +486,7 @@ class MXNetArgValue : public MXNetPODValue_ { * \param value of the function * \param type_code The type code. */ - MXNetArgValue(MXNetValue value, int type_code) - : MXNetPODValue_(value, type_code) { - } + MXNetArgValue(MXNetValue value, int type_code) : MXNetPODValue_(value, type_code) {} // reuse converter from parent using MXNetPODValue_::operator double; using MXNetPODValue_::operator int64_t; @@ -506,8 +495,8 @@ class MXNetArgValue : public MXNetPODValue_ { using MXNetPODValue_::operator bool; using MXNetPODValue_::operator void*; using MXNetPODValue_::operator ObjectRef; - using MXNetPODValue_::IsObjectRef; using MXNetPODValue_::AsObjectRef; + using MXNetPODValue_::IsObjectRef; // conversion operator. operator std::string() const { @@ -526,7 +515,9 @@ class MXNetArgValue : public MXNetPODValue_ { // None type if (type_code_ == kNull) { DLDataType t; - t.code = kHandle; t.bits = 0; t.lanes = 0; + t.code = kHandle; + t.bits = 0; + t.lanes = 0; return t; } MXNET_CHECK_TYPE_CODE(type_code_, kMXNetType); @@ -542,16 +533,14 @@ class MXNetArgValue : public MXNetPODValue_ { MXNET_CHECK_TYPE_CODE(type_code_, kNDArrayHandle); return reinterpret_cast<::mxnet::NDArray*>(value_.v_handle); } - template + template operator TypedPackedFunc() const { return TypedPackedFunc(operator PackedFunc()); } const MXNetValue& value() const { return value_; } - template::value>::type> + template ::value>::type> inline operator T() const; }; @@ -571,10 +560,9 @@ class MXNetRetValue : public MXNetPODValue_ { * \brief move constructor from anoter return value. * \param other The other return value. */ - MXNetRetValue(MXNetRetValue&& other) - : MXNetPODValue_(other.value_, other.type_code_) { + MXNetRetValue(MXNetRetValue&& other) : MXNetPODValue_(other.value_, other.type_code_) { other.value_.v_handle = nullptr; - other.type_code_ = kNull; + other.type_code_ = kNull; } /*! 
\brief destructor */ ~MXNetRetValue() { @@ -588,8 +576,8 @@ class MXNetRetValue : public MXNetPODValue_ { using MXNetPODValue_::operator bool; using MXNetPODValue_::operator void*; using MXNetPODValue_::operator ObjectRef; - using MXNetPODValue_::IsObjectRef; using MXNetPODValue_::AsObjectRef; + using MXNetPODValue_::IsObjectRef; MXNetRetValue(const MXNetRetValue& other) : MXNetPODValue_() { this->Assign(other); @@ -612,15 +600,15 @@ class MXNetRetValue : public MXNetPODValue_ { operator MXNetDataType() const { return MXNetDataType(operator DLDataType()); } - template + template operator TypedPackedFunc() const { return TypedPackedFunc(operator PackedFunc()); } // Assign operators MXNetRetValue& operator=(MXNetRetValue&& other) { this->Clear(); - value_ = other.value_; - type_code_ = other.type_code_; + value_ = other.value_; + type_code_ = other.type_code_; other.type_code_ = kNull; return *this; } @@ -676,12 +664,12 @@ class MXNetRetValue : public MXNetPODValue_ { } return operator=(std::move(other.data_)); } - template + template MXNetRetValue& operator=(ObjectPtr other) { SwitchToObject(kObjectHandle, std::move(other)); return *this; } - template + template MXNetRetValue& operator=(const TypedPackedFunc& f) { return operator=(f.packed()); } @@ -700,7 +688,7 @@ class MXNetRetValue : public MXNetPODValue_ { } MXNetRetValue& operator=(NDArrayHandle value) { this->SwitchToPOD(kNDArrayHandle); - NDArray* arr = new NDArray(value->value); + NDArray* arr = new NDArray(value->value); value_.v_handle = reinterpret_cast(arr); return *this; } @@ -709,12 +697,9 @@ class MXNetRetValue : public MXNetPODValue_ { value_.v_int64 = value.offset(); return *this; } - template::code != 0>::type> + template ::code != 0>::type> MXNetRetValue& operator=(const T& other) { - this->SwitchToClass( - extension_type_info::code, other); + this->SwitchToClass(extension_type_info::code, other); return *this; } /*! @@ -726,28 +711,25 @@ class MXNetRetValue : public MXNetPODValue_ { * \param ret_value The return value. * \param ret_type_code The return type code. */ - void MoveToCHost(MXNetValue* ret_value, - int* ret_type_code) { + void MoveToCHost(MXNetValue* ret_value, int* ret_type_code) { // cannot move str; need specially handle. CHECK(type_code_ != kStr && type_code_ != kBytes); - *ret_value = value_; + *ret_value = value_; *ret_type_code = type_code_; - type_code_ = kNull; + type_code_ = kNull; } /*! 
\return The value field, if the data is POD */ const MXNetValue& value() const { - CHECK(type_code_ != kObjectHandle && - type_code_ != kStr) << "MXNetRetValue.value can only be used for POD data"; + CHECK(type_code_ != kObjectHandle && type_code_ != kStr) + << "MXNetRetValue.value can only be used for POD data"; return value_; } // ObjectRef related extenstions: in tvm/packed_func_ext.h - template::value>::type> + template ::value>::type> inline operator T() const; private: - template + template void Assign(const T& other) { switch (other.type_code()) { case kStr: { @@ -780,11 +762,11 @@ class MXNetRetValue : public MXNetPODValue_ { type_code_ = type_code; } } - template + template void SwitchToClass(int type_code, T v) { if (type_code_ != type_code) { this->Clear(); - type_code_ = type_code; + type_code_ = type_code; value_.v_handle = new T(v); } else { *static_cast(value_.v_handle) = v; @@ -796,15 +778,18 @@ class MXNetRetValue : public MXNetPODValue_ { type_code_ = type_code; // move the handle out value_.v_handle = other.data_; - other.data_ = nullptr; + other.data_ = nullptr; } else { SwitchToPOD(kNull); } } void Clear() { - if (type_code_ == kNull) return; + if (type_code_ == kNull) + return; switch (type_code_) { - case kStr: delete ptr(); break; + case kStr: + delete ptr(); + break; case kObjectHandle: { static_cast(value_.v_handle)->DecRef(); break; @@ -821,24 +806,30 @@ inline DLDataType String2DLDataType(std::string s) { DLDataType t; // handle None type if (s.length() == 0) { - t.bits = 0; t.lanes = 0; t.code = kHandle; + t.bits = 0; + t.lanes = 0; + t.code = kHandle; return t; } - t.bits = 32; t.lanes = 1; + t.bits = 32; + t.lanes = 1; const char* scan = nullptr; if (s.substr(0, 3) == "int") { - t.code = kDLInt; scan = s.c_str() + 3; + t.code = kDLInt; + scan = s.c_str() + 3; } else if (s.substr(0, 4) == "uint") { - t.code = kDLUInt; scan = s.c_str() + 4; + t.code = kDLUInt; + scan = s.c_str() + 4; } else if (s.substr(0, 5) == "float") { - t.code = kDLFloat; scan = s.c_str() + 5; + t.code = kDLFloat; + scan = s.c_str() + 5; } else if (s.substr(0, 6) == "handle") { t.code = kHandle; t.bits = 64; // handle uses 64 bit by default. 
- scan = s.c_str() + 6; + scan = s.c_str() + 6; } else if (s == "bool") { - t.code = kDLUInt; - t.bits = 1; + t.code = kDLUInt; + t.bits = 1; t.lanes = 1; return t; } else if (s.substr(0, 6) == "custom") { @@ -850,7 +841,8 @@ inline DLDataType String2DLDataType(std::string s) { } char* xdelim; // emulate sscanf("%ux%u", bits, lanes) uint8_t bits = static_cast(strtoul(scan, &xdelim, 10)); - if (bits != 0) t.bits = bits; + if (bits != 0) + t.bits = bits; char* endpt = xdelim; if (*xdelim == 'x') { t.lanes = static_cast(strtoul(xdelim + 1, &endpt, 10)); @@ -862,17 +854,27 @@ inline DLDataType String2DLDataType(std::string s) { // implementation details inline const char* TypeCode2Str(int type_code) { switch (type_code) { - case kDLInt: return "int"; - case kDLUInt: return "uint"; - case kDLFloat: return "float"; - case kStr: return "str"; - case kBytes: return "bytes"; - case kHandle: return "handle"; - case kNull: return "NULL"; - case kObjectHandle: return "ObjectCell"; - case kNDArrayHandle: return "NDArray"; - default: LOG(FATAL) << "unknown type_code=" - << static_cast(type_code); return ""; + case kDLInt: + return "int"; + case kDLUInt: + return "uint"; + case kDLFloat: + return "float"; + case kStr: + return "str"; + case kBytes: + return "bytes"; + case kHandle: + return "handle"; + case kNull: + return "NULL"; + case kObjectHandle: + return "ObjectCell"; + case kNDArrayHandle: + return "NDArray"; + default: + LOG(FATAL) << "unknown type_code=" << static_cast(type_code); + return ""; } } @@ -940,7 +942,8 @@ inline int String2MXNetType(const std::string& s) { inline std::ostream& operator<<(std::ostream& os, DLDataType t) { // NOLINT(*) if (t.bits == 1 && t.lanes == 1 && t.code == kDLUInt) { - os << "bool"; return os; + os << "bool"; + return os; } if (t.code < kCustomBegin) { os << TypeCode2Str(t.code); @@ -948,7 +951,8 @@ inline std::ostream& operator<<(std::ostream& os, DLDataType t) { // NOLINT(*) LOG(FATAL) << "custom MXNetDataType is not supported"; // os << "custom[" << GetCustomTypeName(t.code) << "]"; } - if (t.code == kHandle) return os; + if (t.code == kHandle) + return os; os << static_cast(t.bits); if (t.lanes != 1) { os << 'x' << static_cast(t.lanes); @@ -956,15 +960,13 @@ inline std::ostream& operator<<(std::ostream& os, DLDataType t) { // NOLINT(*) return os; } -inline std::ostream& operator<<(std::ostream& os, const MXNetDataType& dtype) { // NOLINT(*) +inline std::ostream& operator<<(std::ostream& os, const MXNetDataType& dtype) { // NOLINT(*) return os << dtype.operator DLDataType(); } inline MXNetArgValue MXNetArgs::operator[](int i) const { - CHECK_LT(i, num_args) - << "not enough argument passed, " - << num_args << " passed" - << " but request arg[" << i << "]."; + CHECK_LT(i, num_args) << "not enough argument passed, " << num_args << " passed" + << " but request arg[" << i << "]."; return MXNetArgValue(values[i], type_codes[i]); } @@ -983,93 +985,87 @@ inline PackedFunc::FType PackedFunc::body() const { // internal namespace namespace detail { -template +template struct for_each_dispatcher { - template + template static void run(const F& f, T&& value, Args&&... args) { // NOLINT(*) f(I, std::forward(value)); - for_each_dispatcher - ::run(f, std::forward(args)...); + for_each_dispatcher::run(f, std::forward(args)...); } }; -template -struct for_each_dispatcher { +template +struct for_each_dispatcher { static void run(const F& f) {} // NOLINT(*) }; -template +template inline void for_each(const F& f, Args&&... 
args) { // NOLINT(*) - for_each_dispatcher - ::run(f, std::forward(args)...); + for_each_dispatcher::run(f, std::forward(args)...); } } // namespace detail /* \brief argument settter to PackedFunc */ class MXNetArgsSetter { public: - MXNetArgsSetter(MXNetValue* values, int* type_codes) - : values_(values), type_codes_(type_codes) {} + MXNetArgsSetter(MXNetValue* values, int* type_codes) : values_(values), type_codes_(type_codes) {} // setters for POD types - template::value>::type> + template ::value>::type> void operator()(size_t i, T value) const { values_[i].v_int64 = static_cast(value); - type_codes_[i] = kDLInt; + type_codes_[i] = kDLInt; } void operator()(size_t i, uint64_t value) const { values_[i].v_int64 = static_cast(value); - CHECK_LE(value, - static_cast(std::numeric_limits::max())); + CHECK_LE(value, static_cast(std::numeric_limits::max())); type_codes_[i] = kDLInt; } void operator()(size_t i, double value) const { values_[i].v_float64 = value; - type_codes_[i] = kDLFloat; + type_codes_[i] = kDLFloat; } void operator()(size_t i, std::nullptr_t value) const { values_[i].v_handle = value; - type_codes_[i] = kNull; + type_codes_[i] = kNull; } void operator()(size_t i, const MXNetArgValue& value) const { - values_[i] = value.value_; + values_[i] = value.value_; type_codes_[i] = value.type_code_; } void operator()(size_t i, void* value) const { values_[i].v_handle = value; - type_codes_[i] = kHandle; + type_codes_[i] = kHandle; } void operator()(size_t i, const char* value) const { values_[i].v_str = value; - type_codes_[i] = kStr; + type_codes_[i] = kStr; } // setters for container type // They must be reference(instead of const ref) // to make sure they are alive in the tuple(instead of getting converted) void operator()(size_t i, const std::string& value) const { // NOLINT(*) values_[i].v_str = value.c_str(); - type_codes_[i] = kStr; + type_codes_[i] = kStr; } void operator()(size_t i, DLDataType value) const { values_[i].v_type = value; - type_codes_[i] = kMXNetType; + type_codes_[i] = kMXNetType; } void operator()(size_t i, MXNetDataType dtype) const { operator()(i, dtype.operator DLDataType()); } void operator()(size_t i, const MXNetByteArray& value) const { // NOLINT(*) values_[i].v_handle = const_cast(&value); - type_codes_[i] = kBytes; + type_codes_[i] = kBytes; } - template + template void operator()(size_t i, const TypedPackedFunc& value) const { // NOLINT(*) operator()(i, value.packed()); } void operator()(size_t i, const ObjectRef& value) const { // NOLINT(*) if (value.defined()) { values_[i].v_handle = value.data_.data_; - type_codes_[i] = kObjectHandle; + type_codes_[i] = kObjectHandle; } else { type_codes_[i] = kNull; } @@ -1077,10 +1073,10 @@ class MXNetArgsSetter { void operator()(size_t i, const MXNetRetValue& value) const { // NOLINT(*) if (value.type_code() == kStr) { values_[i].v_str = value.ptr()->c_str(); - type_codes_[i] = kStr; + type_codes_[i] = kStr; } else { CHECK_NE(value.type_code(), kBytes) << "not handled."; - values_[i] = value.value_; + values_[i] = value.value_; type_codes_[i] = value.type_code(); } } @@ -1092,37 +1088,34 @@ class MXNetArgsSetter { int* type_codes_; }; -template -inline MXNetRetValue PackedFunc::operator()(Args&& ...args) const { - const int kNumArgs = sizeof...(Args); +template +inline MXNetRetValue PackedFunc::operator()(Args&&... args) const { + const int kNumArgs = sizeof...(Args); const int kArraySize = kNumArgs > 0 ? 
kNumArgs : 1; MXNetValue values[kArraySize]; int type_codes[kArraySize]; - detail::for_each(MXNetArgsSetter(values, type_codes), - std::forward(args)...); + detail::for_each(MXNetArgsSetter(values, type_codes), std::forward(args)...); MXNetRetValue rv; body_(MXNetArgs(values, type_codes, kNumArgs), &rv); return rv; } namespace detail { -template +template struct unpack_call_dispatcher { - template + template static void run(const F& f, const MXNetArgs& args_pack, MXNetRetValue* rv, Args&&... unpacked_args) { - unpack_call_dispatcher - ::run(f, args_pack, rv, - std::forward(unpacked_args)..., - args_pack[index]); + unpack_call_dispatcher::run( + f, args_pack, rv, std::forward(unpacked_args)..., args_pack[index]); } }; -template +template struct unpack_call_dispatcher { - template + template static void run(const F& f, const MXNetArgs& args_pack, MXNetRetValue* rv, @@ -1131,9 +1124,9 @@ struct unpack_call_dispatcher { } }; -template +template struct unpack_call_dispatcher { - template + template static void run(const F& f, const MXNetArgs& args_pack, MXNetRetValue* rv, @@ -1142,62 +1135,60 @@ struct unpack_call_dispatcher { } }; -template +template inline void unpack_call(const F& f, const MXNetArgs& args, MXNetRetValue* rv) { unpack_call_dispatcher::run(f, args, rv); } -template -inline R call_packed(const PackedFunc& pf, Args&& ...args) { +template +inline R call_packed(const PackedFunc& pf, Args&&... args) { return R(pf(std::forward(args)...)); } -template +template struct typed_packed_call_dispatcher { - template - static inline R run(const PackedFunc& pf, Args&& ...args) { + template + static inline R run(const PackedFunc& pf, Args&&... args) { return pf(std::forward(args)...); } }; -template<> +template <> struct typed_packed_call_dispatcher { - template - static inline void run(const PackedFunc& pf, Args&& ...args) { + template + static inline void run(const PackedFunc& pf, Args&&... args) { pf(std::forward(args)...); } }; } // namespace detail -template -TypedPackedFunc::TypedPackedFunc(PackedFunc packed) - : packed_(packed) {} +template +TypedPackedFunc::TypedPackedFunc(PackedFunc packed) : packed_(packed) {} -template +template TypedPackedFunc::TypedPackedFunc(const MXNetRetValue& value) : packed_(value.operator PackedFunc()) {} -template +template TypedPackedFunc::TypedPackedFunc(const MXNetArgValue& value) : packed_(value.operator PackedFunc()) {} -template -template +template +template inline void TypedPackedFunc::AssignTypedLambda(FType flambda) { packed_ = PackedFunc([flambda](const MXNetArgs& args, MXNetRetValue* rv) { - detail::unpack_call(flambda, args, rv); - }); + detail::unpack_call(flambda, args, rv); + }); } -template +template inline R TypedPackedFunc::operator()(Args... args) const { - return detail::typed_packed_call_dispatcher - ::run(packed_, std::forward(args)...); + return detail::typed_packed_call_dispatcher::run(packed_, std::forward(args)...); } // extension and node type handling namespace detail { -template +template struct MXNetValueCast { static T Apply(const TSrc* self) { static_assert(!is_ext && !is_nd, "The default case accepts only non-extensions"); @@ -1223,13 +1214,17 @@ struct PackedFuncValueConverter { * \param val The argument value. * \return the converted result. */ - static TObjectRef From(const MXNetArgValue& val) { return val.AsObjectRef(); } + static TObjectRef From(const MXNetArgValue& val) { + return val.AsObjectRef(); + } /*! * \brief Convert a TObjectRef from a return value. * \param val The argument value. 
* \return the converted result. */ - static TObjectRef From(const MXNetRetValue& val) { return val.AsObjectRef(); } + static TObjectRef From(const MXNetRetValue& val) { + return val.AsObjectRef(); + } }; template <> @@ -1283,8 +1278,8 @@ inline MXNetArgValue::operator T() const { template inline bool MXNetPODValue_::IsObjectRef() const { using ContainerType = typename TObjectRef::ContainerType; - return type_code_ == kObjectHandle && - ObjectTypeChecker::Check(static_cast(value_.v_handle)); + return type_code_ == kObjectHandle && + ObjectTypeChecker::Check(static_cast(value_.v_handle)); } inline bool String::CanConvertFrom(const MXNetArgValue& val) { diff --git a/include/mxnet/runtime/py_arg.h b/include/mxnet/runtime/py_arg.h index 81d1b30a573e..fa8b1adb9ac0 100644 --- a/include/mxnet/runtime/py_arg.h +++ b/include/mxnet/runtime/py_arg.h @@ -28,10 +28,11 @@ namespace runtime { class PythonArg { public: - explicit PythonArg(int offset): offset_(offset) {} + explicit PythonArg(int offset) : offset_(offset) {} int offset() const { return offset_; } + private: int offset_; }; diff --git a/include/mxnet/runtime/registry.h b/include/mxnet/runtime/registry.h index 70782b47254d..a59dc24ba208 100644 --- a/include/mxnet/runtime/registry.h +++ b/include/mxnet/runtime/registry.h @@ -80,7 +80,7 @@ class Registry { * \tparam FType the signature of the function. * \tparam FLambda The type of f. */ - template + template Registry& set_body_typed(FLambda f) { return set_body(TypedPackedFunc(f).packed()); } @@ -89,7 +89,8 @@ class Registry { * \brief set the body of the function to the given function pointer. * Note that this doesn't work with lambdas, you need to * explicitly give a type for those. - * Note that this will ignore default arg values and always require all arguments to be provided. + * Note that this will ignore default arg values and always require all arguments to be + * provided. * * \code * @@ -106,14 +107,15 @@ class Registry { * \tparam R the return type of the function (inferred). * \tparam Args the argument types of the function (inferred). */ - template + template Registry& set_body_typed(R (*f)(Args...)) { return set_body(TypedPackedFunc(f)); } /*! * \brief set the body of the function to be the passed method pointer. - * Note that this will ignore default arg values and always require all arguments to be provided. + * Note that this will ignore default arg values and always require all arguments to be + * provided. * * \code * @@ -131,7 +133,7 @@ class Registry { * \tparam R the return type of the function (inferred). * \tparam Args the argument types of the function (inferred). */ - template + template Registry& set_body_method(R (T::*f)(Args...)) { return set_body_typed([f](T target, Args... params) -> R { // call method pointer @@ -141,7 +143,8 @@ class Registry { /*! * \brief set the body of the function to be the passed method pointer. - * Note that this will ignore default arg values and always require all arguments to be provided. + * Note that this will ignore default arg values and always require all arguments to be + * provided. * * \code * @@ -159,7 +162,7 @@ class Registry { * \tparam R the return type of the function (inferred). * \tparam Args the argument types of the function (inferred). */ - template + template Registry& set_body_method(R (T::*f)(Args...) const) { return set_body_typed([f](const T target, Args... params) -> R { // call method pointer @@ -170,7 +173,8 @@ class Registry { /*! * \brief set the body of the function to be the passed method pointer. 
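 * For instance (an illustrative sketch; `MyNode`/`MyRef` are hypothetical
 * Object/ObjectRef types, not part of this patch):
 *
 * \code
 *
 * // expose MyNode::Value() through the MyRef handle type
 * MXNET_REGISTER_GLOBAL("MyNode_Value")
 * .set_body_method<MyRef>(&MyNode::Value);
 *
 * \endcode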
* Used when calling a method on a Node subclass through a ObjectRef subclass. - * Note that this will ignore default arg values and always require all arguments to be provided. + * Note that this will ignore default arg values and always require all arguments to be + * provided. * * \code * @@ -197,8 +201,11 @@ class Registry { * \tparam R the return type of the function (inferred). * \tparam Args the argument types of the function (inferred). */ - template::value>::type> + template ::value>::type> Registry& set_body_method(R (TNode::*f)(Args...)) { return set_body_typed([f](TObjectRef ref, Args... params) { TNode* target = ref.operator->(); @@ -210,7 +217,8 @@ class Registry { /*! * \brief set the body of the function to be the passed method pointer. * Used when calling a method on a Node subclass through a ObjectRef subclass. - * Note that this will ignore default arg values and always require all arguments to be provided. + * Note that this will ignore default arg values and always require all arguments to be + * provided. * * \code * @@ -237,8 +245,11 @@ class Registry { * \tparam R the return type of the function (inferred). * \tparam Args the argument types of the function (inferred). */ - template::value>::type> + template ::value>::type> Registry& set_body_method(R (TNode::*f)(Args...) const) { return set_body_typed([f](TObjectRef ref, Args... params) { const TNode* target = ref.operator->(); @@ -292,10 +303,10 @@ class Registry { #endif #define MXNET_STR_CONCAT_(__x, __y) __x##__y -#define MXNET_STR_CONCAT(__x, __y) MXNET_STR_CONCAT_(__x, __y) +#define MXNET_STR_CONCAT(__x, __y) MXNET_STR_CONCAT_(__x, __y) -#define MXNET_FUNC_REG_VAR_DEF \ - static MXNET_ATTRIBUTE_UNUSED ::mxnet::runtime::Registry& __mk_ ## MXNET +#define MXNET_FUNC_REG_VAR_DEF \ + static MXNET_ATTRIBUTE_UNUSED ::mxnet::runtime::Registry& __mk_##MXNET /*! * \brief Register a function globally. @@ -305,8 +316,8 @@ class Registry { * }); * \endcode */ -#define MXNET_REGISTER_GLOBAL(OpName) \ - MXNET_STR_CONCAT(MXNET_FUNC_REG_VAR_DEF, __COUNTER__) = \ +#define MXNET_REGISTER_GLOBAL(OpName) \ + MXNET_STR_CONCAT(MXNET_FUNC_REG_VAR_DEF, __COUNTER__) = \ ::mxnet::runtime::Registry::Register(OpName) } // namespace runtime diff --git a/include/mxnet/storage.h b/include/mxnet/storage.h index 1cb35270f026..0d4964bfded9 100644 --- a/include/mxnet/storage.h +++ b/include/mxnet/storage.h @@ -31,8 +31,8 @@ namespace mxnet { -#define MXNET_STORAGE_DEFAULT_PROFILER_SCOPE_CSTR ":" -#define MXNET_STORAGE_DEFAULT_NAME_CSTR "unknown" +#define MXNET_STORAGE_DEFAULT_PROFILER_SCOPE_CSTR ":" +#define MXNET_STORAGE_DEFAULT_NAME_CSTR "unknown" /*! * \brief Storage manager across multiple devices. @@ -70,7 +70,7 @@ class Storage { * \brief Id for IPC shared memory */ int shared_pid{-1}; - int shared_id {-1}; + int shared_id{-1}; /*! * \brief Attributes for tracking storage allocations. */ @@ -92,7 +92,7 @@ class Storage { Handle Alloc(size_t size, Context ctx, bool failsafe = false) { Handle hd; hd.size = size; - hd.ctx = ctx; + hd.ctx = ctx; this->Alloc(&hd, failsafe); return hd; } @@ -122,12 +122,12 @@ class Storage { */ virtual void DirectFree(Handle handle) = 0; /*! - * \brief Release all memory from device if using a pooled storage manager - * - * This release all memory from pool storage managers such as - * GPUPooledStorageManager and GPUPooledRoundedStorageManager. - * For non-pool memory managers this has no effect. 
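 * A hedged usage sketch (illustrative only, not part of this patch):
 *
 * \code
 *
 * Storage* storage = Storage::Get();      // process-wide storage manager
 * storage->ReleaseAll(Context::GPU(0));   // drop pooled buffers on GPU 0
 *
 * \endcode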
- */ + * \brief Release all memory from device if using a pooled storage manager + * + * This release all memory from pool storage managers such as + * GPUPooledStorageManager and GPUPooledRoundedStorageManager. + * For non-pool memory managers this has no effect. + */ virtual void ReleaseAll(Context ctx) = 0; /*! * \brief Destructor. diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h old mode 100755 new mode 100644 index 8fdc3cd6e2ac..479b3cf3a260 --- a/include/mxnet/tensor_blob.h +++ b/include/mxnet/tensor_blob.h @@ -64,18 +64,17 @@ class NDArray; */ class TBlob { friend class NDArray; + public: /*! \brief pointer to the data */ - void *dptr_; + void* dptr_; /*! \brief shape of the tensor */ mxnet::TShape shape_; /*! \brief type flag of the tensor blob */ int type_flag_; /*! \brief default constructor, default copy assign will work */ - TBlob(void) - : dptr_(nullptr), - type_flag_(mshadow::DataType::kFlag) { + TBlob(void) : dptr_(nullptr), type_flag_(mshadow::DataType::kFlag) { SetDLTensor(cpu::kDevMask, 0); } /*! @@ -85,10 +84,9 @@ class TBlob { * \param dev_mask the device mask, can be cpu::kDevMask or gpu::kDevMask * \param dev_id the device id */ - template - TBlob(DType *dptr, const mxnet::TShape &shape, int dev_mask, int dev_id = -1) - : dptr_(dptr), shape_(shape), - type_flag_(mshadow::DataType::kFlag) { + template + TBlob(DType* dptr, const mxnet::TShape& shape, int dev_mask, int dev_id = -1) + : dptr_(dptr), shape_(shape), type_flag_(mshadow::DataType::kFlag) { SetDLTensor(dev_mask, dev_id); } /*! @@ -99,7 +97,7 @@ class TBlob { * \param type_flag the type flag. Can be one of enum mshadow::dtype * \param dev_id the device id */ - TBlob(void *dptr, const mxnet::TShape &shape, int dev_mask, int type_flag, int dev_id = -1) + TBlob(void* dptr, const mxnet::TShape& shape, int dev_mask, int type_flag, int dev_id = -1) : dptr_(dptr), shape_(shape), type_flag_(type_flag) { SetDLTensor(dev_mask, dev_id); } @@ -107,7 +105,7 @@ class TBlob { * \brief constructor that construct TBlob from DLTensor * \param DLTensor Object */ - explicit TBlob(const DLTensor &dltensor) + explicit TBlob(const DLTensor& dltensor) : dptr_(dltensor.data), shape_(mxnet::TShape(dltensor.shape, dltensor.shape + dltensor.ndim)), type_flag_(DLDataTypeTransform(dltensor.dtype)), @@ -115,9 +113,9 @@ class TBlob { // compactness check for DLTensor if (dltensor.strides != nullptr) { // check strides - const int &ndim = dltensor.ndim; - const int64_t *shape = dltensor.shape; - const int64_t *strides = dltensor.strides; + const int& ndim = dltensor.ndim; + const int64_t* shape = dltensor.shape; + const int64_t* strides = dltensor.strides; if (ndim >= 1) { bool err = false; if (strides[ndim - 1] != 1) { @@ -143,15 +141,15 @@ class TBlob { * \tparam dim tensor dimension * \tparam DType the type of elements in the tensor */ - template - TBlob(const mshadow::Tensor &src) { // NOLINT(*) + template + TBlob(const mshadow::Tensor& src) { // NOLINT(*) *this = src; } /*! * \brief constructor from TBlob (copy constructor) * \param src source TBlob */ - TBlob(const TBlob &src): dptr_(src.dptr_), shape_(src.shape_), type_flag_(src.type_flag_) { + TBlob(const TBlob& src) : dptr_(src.dptr_), shape_(src.shape_), type_flag_(src.type_flag_) { this->SetDLTensor(src.dev_mask(), src.dev_id()); } /*! 
@@ -162,10 +160,10 @@ class TBlob { * \tparam DType the type of elements in the tensor * \return reference of self */ - template - inline TBlob &operator=(const mshadow::Tensor &src) { - dptr_ = src.dptr_; - shape_ = src.shape_; + template + inline TBlob& operator=(const mshadow::Tensor& src) { + dptr_ = src.dptr_; + shape_ = src.shape_; type_flag_ = mshadow::DataType::kFlag; SetDLTensor(Device::kDevMask, -1); return *this; @@ -175,9 +173,9 @@ class TBlob { * \param src source TBlob * \return reference of self */ - inline TBlob &operator=(const TBlob &src) { - dptr_ = src.dptr_; - shape_ = src.shape_; + inline TBlob& operator=(const TBlob& src) { + dptr_ = src.dptr_; + shape_ = src.shape_; type_flag_ = src.type_flag_; SetDLTensor(src.dev_mask(), src.dev_id()); return *this; @@ -194,8 +192,8 @@ class TBlob { * \return reshaped blob */ inline TBlob reshape(const mxnet::TShape& shape) const { - CHECK_EQ(this->shape_.Size(), shape.Size()) << "Shape size mismatch " - << this->shape_.Size() << " v.s. " << shape.Size(); + CHECK_EQ(this->shape_.Size(), shape.Size()) + << "Shape size mismatch " << this->shape_.Size() << " v.s. " << shape.Size(); TBlob ret(this->dptr_, shape, this->dev_mask(), this->type_flag_, this->dev_id()); return ret; } @@ -206,18 +204,16 @@ class TBlob { * \tparam DType the type of elements in the tensor * \return tensor after flatten */ - template + template inline mshadow::Tensor FlatTo2D( - mshadow::Stream *stream = nullptr) const { + mshadow::Stream* stream = nullptr) const { CHECK(Device::kDevMask == this->dev_mask()) - << "TBlob.get: device type do not match specified type"; + << "TBlob.get: device type do not match specified type"; CHECK(mshadow::DataType::kFlag == type_flag_) - << "TBlob.get_with_shape: data type do not match specified type." - << "Expected: " << mshadow::dtype_string(type_flag_) - << " v.s. given " << mshadow::dtype_string(mshadow::DataType::kFlag); - return mshadow::Tensor(static_cast(dptr_), - shape_.FlatTo2D(), - stream); + << "TBlob.get_with_shape: data type do not match specified type." + << "Expected: " << mshadow::dtype_string(type_flag_) << " v.s. given " + << mshadow::dtype_string(mshadow::DataType::kFlag); + return mshadow::Tensor(static_cast(dptr_), shape_.FlatTo2D(), stream); } /*! * \brief flatten the tensor to 1 dimension, collapse all the dimensions together. @@ -226,11 +222,10 @@ class TBlob { * \tparam DType the type of elements in the tensor * \return tensor after flatten */ - template + template inline mshadow::Tensor FlatTo1D( - mshadow::Stream *stream = nullptr) const { - return this->get_with_shape( - mshadow::Shape1(shape_.Size()), stream); + mshadow::Stream* stream = nullptr) const { + return this->get_with_shape(mshadow::Shape1(shape_.Size()), stream); } /*! \brief return number of dimension of the tensor inside */ inline int ndim(void) const { @@ -250,12 +245,12 @@ class TBlob { return shape_.Size(); } /*! \brief get pointer in dtype */ - template + template inline DType* dptr() const { CHECK(mshadow::DataType::kFlag == type_flag_) - << "TBlob.get_with_shape: data type do not match specified type." - << "Expected: " << mshadow::dtype_string(type_flag_) - << " v.s. given " << mshadow::dtype_string(mshadow::DataType::kFlag); + << "TBlob.get_with_shape: data type do not match specified type." + << "Expected: " << mshadow::dtype_string(type_flag_) << " v.s. given " + << mshadow::dtype_string(mshadow::DataType::kFlag); return static_cast(dptr_); } /*! 
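 * (Note: the dptr<DType>() accessor above is type-checked at runtime; asking
 * for a DType whose mshadow flag differs from type_flag_ trips the mismatch
 * CHECK instead of silently returning a reinterpreted pointer.)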
\brief device mask of the corresponding device */ @@ -283,12 +278,12 @@ class TBlob { * \tparam dim dimension of the tensor * \tparam DType the type of elements in the tensor */ - template - inline mshadow::Tensor get(mshadow::Stream *stream = nullptr) const { + template + inline mshadow::Tensor get(mshadow::Stream* stream = nullptr) const { CHECK(Device::kDevMask == this->dev_mask()) - << "TBlob.get: device type do not match specified type"; - return mshadow::Tensor(dptr(), - shape_.get(), shape_[shape_.ndim() - 1], stream); + << "TBlob.get: device type do not match specified type"; + return mshadow::Tensor( + dptr(), shape_.get(), shape_[shape_.ndim() - 1], stream); } /*! * \brief fetch a tensor in given shape @@ -300,17 +295,16 @@ class TBlob { * \tparam dim dimension of the tensor * \tparam DType the type of elements in the tensor */ - template + template inline mshadow::Tensor get_with_shape( - const mshadow::Shape &shape, - mshadow::Stream *stream = nullptr) const { + const mshadow::Shape& shape, + mshadow::Stream* stream = nullptr) const { CHECK(Device::kDevMask == this->dev_mask()) - << "TBlob.get: device type do not match specified type"; + << "TBlob.get: device type do not match specified type"; CHECK_EQ(this->CheckContiguous(), true) << "TBlob.get_reshape: must be contiguous"; CHECK_EQ(this->shape_.Size(), static_cast(shape.Size())) - << "TBlob.get_with_shape: new and old shape do not match total elements"; - return mshadow::Tensor(dptr(), shape, - shape[dim - 1], stream); + << "TBlob.get_with_shape: new and old shape do not match total elements"; + return mshadow::Tensor(dptr(), shape, shape[dim - 1], stream); } /*! * \brief flatten the tensor to 3 dimension, @@ -321,11 +315,11 @@ class TBlob { * \tparam DType the type of elements in the tensor * \return tensor after flatten */ - template + template inline mshadow::Tensor FlatTo3D( - int axis, mshadow::Stream *stream = nullptr) const { - return this->get_with_shape( - this->shape_.FlatTo3D(axis), stream); + int axis, + mshadow::Stream* stream = nullptr) const { + return this->get_with_shape(this->shape_.FlatTo3D(axis), stream); } /*! * \brief flatten the tensor to 3 dimension, @@ -337,12 +331,11 @@ class TBlob { * \tparam DType the type of elements in the tensor * \return tensor after flatten */ - template - inline mshadow::Tensor FlatTo3D( - int axis_begin, int axis_end, - mshadow::Stream *stream = nullptr) const { - return this->get_with_shape( - this->shape_.FlatTo3D(axis_begin, axis_end), stream); + template + inline mshadow::Tensor + FlatTo3D(int axis_begin, int axis_end, mshadow::Stream* stream = nullptr) const { + return this->get_with_shape(this->shape_.FlatTo3D(axis_begin, axis_end), + stream); } /*! 
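 * (A usage sketch for the accessors above, assuming a contiguous float32
 * CPU blob; the brace-initialized TShape is illustrative, not part of this
 * patch:
 *
 * \code
 *
 * float buf[6] = {0, 1, 2, 3, 4, 5};
 * TBlob blob(buf, mxnet::TShape({2, 3}), mshadow::cpu::kDevMask);
 * auto t2 = blob.get<mshadow::cpu, 2, float>();  // checked 2x3 view
 * auto t1 = blob.get_with_shape<mshadow::cpu, 1, float>(mshadow::Shape1(6));
 *
 * \endcode
 * )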
* \brief flatten the tensor to specified number of dimensions, @@ -353,9 +346,9 @@ class TBlob { * \tparam DType the type of elements in the tensor * \return tensor after flatten */ - template + template inline mshadow::Tensor FlatToKD( - mshadow::Stream *stream = nullptr) const { + mshadow::Stream* stream = nullptr) const { mshadow::Shape shape; shape[0] = 1; // Pad higher dimensions in case dim > ndim() @@ -376,19 +369,32 @@ class TBlob { private: static DLDataType DTypeTransform(int type_flag) { switch (type_flag) { - case mshadow::kFloat32: return DLDataType{kDLFloat, 32, 1}; - case mshadow::kFloat64: return DLDataType{kDLFloat, 64, 1}; - case mshadow::kFloat16: return DLDataType{kDLFloat, 16, 1}; - case mshadow::kBfloat16: return DLDataType{kDLBfloat, 16, 1}; - case mshadow::kUint8: return DLDataType{kDLUInt, 8, 1}; - case mshadow::kInt32: return DLDataType{kDLInt, 32, 1}; - case mshadow::kInt8: return DLDataType{kDLInt, 8, 1}; - case mshadow::kInt64: return DLDataType{kDLInt, 64, 1}; - case mshadow::kBool: return DLDataType{kDLUInt, 1, 1}; - case mshadow::kInt16: return DLDataType{kDLInt, 16, 1}; - case mshadow::kUint16: return DLDataType{kDLUInt, 16, 1}; - case mshadow::kUint32: return DLDataType{kDLUInt, 32, 1}; - case mshadow::kUint64: return DLDataType{kDLUInt, 64, 1}; + case mshadow::kFloat32: + return DLDataType{kDLFloat, 32, 1}; + case mshadow::kFloat64: + return DLDataType{kDLFloat, 64, 1}; + case mshadow::kFloat16: + return DLDataType{kDLFloat, 16, 1}; + case mshadow::kBfloat16: + return DLDataType{kDLBfloat, 16, 1}; + case mshadow::kUint8: + return DLDataType{kDLUInt, 8, 1}; + case mshadow::kInt32: + return DLDataType{kDLInt, 32, 1}; + case mshadow::kInt8: + return DLDataType{kDLInt, 8, 1}; + case mshadow::kInt64: + return DLDataType{kDLInt, 64, 1}; + case mshadow::kBool: + return DLDataType{kDLUInt, 1, 1}; + case mshadow::kInt16: + return DLDataType{kDLInt, 16, 1}; + case mshadow::kUint16: + return DLDataType{kDLUInt, 16, 1}; + case mshadow::kUint32: + return DLDataType{kDLUInt, 32, 1}; + case mshadow::kUint64: + return DLDataType{kDLUInt, 64, 1}; default: { LOG(FATAL) << "Unknown type_flag=" << type_flag; return DLDataType(); @@ -402,47 +408,59 @@ class TBlob { switch (dldata_type.code) { case kDLFloat: switch (dldata_type.bits) { - case 16: return mshadow::kFloat16; - case 32: return mshadow::kFloat32; - case 64: return mshadow::kFloat64; + case 16: + return mshadow::kFloat16; + case 32: + return mshadow::kFloat32; + case 64: + return mshadow::kFloat64; } break; case kDLBfloat: switch (dldata_type.bits) { - case 16: return mshadow::kBfloat16; + case 16: + return mshadow::kBfloat16; } break; case kDLUInt: switch (dldata_type.bits) { - case 1: return mshadow::kBool; - case 8: return mshadow::kUint8; - case 16: return mshadow::kUint16; - case 32: return mshadow::kUint32; - case 64: return mshadow::kUint64; + case 1: + return mshadow::kBool; + case 8: + return mshadow::kUint8; + case 16: + return mshadow::kUint16; + case 32: + return mshadow::kUint32; + case 64: + return mshadow::kUint64; } break; case kDLInt: switch (dldata_type.bits) { - case 8: return mshadow::kInt8; - case 16: return mshadow::kInt16; - case 32: return mshadow::kInt32; - case 64: return mshadow::kInt64; + case 8: + return mshadow::kInt8; + case 16: + return mshadow::kInt16; + case 32: + return mshadow::kInt32; + case 64: + return mshadow::kInt64; } break; } - LOG(FATAL) << "Unknown DLDataType{" << dldata_type.code - << ", " << dldata_type.bits - << ", " << dldata_type.lanes << "}"; + LOG(FATAL) << 
"Unknown DLDataType{" << dldata_type.code << ", " << dldata_type.bits << ", " + << dldata_type.lanes << "}"; return mshadow::kFloat32; } inline void SetDLTensor(int dev_mask, int dev_id) { - dltensor_.data = dptr_; - dltensor_.ctx = DLContext{static_cast(dev_mask), dev_id}; - dltensor_.ndim = shape_.ndim(); - dltensor_.dtype = DTypeTransform(type_flag_); - dltensor_.shape = shape_.data(); - dltensor_.strides = nullptr; + dltensor_.data = dptr_; + dltensor_.ctx = DLContext{static_cast(dev_mask), dev_id}; + dltensor_.ndim = shape_.ndim(); + dltensor_.dtype = DTypeTransform(type_flag_); + dltensor_.shape = shape_.data(); + dltensor_.strides = nullptr; dltensor_.byte_offset = 0; } @@ -462,22 +480,21 @@ DMLC_DECLARE_TYPE_NAME(nnvm::Tuple>, "Shape(tuple)"); namespace parameter { -template<> -class FieldEntry - : public FieldEntryBase, mxnet::TShape> { +template <> +class FieldEntry : public FieldEntryBase, mxnet::TShape> { public: FieldEntry() : enforce_nonzero_(false), expect_ndim_(0) {} // parent class typedef FieldEntryBase, mxnet::TShape> Parent; - virtual void Check(void *head) const { + virtual void Check(void* head) const { Parent::Check(head); - mxnet::TShape &v = this->Get(head); + mxnet::TShape& v = this->Get(head); if (expect_ndim_ != 0 && v.ndim() != expect_ndim_) { std::ostringstream os; - os << "value " << v << "for Parameter " << this->key_ - << " has wrong dimensions, expected dimension=" << expect_ndim_; - throw dmlc::ParamError(os.str()); + os << "value " << v << "for Parameter " << this->key_ + << " has wrong dimensions, expected dimension=" << expect_ndim_; + throw dmlc::ParamError(os.str()); } if (enforce_nonzero_) { for (int i = 0; i < v.ndim(); ++i) { @@ -490,11 +507,11 @@ class FieldEntry } } } - inline FieldEntry &enforce_nonzero() { + inline FieldEntry& enforce_nonzero() { this->enforce_nonzero_ = true; return this->self(); } - inline FieldEntry &set_expect_ndim(int ndim) { + inline FieldEntry& set_expect_ndim(int ndim) { expect_ndim_ = ndim; return this->self(); } diff --git a/include/mxnet/tuple.h b/include/mxnet/tuple.h index 798622b6ee2a..9fe30c0967a0 100644 --- a/include/mxnet/tuple.h +++ b/include/mxnet/tuple.h @@ -53,14 +53,14 @@ namespace mxnet { * \tparam ValueType The type of data stored inside tuple. * \sa TShape */ -template +template class Tuple { public: /*! \brief default constructor */ Tuple() = default; /*! \brief destructor */ inline ~Tuple() { - delete [] data_heap_; + delete[] data_heap_; } /*! * constructor to construct a tuple with all `value`. @@ -103,7 +103,7 @@ class Tuple { * \param src the source shape */ - inline Tuple(Tuple&& src) { // NOLINT(runtime/explicit) + inline Tuple(Tuple&& src) { // NOLINT(runtime/explicit) this->swap(src); } /*! 
@@ -112,9 +112,8 @@ class Tuple { * \param end end the end of the iterator * \tparam RandomAccessIterator iterator type */ - template - inline Tuple(RandomAccessIterator begin, - RandomAccessIterator end) { + template + inline Tuple(RandomAccessIterator begin, RandomAccessIterator end) { this->assign(begin, end); } @@ -133,9 +132,8 @@ class Tuple { * \param end end the end of the iterator * \tparam RandomAccessIterator iterator type */ - template - inline void assign(RandomAccessIterator begin, - RandomAccessIterator end) { + template + inline void assign(RandomAccessIterator begin, RandomAccessIterator end) { this->SetDim(end - begin); CHECK_GE(ndim(), 0); std::copy(begin, end, this->begin()); @@ -177,7 +175,7 @@ class Tuple { * \param init the source initializer list * \return reference of self */ - inline Tuple &operator=(std::initializer_list init) { + inline Tuple& operator=(std::initializer_list init) { this->assign(init.begin(), init.end()); return *this; } @@ -185,33 +183,35 @@ class Tuple { * \return whether two tuple equals * \param s the tuple to compare against */ - inline bool operator==(const Tuple &s) const { - if (ndim_ != s.ndim_) return false; - if (ndim() == -1) return true; + inline bool operator==(const Tuple& s) const { + if (ndim_ != s.ndim_) + return false; + if (ndim() == -1) + return true; return std::equal(begin(), end(), s.begin()); } /*! * \return whether two tuple not equal * \param s the tuple to compare against */ - inline bool operator!=(const Tuple &s) const { + inline bool operator!=(const Tuple& s) const { return !(*this == s); } /*! \return the begin data pointer to content of the tuple */ - inline const ValueType *begin() const { + inline const ValueType* begin() const { return ndim_ <= kStackCache ? data_stack_ : data_heap_; } /*! \return the begin data pointer to content of the tuple */ - inline ValueType *begin() { + inline ValueType* begin() { return ndim_ <= kStackCache ? data_stack_ : data_heap_; } /*! \return the data pointer to end of the tuple */ inline const ValueType* end() const { - return ndim_ <= kStackCache ? (data_stack_ + ndim_): (data_heap_ + ndim_); + return ndim_ <= kStackCache ? (data_stack_ + ndim_) : (data_heap_ + ndim_); } /*! \return the data pointer to end the tuple */ inline ValueType* end() { - return ndim_ <= kStackCache ? (data_stack_ + ndim_): (data_heap_ + ndim_); + return ndim_ <= kStackCache ? (data_stack_ + ndim_) : (data_heap_ + ndim_); } /*! \return number of dimension of the tuple */ inline int ndim() const { @@ -223,12 +223,12 @@ class Tuple { * \return the corresponding dimension size */ inline ValueType& operator[](int i) { - // it fixes the false alarm of assuming signed overflow does not occur - // when assuming that (X - c) > X is always false [-Werror=strict-overflow] - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wstrict-overflow" +// it fixes the false alarm of assuming signed overflow does not occur +// when assuming that (X - c) > X is always false [-Werror=strict-overflow] +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-overflow" CHECK(i >= 0 && i < ndim()) << "index = " << i << " must be in range [0, " << ndim() << ")"; - #pragma GCC diagnostic pop +#pragma GCC diagnostic pop return begin()[i]; } /*! 
@@ -237,12 +237,12 @@ class Tuple { * \return the corresponding dimension size */ inline const ValueType& operator[](int i) const { - // it fixes the false alarm of assuming signed overflow does not occur - // when assuming that (X - c) > X is always false [-Werror=strict-overflow] - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wstrict-overflow" +// it fixes the false alarm of assuming signed overflow does not occur +// when assuming that (X - c) > X is always false [-Werror=strict-overflow] +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-overflow" CHECK(i >= 0 && i < ndim()) << "index = " << i << " must be in range [0, " << ndim() << ")"; - #pragma GCC diagnostic pop +#pragma GCC diagnostic pop return begin()[i]; } /*! @@ -268,7 +268,7 @@ class Tuple { * \param t the tuple * \return the ostream */ - friend std::ostream &operator<<(std::ostream &os, const Tuple &t) { + friend std::ostream& operator<<(std::ostream& os, const Tuple& t) { if (t.ndim() == -1) { // If t is an unknown shape, return string "None". // This is consistent with returning unknown shape in Python and generating @@ -278,9 +278,10 @@ class Tuple { } os << '['; const ValueType* begin = t.begin(); - const ValueType* end = t.end(); + const ValueType* end = t.end(); for (const ValueType* it = begin; it != end; ++it) { - if (it != begin) os << ','; + if (it != begin) + os << ','; os << *it; } os << ']'; @@ -292,7 +293,7 @@ class Tuple { * \param t The tuple * \return the istream */ - friend std::istream &operator>>(std::istream &is, Tuple &t) { + friend std::istream& operator>>(std::istream& is, Tuple& t) { // get ( while (true) { char ch = is.peek(); @@ -304,7 +305,8 @@ class Tuple { return is; } is.get(); - if (ch == '(' || ch == '[') break; + if (ch == '(' || ch == '[') + break; if (!isspace(ch)) { if (ch == 'N') { std::string tmp_val; @@ -344,14 +346,17 @@ class Tuple { while (true) { ch = is.peek(); if (isspace(ch)) { - is.get(); continue; + is.get(); + continue; } if (ch == ')' || ch == ']') { - is.get(); break; + is.get(); + break; } break; } - if (ch == ')' || ch == ']') break; + if (ch == ')' || ch == ']') + break; } else if (ch == ')' || ch == ']') { break; } else { @@ -368,8 +373,8 @@ class Tuple { * \tparam DType data type that save to * \tparam TStream any stream type that have write */ - template - inline void Save(TStream *strm) const; + template + inline void Save(TStream* strm) const; /*! * \brief load the content from binary stream * \param strm the output stream @@ -377,8 +382,8 @@ class Tuple { * \tparam TStream any stream type that have write * \return whether the load is successful */ - template - inline bool Load(TStream *strm); + template + inline bool Load(TStream* strm); protected: // stack cache size @@ -394,21 +399,19 @@ class Tuple { // internal function to change the dimension inline void SetDim(int ndim) { CHECK_GE(ndim, -1) << "ndim cannot be less than -1, received " << ndim; - if (ndim > kStackCache && - ndim > num_heap_allocated_) { - delete [] data_heap_; - data_heap_ = new ValueType[ndim]; + if (ndim > kStackCache && ndim > num_heap_allocated_) { + delete[] data_heap_; + data_heap_ = new ValueType[ndim]; num_heap_allocated_ = ndim; } else if (ndim <= 0 && data_heap_ != nullptr) { - delete [] data_heap_; - data_heap_ = nullptr; + delete[] data_heap_; + data_heap_ = nullptr; num_heap_allocated_ = 0; } ndim_ = ndim; } }; - /*! brief check if a shape's ndim is known. 
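 * For example, ndim_is_known(-1) is false, while ndim_is_known(0) and
 * ndim_is_known(3) both hold; an ndim of -1 marks an unknown shape.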
*/ inline bool ndim_is_known(const int ndim) { CHECK_GE(ndim, -1) << "shape ndim must be >= -1, while received " << ndim; @@ -455,7 +458,7 @@ class TShape : public Tuple { * \brief copy constructor of TShape * \param s source shape. */ - inline TShape(const Tuple& s) { // NOLINT(*) + inline TShape(const Tuple& s) { // NOLINT(*) if (s.ndim() == -1) { this->SetDim(-1); } else { @@ -484,16 +487,16 @@ class TShape : public Tuple { * \param end end the end of the iterator * \tparam RandomAccessIterator iterator type */ - template::iterator_category, - std::random_access_iterator_tag>::value, int>::type = 0> - inline TShape(RandomAccessIterator begin, - RandomAccessIterator end) { + template ::iterator_category, + std::random_access_iterator_tag>::value, + int>::type = 0> + inline TShape(RandomAccessIterator begin, RandomAccessIterator end) { this->assign(begin, end); } - inline explicit TShape(const ObjectRef& src): Tuple(src) {} + inline explicit TShape(const ObjectRef& src) : Tuple(src) {} /*! * \brief assignment function from tshape * \param src source shape. @@ -513,14 +516,14 @@ class TShape : public Tuple { * \return self. */ inline TShape& operator=(Tuple&& src) { // NOLINT(*) - TShape(std::move(src)).swap(*this); // NOLINT(*) + TShape(std::move(src)).swap(*this); // NOLINT(*) return *this; } /*! \return total number of elements in the shape */ inline size_t Size() const { CHECK(ndim_is_known(this->ndim())) << "Shape is unknown."; - dim_t size = 1; - const dim_t* start = begin(), *fin = end(); + dim_t size = 1; + const dim_t *start = begin(), *fin = end(); for (const dim_t* it = start; it != fin; ++it) { CHECK(dim_size_is_known(*it)) << "Shape dim size cannot be a negative value " << *it; size *= *it; @@ -535,10 +538,10 @@ class TShape : public Tuple { inline size_t ProdShape(int dimstart, int dimend) const { CHECK(ndim_is_known(this->ndim())) << "Shape is unknown."; CHECK_GE(dimstart, 0) << "dimstart must be >= 0, while received " << dimstart; - CHECK_LE(dimend, this->ndim()) << "dimend must be <= " << this->ndim() - << ", while received " << dimend; - dim_t num = 1; - const dim_t *d = this->data(); + CHECK_LE(dimend, this->ndim()) + << "dimend must be <= " << this->ndim() << ", while received " << dimend; + dim_t num = 1; + const dim_t* d = this->data(); for (int i = dimstart; i < dimend; ++i) { CHECK(dim_size_is_known(d[i])) << "Shape dim size must be known, while received " << d[i]; num *= d[i]; @@ -546,21 +549,21 @@ class TShape : public Tuple { return num; } /*! \return the begin data pointer to content of the tuple */ - inline const dim_t *data() const { + inline const dim_t* data() const { return begin(); } /*! \return the begin data pointer to content of the tuple */ - inline dim_t *data() { + inline dim_t* data() { return begin(); } #ifdef MSHADOW_XINLINE - template - inline TShape(const mshadow::Shape &s) {// NOLINT(*) + template + inline TShape(const mshadow::Shape& s) { // NOLINT(*) this->assign(s.shape_, s.shape_ + dim); } - template - inline TShape(mshadow::Shape &&s) {// NOLINT(*) + template + inline TShape(mshadow::Shape&& s) { // NOLINT(*) this->assign(s.shape_, s.shape_ + dim); } /*! 
@@ -569,8 +572,8 @@ class TShape : public Tuple { * \tparam dim shape dimension * \return reference of self */ - template - inline TShape &operator=(const mshadow::Shape &shape) { + template + inline TShape& operator=(const mshadow::Shape& shape) { this->assign(shape.shape_, shape.shape_ + dim); return *this; } @@ -579,11 +582,10 @@ class TShape : public Tuple { * \return the shape requested * \tparam dim dimension of the tensor */ - template + template inline mshadow::Shape get() const { - CHECK_EQ(dim, ndim()) - << "dimension do not match target dimension " << dim << " vs " << ndim(); - const dim_t *d = this->data(); + CHECK_EQ(dim, ndim()) << "dimension do not match target dimension " << dim << " vs " << ndim(); + const dim_t* d = this->data(); mshadow::Shape s; for (int i = 0; i < dim; ++i) { s[i] = d[i]; @@ -597,10 +599,11 @@ class TShape : public Tuple { inline mshadow::Shape<2> FlatTo2D(void) const { mshadow::Shape<2> s; CHECK(ndim_is_known(ndim())) << "shape must have a valid ndim"; - if (ndim() == 0) return mshadow::Shape2(1, 1); - const dim_t *d = this->data(); - s.shape_[1] = d[ndim() - 1]; - dim_t ymax = 1; + if (ndim() == 0) + return mshadow::Shape2(1, 1); + const dim_t* d = this->data(); + s.shape_[1] = d[ndim() - 1]; + dim_t ymax = 1; for (int i = 1; i < ndim(); ++i) { ymax *= d[i - 1]; } @@ -617,11 +620,12 @@ class TShape : public Tuple { CHECK(axis_end >= axis_begin); mshadow::Shape<3> s; CHECK(ndim_is_known(ndim())) << "shape must have a valid ndim"; - if (ndim() == 0) return mshadow::Shape3(1, 1, 1); - const dim_t *d = this->data(); - s.shape_[0] = 1; - s.shape_[1] = 1; - s.shape_[2] = 1; + if (ndim() == 0) + return mshadow::Shape3(1, 1, 1); + const dim_t* d = this->data(); + s.shape_[0] = 1; + s.shape_[1] = 1; + s.shape_[2] = 1; for (int i = 0; i < axis_begin; ++i) { s.shape_[0] *= d[i]; @@ -642,11 +646,12 @@ class TShape : public Tuple { inline mshadow::Shape<3> FlatTo3D(int axis) const { return FlatTo3D(axis, axis); } - inline bool operator==(const TShape &s) const { - if (ndim() != s.ndim()) return false; + inline bool operator==(const TShape& s) const { + if (ndim() != s.ndim()) + return false; return std::equal(begin(), end(), s.begin()); } - inline bool operator!=(const TShape &s) const { + inline bool operator!=(const TShape& s) const { return !(*this == s); } /*! @@ -654,12 +659,14 @@ class TShape : public Tuple { * \param s the shape to compare against * \tparam dim dimension of the shape */ - template - inline bool operator==(const mshadow::Shape &s) const { - if (ndim_ != dim) return false; - const dim_t *d = dim <= kStackCache ? data_stack_ : data_heap_; + template + inline bool operator==(const mshadow::Shape& s) const { + if (ndim_ != dim) + return false; + const dim_t* d = dim <= kStackCache ? data_stack_ : data_heap_; for (size_t i = 0; i < dim; ++i) { - if (d[i] != s.shape_[i]) return false; + if (d[i] != s.shape_[i]) + return false; } return true; } @@ -668,8 +675,8 @@ class TShape : public Tuple { * \param s the shape to compare against * \tparam dim dimension of the shape */ - template - inline bool operator!=(const mshadow::Shape &s) const { + template + inline bool operator!=(const mshadow::Shape& s) const { return !(*this == s); } #endif @@ -690,25 +697,26 @@ inline bool dim_size_is_known(const TShape& x, const int idx) { /*! brief check if shape is known using the NumPy compatible definition. * zero-dim and zero-size tensors are valid. 
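 * For example, shape_is_known(TShape({2, 0, 3})) holds under this
 * definition, while any shape containing a -1 dimension does not;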
-1 means unknown.*/ inline bool shape_is_known(const TShape& x) { - if (!ndim_is_known(x)) return false; + if (!ndim_is_known(x)) + return false; for (int i = 0; i < x.ndim(); ++i) { - if (!dim_size_is_known(x, i)) return false; + if (!dim_size_is_known(x, i)) + return false; } return true; } inline bool shape_is_known(const std::vector& shapes) { for (const TShape& shape : shapes) { - if (!shape_is_known(shape)) return false; + if (!shape_is_known(shape)) + return false; } return true; } /*! \brief helper function to cast type of container elements */ -template -inline DstIter ShapeTypeCast(const SrcIter begin, - const SrcIter end, - DstIter dst_begin) { +template +inline DstIter ShapeTypeCast(const SrcIter begin, const SrcIter end, DstIter dst_begin) { typedef typename std::iterator_traits::value_type SrcDType; typedef typename std::iterator_traits::value_type DstDType; auto cast = [](const SrcDType& dim) { return static_cast(dim); }; @@ -716,7 +724,7 @@ inline DstIter ShapeTypeCast(const SrcIter begin, } /*! \brief helper function to transform a container to TShape with type cast */ -template +template inline TShape ShapeTypeCast(const SrcIter begin, const SrcIter end) { size_t ndim = std::distance(begin, end); TShape res(ndim, -1); @@ -725,9 +733,9 @@ inline TShape ShapeTypeCast(const SrcIter begin, const SrcIter end) { } /*! \tparam ValueType The type of data stored inside tuple. */ -template -template -inline void Tuple::Save(TStream *strm) const { +template +template +inline void Tuple::Save(TStream* strm) const { strm->Write(&ndim_, sizeof(ndim_)); if (typeid(DType) == typeid(ValueType)) { strm->Write(begin(), sizeof(ValueType) * ndim_); @@ -739,17 +747,20 @@ inline void Tuple::Save(TStream *strm) const { } /*! \tparam ValueType The type of data stored inside tuple. */ -template -template -inline bool Tuple::Load(TStream *strm) { - if (strm->Read(&ndim_, sizeof(ndim_)) != sizeof(ndim_)) return false; +template +template +inline bool Tuple::Load(TStream* strm) { + if (strm->Read(&ndim_, sizeof(ndim_)) != sizeof(ndim_)) + return false; this->SetDim(ndim_); size_t nread = sizeof(DType) * ndim_; if (typeid(DType) == typeid(ValueType)) { - if (strm->Read(begin(), nread) != nread) return false; + if (strm->Read(begin(), nread) != nread) + return false; } else { std::vector buffer(ndim_); - if (strm->Read(buffer.data(), nread) != nread) return false; + if (strm->Read(buffer.data(), nread) != nread) + return false; ShapeTypeCast(buffer.begin(), buffer.end(), begin()); } return true; @@ -759,8 +770,8 @@ inline bool Tuple::Load(TStream *strm) { namespace std { /*! \brief hash function for Tuple. */ -template -struct hash > { +template +struct hash> { /*! \brief hash a Tuple into unsigned int */ size_t operator()(const mxnet::Tuple& val) const { std::hash hash_int; @@ -773,7 +784,7 @@ struct hash > { }; /*! \brief hash function for TShape. */ -template<> +template <> struct hash { /*! 
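 * A usage sketch (illustrative only; the brace-initialized TShape is an
 * assumption, not part of this patch):
 *
 * \code
 *
 * std::unordered_map<mxnet::TShape, int> shape_cache;
 * shape_cache[mxnet::TShape({2, 3})] = 1;  // keyed via std::hash<mxnet::TShape>
 *
 * \endcode
 *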
\brief hash a TShape into unsigned int */ size_t operator()(const mxnet::TShape& val) const { @@ -793,8 +804,8 @@ DMLC_DECLARE_TYPE_NAME(optional, "Shape or None"); DMLC_DECLARE_TYPE_NAME(optional>, "Shape or None"); // avoid low version of MSVC #if !(defined(_MSC_VER) && _MSC_VER < 1900) -template -struct type_name_helper > { +template +struct type_name_helper> { static inline std::string value() { return "tuple of <" + type_name() + ">"; } From 76973336bd59f88ce4c06a8a4656d2bc3ecd3b8f Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Thu, 4 Nov 2021 09:01:34 +0100 Subject: [PATCH 04/10] [CPP-PACKAGE] Re-format .cc .h files --- .../multi_threaded_inference.cc | 152 ++-- cpp-package/example/utils.h | 56 +- cpp-package/include/mxnet-cpp/base.h | 16 +- cpp-package/include/mxnet-cpp/contrib.h | 135 ++-- cpp-package/include/mxnet-cpp/executor.h | 143 ++-- cpp-package/include/mxnet-cpp/initializer.h | 81 +-- cpp-package/include/mxnet-cpp/io.h | 69 +- cpp-package/include/mxnet-cpp/kvstore.h | 20 +- cpp-package/include/mxnet-cpp/lr_scheduler.h | 45 +- cpp-package/include/mxnet-cpp/metric.h | 41 +- cpp-package/include/mxnet-cpp/model.h | 19 +- cpp-package/include/mxnet-cpp/ndarray.h | 683 +++++++++--------- cpp-package/include/mxnet-cpp/op_map.h | 76 +- cpp-package/include/mxnet-cpp/op_suppl.h | 122 ++-- cpp-package/include/mxnet-cpp/op_util.h | 14 +- cpp-package/include/mxnet-cpp/operator.h | 186 ++--- cpp-package/include/mxnet-cpp/optimizer.h | 80 +- cpp-package/include/mxnet-cpp/shape.h | 351 +++++---- cpp-package/include/mxnet-cpp/symbol.h | 362 +++++----- 19 files changed, 1332 insertions(+), 1319 deletions(-) diff --git a/cpp-package/example/inference/multi_threaded_inference/multi_threaded_inference.cc b/cpp-package/example/inference/multi_threaded_inference/multi_threaded_inference.cc index e3b3909a609c..9b279e9c4315 100644 --- a/cpp-package/example/inference/multi_threaded_inference/multi_threaded_inference.cc +++ b/cpp-package/example/inference/multi_threaded_inference/multi_threaded_inference.cc @@ -20,7 +20,7 @@ /*! 
* \file multi_threaded_inference.cc * \brief Multi Threaded inference example with CachedOp -*/ + */ #include @@ -37,17 +37,14 @@ const float DEFAULT_MEAN = 117.0; - // Code to load image, PrintOutput results, helper functions for the same obtained from: // https://github.com/apache/incubator-mxnet/blob/master/example/image-classification/predict-cpp/ -static std::string trim(const std::string &input) { +static std::string trim(const std::string& input) { auto not_space = [](int ch) { return !std::isspace(ch); }; - auto output = input; - output.erase(output.begin(), - std::find_if(output.begin(), output.end(), not_space)); - output.erase(std::find_if(output.rbegin(), output.rend(), not_space).base(), - output.end()); + auto output = input; + output.erase(output.begin(), std::find_if(output.begin(), output.end(), not_space)); + output.erase(std::find_if(output.rbegin(), output.rend(), not_space).base(), output.end()); return output; } @@ -77,24 +74,25 @@ void PrintOutputResult(const float* data, size_t size, const std::vector best_accuracy) { best_accuracy = data[i]; - best_idx = i; + best_idx = i; } } - std::cout << "Best Result: " << trim(synset[best_idx]) << " (id=" << best_idx << ", " << - "accuracy=" << std::setprecision(8) << best_accuracy << ")" << std::endl; + std::cout << "Best Result: " << trim(synset[best_idx]) << " (id=" << best_idx << ", " + << "accuracy=" << std::setprecision(8) << best_accuracy << ")" << std::endl; } - // Read Image data into a float array -void GetImageFile(const std::string &image_file, float *image_data, - int channels, cv::Size resize_size) { +void GetImageFile(const std::string& image_file, + float* image_data, + int channels, + cv::Size resize_size) { // Read all kinds of file into a BGR color 3 channels image cv::Mat im_ori = cv::imread(image_file, cv::IMREAD_COLOR); @@ -127,17 +125,17 @@ void GetImageFile(const std::string &image_file, float *image_data, } } -void prepare_input_data(const mxnet::cpp::Shape& shape, const mxnet::cpp::Context& ctx, +void prepare_input_data(const mxnet::cpp::Shape& shape, + const mxnet::cpp::Context& ctx, int num_threads, std::vector* data_arr, bool random_uniform = false) { for (size_t i = 0; i < num_threads; ++i) { data_arr->emplace_back(shape, ctx, false, 0); int begin = i * 100; - int end = begin + 100; + int end = begin + 100; if (random_uniform) { - mxnet::cpp::Operator("_random_uniform")(begin, end) - .Invoke((*data_arr)[i]); + mxnet::cpp::Operator("_random_uniform")(begin, end).Invoke((*data_arr)[i]); } mxnet::cpp::NDArray::WaitAll(); } @@ -146,46 +144,48 @@ void prepare_input_data(const mxnet::cpp::Shape& shape, const mxnet::cpp::Contex // Run inference on a model void run_inference(const std::string& model_name, const std::vector& input_arrs, - std::vector *output_mx_arr, - int num_inf_per_thread = 1, bool random_sleep = false, - int num_threads = 1, bool static_alloc = false, - bool static_shape = false, - bool is_gpu = false) { - LOG(INFO) << "Running inference for " + model_name + - " num_threads: " + std::to_string(num_threads) + - " num_inf_per_thread: " + std::to_string(num_inf_per_thread) + - " random_sleep: " + std::to_string(random_sleep) + - " static_alloc: " + std::to_string(static_alloc) + - " static_shape: " + std::to_string(static_shape); - std::string json_file = model_name + "-symbol.json"; - std::string param_file = model_name + "-0000.params"; - auto out = mxnet::cpp::Symbol::Load(json_file); + std::vector* output_mx_arr, + int num_inf_per_thread = 1, + bool random_sleep = false, + int 
num_threads = 1, + bool static_alloc = false, + bool static_shape = false, + bool is_gpu = false) { + LOG(INFO) << "Running inference for " + model_name + + " num_threads: " + std::to_string(num_threads) + + " num_inf_per_thread: " + std::to_string(num_inf_per_thread) + + " random_sleep: " + std::to_string(random_sleep) + + " static_alloc: " + std::to_string(static_alloc) + + " static_shape: " + std::to_string(static_shape); + std::string json_file = model_name + "-symbol.json"; + std::string param_file = model_name + "-0000.params"; + auto out = mxnet::cpp::Symbol::Load(json_file); std::string static_alloc_str = static_alloc ? "true" : "false"; std::string static_shape_str = static_shape ? "true" : "false"; // Prepare context -# if MXNET_USE_CUDA == 1 +#if MXNET_USE_CUDA == 1 mxnet::Context backend_ctx; mxnet::cpp::Context ctx = mxnet::cpp::Context::cpu(0); if (is_gpu) { backend_ctx = mxnet::Context::GPU(0); - ctx = mxnet::cpp::Context::gpu(0); + ctx = mxnet::cpp::Context::gpu(0); } else { backend_ctx = mxnet::Context::CPU(0); - ctx = mxnet::cpp::Context::cpu(0); + ctx = mxnet::cpp::Context::cpu(0); } -# else +#else mxnet::Context backend_ctx = mxnet::Context::CPU(0); - mxnet::cpp::Context ctx = mxnet::cpp::Context::cpu(0); + mxnet::cpp::Context ctx = mxnet::cpp::Context::cpu(0); #endif // Prepare input data and parameters std::vector data_arr(num_threads); std::vector softmax_arr; std::vector params; - mxnet::cpp::Shape data_shape = mxnet::cpp::Shape(1, 3, 224, 224); + mxnet::cpp::Shape data_shape = mxnet::cpp::Shape(1, 3, 224, 224); mxnet::cpp::Shape softmax_shape = mxnet::cpp::Shape(1); - int num_inputs = out.ListInputs().size(); + int num_inputs = out.ListInputs().size(); for (size_t i = 0; i < data_arr.size(); ++i) { data_arr[i] = input_arrs[i].Copy(ctx); @@ -207,16 +207,15 @@ void run_inference(const std::string& model_name, CachedOpHandle hdl = CachedOpHandle(); - std::vector flag_keys{"data_indices", "param_indices", - "static_alloc", "static_shape"}; + std::vector flag_keys{ + "data_indices", "param_indices", "static_alloc", "static_shape"}; std::string param_indices = "["; for (size_t i = 1; i < num_inputs; ++i) { param_indices += std::to_string(i); param_indices += std::string(", "); } param_indices += "]"; - std::vector flag_vals{"[0]", param_indices, static_alloc_str, - static_shape_str}; + std::vector flag_vals{"[0]", param_indices, static_alloc_str, static_shape_str}; std::vector flag_key_cstrs, flag_val_cstrs; flag_key_cstrs.reserve(flag_keys.size()); for (size_t i = 0; i < flag_keys.size(); ++i) { @@ -226,15 +225,14 @@ void run_inference(const std::string& model_name, flag_val_cstrs.emplace_back(flag_vals[i].c_str()); } - int ret1 = MXCreateCachedOp(out.GetHandle(), flag_keys.size(), - flag_key_cstrs.data(), flag_val_cstrs.data(), - &hdl, true); + int ret1 = MXCreateCachedOp( + out.GetHandle(), flag_keys.size(), flag_key_cstrs.data(), flag_val_cstrs.data(), &hdl, true); if (ret1 < 0) { LOG(FATAL) << MXGetLastError(); } // Prepare data structures and lambda to run in different threads - std::vector cached_op_handles(num_threads); + std::vector cached_op_handles(num_threads); std::vector> arr_handles(num_threads); for (size_t i = 0; i < num_threads; ++i) { @@ -255,32 +253,37 @@ void run_inference(const std::string& model_name, std::this_thread::sleep_for(std::chrono::seconds(sleep_time)); } int num_output = 0; - const int *stypes; - int ret = MXInvokeCachedOp(hdl, arr_handles[num].size(), arr_handles[num].data(), - ctx.GetDeviceType(), 0, &num_output, - 
&(cached_op_handles[num]), &stypes); + const int* stypes; + int ret = MXInvokeCachedOp(hdl, + arr_handles[num].size(), + arr_handles[num].data(), + ctx.GetDeviceType(), + 0, + &num_output, + &(cached_op_handles[num]), + &stypes); if (ret < 0) { LOG(FATAL) << MXGetLastError(); } - (*output_mx_arr)[num] = static_cast(*cached_op_handles[num]); + (*output_mx_arr)[num] = static_cast(*cached_op_handles[num]); }; // Spawn multiple threads, join and wait for threads to complete std::vector worker_threads(num_threads); int count = 0; - for (auto &&i : worker_threads) { + for (auto&& i : worker_threads) { i = std::thread(func, count); count++; } - for (auto &&i : worker_threads) { + for (auto&& i : worker_threads) { i.join(); } mxnet::cpp::NDArray::WaitAll(); std::string synset_file = "synset.txt"; - auto synset = LoadSynset(synset_file); + auto synset = LoadSynset(synset_file); std::vector tmp(num_threads); for (size_t i = 0; i < num_threads; i++) { tmp[i] = (*output_mx_arr)[i]->Copy(mxnet::Context::CPU(0)); @@ -288,8 +291,9 @@ void run_inference(const std::string& model_name, (*output_mx_arr)[i] = &tmp[i]; } for (size_t i = 0; i < num_threads; ++i) { - PrintOutputResult(static_cast((*output_mx_arr)[i]->data().dptr_), - (*output_mx_arr)[i]->shape().Size(), synset); + PrintOutputResult(static_cast((*output_mx_arr)[i]->data().dptr_), + (*output_mx_arr)[i]->shape().Size(), + synset); } int ret2 = MXFreeCachedOp(hdl); if (ret2 < 0) { @@ -298,11 +302,10 @@ void run_inference(const std::string& model_name, mxnet::cpp::NDArray::WaitAll(); } -int main(int argc, char *argv[]) { +int main(int argc, char* argv[]) { if (argc < 4) { std::cout << "Please provide a model name, is_gpu, test_image" << std::endl - << "Usage: ./multi_threaded_inference [model_name] [is_gpu] [file_names]" - << std::endl + << "Usage: ./multi_threaded_inference [model_name] [is_gpu] [file_names]" << std::endl << "Example: ./.multi_threaded_inference imagenet1k-inception-bn 0 apple.jpg" << std::endl << "NOTE: Thread number ordering will be based on the ordering of file inputs" @@ -311,21 +314,20 @@ int main(int argc, char *argv[]) { return EXIT_FAILURE; } std::string model_name = std::string(argv[1]); - bool is_gpu = std::atoi(argv[2]); + bool is_gpu = std::atoi(argv[2]); CHECK(argc >= 4) << "Number of files provided should be atleast 1"; int num_threads = argc - 3; std::vector test_files; for (size_t i = 0; i < argc - 3; ++i) { test_files.emplace_back(argv[3 + i]); } - int epoch = 0; + int epoch = 0; bool static_alloc = true; bool static_shape = true; - // Image size and channels - size_t width = 224; - size_t height = 224; + size_t width = 224; + size_t height = 224; size_t channels = 3; size_t image_size = width * height * channels; @@ -337,18 +339,24 @@ int main(int argc, char *argv[]) { mxnet::cpp::Shape input_shape = mxnet::cpp::Shape(1, 3, 224, 224); for (size_t i = 0; i < files.size(); i++) { files[i].resize(image_size); - GetImageFile(test_files[i], files[i].data(), channels, - cv::Size(width, height)); - input_arrs.emplace_back(mxnet::cpp::NDArray(files[i].data(), - input_shape, mxnet::cpp::Context::cpu(0))); + GetImageFile(test_files[i], files[i].data(), channels, cv::Size(width, height)); + input_arrs.emplace_back( + mxnet::cpp::NDArray(files[i].data(), input_shape, mxnet::cpp::Context::cpu(0))); } // load symbol std::string static_alloc_str = static_alloc ? "true" : "false"; std::string static_shape_str = static_shape ? 
"true" : "false"; std::vector output_mx_arr(num_threads); - run_inference(model_name, input_arrs, &output_mx_arr, 1, false, num_threads, - static_alloc, static_shape, is_gpu); + run_inference(model_name, + input_arrs, + &output_mx_arr, + 1, + false, + num_threads, + static_alloc, + static_shape, + is_gpu); mxnet::cpp::NDArray::WaitAll(); return 0; diff --git a/cpp-package/example/utils.h b/cpp-package/example/utils.h index 87847701ce6e..887a807e5d12 100644 --- a/cpp-package/example/utils.h +++ b/cpp-package/example/utils.h @@ -27,50 +27,52 @@ using namespace mxnet::cpp; -#define TRY \ - try { -#define CATCH \ - } catch(dmlc::Error &err) { \ - LG << "Status: FAIL";\ +#define TRY try { +#define CATCH \ + } \ + catch (dmlc::Error & err) { \ + LG << "Status: FAIL"; \ LG << "With Error: " << MXGetLastError(); \ - return 1; \ + return 1; \ } -bool isFileExists(const std::string &filename) { +bool isFileExists(const std::string& filename) { std::ifstream fhandle(filename.c_str()); return fhandle.good(); } -bool check_datafiles(const std::vector &data_files) { - for (size_t index=0; index < data_files.size(); index++) { +bool check_datafiles(const std::vector& data_files) { + for (size_t index = 0; index < data_files.size(); index++) { if (!(isFileExists(data_files[index]))) { - LG << "Error: File does not exist: "<< data_files[index]; + LG << "Error: File does not exist: " << data_files[index]; return false; } } return true; } -bool setDataIter(MXDataIter *iter , const std::string &useType, - const std::vector &data_files, int batch_size) { - if (!check_datafiles(data_files)) { - return false; - } +bool setDataIter(MXDataIter* iter, + const std::string& useType, + const std::vector& data_files, + int batch_size) { + if (!check_datafiles(data_files)) { + return false; + } - iter->SetParam("batch_size", batch_size); - iter->SetParam("shuffle", 1); - iter->SetParam("flat", 1); + iter->SetParam("batch_size", batch_size); + iter->SetParam("shuffle", 1); + iter->SetParam("flat", 1); - if (useType == "Train") { - iter->SetParam("image", data_files[0]); - iter->SetParam("label", data_files[1]); - } else if (useType == "Label") { - iter->SetParam("image", data_files[2]); - iter->SetParam("label", data_files[3]); - } + if (useType == "Train") { + iter->SetParam("image", data_files[0]); + iter->SetParam("label", data_files[1]); + } else if (useType == "Label") { + iter->SetParam("image", data_files[2]); + iter->SetParam("label", data_files[3]); + } - iter->CreateDataIter(); - return true; + iter->CreateDataIter(); + return true; } #endif // CPP_PACKAGE_EXAMPLE_UTILS_H_ diff --git a/cpp-package/include/mxnet-cpp/base.h b/cpp-package/include/mxnet-cpp/base.h index 19375c0f81e8..ad1ab02c9619 100644 --- a/cpp-package/include/mxnet-cpp/base.h +++ b/cpp-package/include/mxnet-cpp/base.h @@ -18,10 +18,10 @@ */ /*! -* \file base.h -* \brief base definitions for mxnetcpp -* \author Chuntao Hong, Zhang Chen -*/ + * \file base.h + * \brief base definitions for mxnetcpp + * \author Chuntao Hong, Zhang Chen + */ #ifndef MXNET_CPP_BASE_H_ #define MXNET_CPP_BASE_H_ @@ -41,10 +41,10 @@ enum OpReqType { /*! \brief write gradient to provided space */ kWriteTo, /*! - * \brief perform an inplace write, - * Target shares memory with one of input arguments. - * This option only happen when - */ + * \brief perform an inplace write, + * Target shares memory with one of input arguments. + * This option only happen when + */ kWriteInplace, /*! 
\brief add to the provided space */ kAddTo diff --git a/cpp-package/include/mxnet-cpp/contrib.h b/cpp-package/include/mxnet-cpp/contrib.h index 21ca54014109..c6ca3b834b14 100644 --- a/cpp-package/include/mxnet-cpp/contrib.h +++ b/cpp-package/include/mxnet-cpp/contrib.h @@ -18,10 +18,10 @@ */ /*! -* \file contrib.h -* \brief utility function to enable some contrib features -* \author Haohuan Wang -*/ + * \file contrib.h + * \brief utility function to enable some contrib features + * \author Haohuan Wang + */ #ifndef MXNET_CPP_CONTRIB_H_ #define MXNET_CPP_CONTRIB_H_ @@ -35,76 +35,79 @@ namespace mxnet { namespace cpp { namespace details { - /*! - * split a string with the given delimiter - * @param str string to be parsed - * @param delimiter delimiter - * @return delimited list of string - */ - inline std::vector split(const std::string& str, const std::string& delimiter) { - std::vector splitted; - size_t last = 0; - size_t next = 0; - while ((next = str.find(delimiter, last)) != std::string::npos) { - splitted.push_back(str.substr(last, next - last)); - last = next + 1; - } - splitted.push_back(str.substr(last)); - return splitted; +/*! + * split a string with the given delimiter + * @param str string to be parsed + * @param delimiter delimiter + * @return delimited list of string + */ +inline std::vector split(const std::string& str, const std::string& delimiter) { + std::vector splitted; + size_t last = 0; + size_t next = 0; + while ((next = str.find(delimiter, last)) != std::string::npos) { + splitted.push_back(str.substr(last, next - last)); + last = next + 1; } + splitted.push_back(str.substr(last)); + return splitted; +} } // namespace details namespace contrib { - // needs to be same with - // https://github.com/apache/incubator-mxnet/blob/1c874cfc807cee755c38f6486e8e0f4d94416cd8/src/operator/subgraph/tensorrt/tensorrt-inl.h#L190 - static const std::string TENSORRT_SUBGRAPH_PARAM_IDENTIFIER = "subgraph_params_names"; - // needs to be same with - // https://github.com/apache/incubator-mxnet/blob/master/src/operator/subgraph/tensorrt/tensorrt.cc#L244 - static const std::string TENSORRT_SUBGRAPH_PARAM_PREFIX = "subgraph_param_"; - /*! 
- * this is a mimic to https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/contrib/tensorrt.py#L37 - * @param symbol symbol that already called subgraph api - * @param argParams original arg params, params needed by tensorrt will be removed after calling this function - * @param auxParams original aux params, params needed by tensorrt will be removed after calling this function - */ - inline void InitTensorRTParams(const mxnet::cpp::Symbol& symbol, - std::map *argParams, - std::map *auxParams) { - mxnet::cpp::Symbol internals = symbol.GetInternals(); - mx_uint numSymbol = internals.GetNumOutputs(); - for (mx_uint i = 0; i < numSymbol; ++i) { - std::map attrs = internals[i].ListAttributes(); - if (attrs.find(TENSORRT_SUBGRAPH_PARAM_IDENTIFIER) != attrs.end()) { - std::string new_params_names; - std::map tensorrtParams; - std::vector keys = details::split( - attrs[TENSORRT_SUBGRAPH_PARAM_IDENTIFIER], ";"); - for (const auto& key : keys) { - if (argParams->find(key) != argParams->end()) { - new_params_names += key + ";"; - tensorrtParams[TENSORRT_SUBGRAPH_PARAM_PREFIX + key] = (*argParams)[key]; - argParams->erase(key); - } else if (auxParams->find(key) != auxParams->end()) { - new_params_names += key + ";"; - tensorrtParams[TENSORRT_SUBGRAPH_PARAM_PREFIX + key] = (*auxParams)[key]; - auxParams->erase(key); - } - } - std::map new_attrs = {}; - for (const auto& kv : tensorrtParams) { - // passing the ndarray address into TRT node attributes to get the weight - uint64_t address = reinterpret_cast(kv.second.GetHandle()); - new_attrs[kv.first] = std::to_string(address); - } - if (!new_attrs.empty()) { - internals[i].SetAttributes(new_attrs); - internals[i].SetAttribute(TENSORRT_SUBGRAPH_PARAM_IDENTIFIER, - new_params_names.substr(0, new_params_names.length() - 1)); - } +// needs to be same with +// https://github.com/apache/incubator-mxnet/blob/1c874cfc807cee755c38f6486e8e0f4d94416cd8/src/operator/subgraph/tensorrt/tensorrt-inl.h#L190 +static const std::string TENSORRT_SUBGRAPH_PARAM_IDENTIFIER = "subgraph_params_names"; +// needs to be same with +// https://github.com/apache/incubator-mxnet/blob/master/src/operator/subgraph/tensorrt/tensorrt.cc#L244 +static const std::string TENSORRT_SUBGRAPH_PARAM_PREFIX = "subgraph_param_"; +/*! 
+ * this is a mimic to + * https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/contrib/tensorrt.py#L37 + * @param symbol symbol that already called subgraph api + * @param argParams original arg params, params needed by tensorrt will be removed after calling + * this function + * @param auxParams original aux params, params needed by tensorrt will be removed after calling + * this function + */ +inline void InitTensorRTParams(const mxnet::cpp::Symbol& symbol, + std::map* argParams, + std::map* auxParams) { + mxnet::cpp::Symbol internals = symbol.GetInternals(); + mx_uint numSymbol = internals.GetNumOutputs(); + for (mx_uint i = 0; i < numSymbol; ++i) { + std::map attrs = internals[i].ListAttributes(); + if (attrs.find(TENSORRT_SUBGRAPH_PARAM_IDENTIFIER) != attrs.end()) { + std::string new_params_names; + std::map tensorrtParams; + std::vector keys = + details::split(attrs[TENSORRT_SUBGRAPH_PARAM_IDENTIFIER], ";"); + for (const auto& key : keys) { + if (argParams->find(key) != argParams->end()) { + new_params_names += key + ";"; + tensorrtParams[TENSORRT_SUBGRAPH_PARAM_PREFIX + key] = (*argParams)[key]; + argParams->erase(key); + } else if (auxParams->find(key) != auxParams->end()) { + new_params_names += key + ";"; + tensorrtParams[TENSORRT_SUBGRAPH_PARAM_PREFIX + key] = (*auxParams)[key]; + auxParams->erase(key); } + } + std::map new_attrs = {}; + for (const auto& kv : tensorrtParams) { + // passing the ndarray address into TRT node attributes to get the weight + uint64_t address = reinterpret_cast(kv.second.GetHandle()); + new_attrs[kv.first] = std::to_string(address); + } + if (!new_attrs.empty()) { + internals[i].SetAttributes(new_attrs); + internals[i].SetAttribute(TENSORRT_SUBGRAPH_PARAM_IDENTIFIER, + new_params_names.substr(0, new_params_names.length() - 1)); + } } + } } } // namespace contrib diff --git a/cpp-package/include/mxnet-cpp/executor.h b/cpp-package/include/mxnet-cpp/executor.h index 9b413e1a60fd..fff559b79df3 100644 --- a/cpp-package/include/mxnet-cpp/executor.h +++ b/cpp-package/include/mxnet-cpp/executor.h @@ -18,10 +18,10 @@ */ /*! -* \file executor.h -* \brief executor definition -* \author Chuntao Hong, Zhang Chen -*/ + * \file executor.h + * \brief executor definition + * \author Chuntao Hong, Zhang Chen + */ #ifndef MXNET_CPP_EXECUTOR_H_ #define MXNET_CPP_EXECUTOR_H_ @@ -40,43 +40,51 @@ namespace cpp { class Optimizer; /*! -* \brief Executor interface -*/ + * \brief Executor interface + */ class Executor { public: - Executor(const Symbol &symbol, Context context, - const std::vector &arg_arrays, - const std::vector &grad_arrays, - const std::vector &grad_reqs, - const std::vector &aux_arrays, - const std::map &group_to_ctx = - std::map(), - Executor *shared_exec = nullptr); - explicit Executor(const CachedOpHandle &h) { handle_ = h; } + Executor(const Symbol& symbol, + Context context, + const std::vector& arg_arrays, + const std::vector& grad_arrays, + const std::vector& grad_reqs, + const std::vector& aux_arrays, + const std::map& group_to_ctx = std::map(), + Executor* shared_exec = nullptr); + explicit Executor(const CachedOpHandle& h) { + handle_ = h; + } /*! - * \brief Perform a Forward operation of Operator - * After this operation, user can get the result by using function head. - */ + * \brief Perform a Forward operation of Operator + * After this operation, user can get the result by using function head. 
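// Editor's sketch (not part of the original commit): how the reformatted
// InitTensorRTParams above is meant to be called, mirroring the Python
// contrib.tensorrt flow it mimics. The map value types were stripped in
// extraction; std::map<std::string, NDArray> is assumed for both parameter
// maps, and the function/variable names here are placeholders.
#include <map>
#include <string>
#include "mxnet-cpp/MxNetCpp.h"

void PrepareTrtSymbol(const mxnet::cpp::Symbol& trt_sym,
                      std::map<std::string, mxnet::cpp::NDArray>* args,
                      std::map<std::string, mxnet::cpp::NDArray>* aux) {
  // Moves TensorRT-owned weights out of args/aux and records their NDArray
  // handles on the TRT subgraph node attributes.
  mxnet::cpp::contrib::InitTensorRTParams(trt_sym, args, aux);
  // args/aux now hold only the parameters still bound by the executor.
}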
+ */ void Forward(bool is_train) { std::vector arg_handles; - for (const auto &array : combined_arrays) { + for (const auto& array : combined_arrays) { arg_handles.push_back(array.GetHandle()); } - int prev_is_record = 0; + int prev_is_record = 0; int prev_train_mode = 0; CHECK_EQ(MXAutogradSetIsRecording(1, &prev_is_record), 0); if (is_train == true) { CHECK_EQ(MXAutogradSetIsTraining(1, &prev_train_mode), 0); } std::vector output_handles; - std::transform(outputs.begin(), outputs.end(), - std::back_inserter(output_handles), [](NDArray& a) { + std::transform( + outputs.begin(), outputs.end(), std::back_inserter(output_handles), [](NDArray& a) { return a.GetHandle(); }); - int out_size = 0; - NDArrayHandle *out_array = nullptr; - CHECK_EQ(MXInvokeCachedOp(handle_, arg_handles.size(), arg_handles.data(), - device_type, device_id, &out_size, &out_array, nullptr), + int out_size = 0; + NDArrayHandle* out_array = nullptr; + CHECK_EQ(MXInvokeCachedOp(handle_, + arg_handles.size(), + arg_handles.data(), + device_type, + device_id, + &out_size, + &out_array, + nullptr), 0); outputs.clear(); outputs.reserve(out_size); @@ -84,30 +92,29 @@ class Executor { outputs.push_back(NDArray(out_array[i])); } int cur_train_mode = prev_train_mode; - int cur_is_record = prev_is_record; + int cur_is_record = prev_is_record; if (is_train == true) { CHECK_EQ(MXAutogradSetIsTraining(cur_train_mode, &prev_train_mode), 0); } CHECK_EQ(MXAutogradSetIsRecording(cur_is_record, &prev_is_record), 0); } /*! - * \brief Perform a Backward operation of the Operator. - * This must be called after Forward. - * After this operation, NDArrays specified by grad_in_args_store will be - *updated accordingly. - * User is allowed to pass in an empty Array if the head node is - * loss function and head gradeitn is not needed. - * - * \param head_grads the gradient of head nodes to be backproped. - */ - void Backward(const std::vector &head_grads = - std::vector()) { + * \brief Perform a Backward operation of the Operator. + * This must be called after Forward. + * After this operation, NDArrays specified by grad_in_args_store will be + *updated accordingly. + * User is allowed to pass in an empty Array if the head node is + * loss function and head gradeitn is not needed. + * + * \param head_grads the gradient of head nodes to be backproped. 
+ */ + void Backward(const std::vector& head_grads = std::vector()) { if (require_grad == true) { if (outputs.size() == 0) { Forward(false); } std::vector out_handles; - for (const auto &array : outputs) { + for (const auto& array : outputs) { out_handles.push_back(array.GetHandle()); } std::vector head_grads_; @@ -115,17 +122,33 @@ class Executor { head_grads_.push_back(d.GetHandle()); } if (head_grads_.size() > 0) { - CHECK_EQ(MXAutogradBackwardEx(out_handles.size(), out_handles.data(), - head_grads_.data(), 0, nullptr, 0, 0, 1, - nullptr, nullptr), 0); + CHECK_EQ(MXAutogradBackwardEx(out_handles.size(), + out_handles.data(), + head_grads_.data(), + 0, + nullptr, + 0, + 0, + 1, + nullptr, + nullptr), + 0); } else { - CHECK_EQ(MXAutogradBackwardEx(out_handles.size(), out_handles.data(), - nullptr, 0, nullptr, 0, 0, 1, - nullptr, nullptr), 0); + CHECK_EQ(MXAutogradBackwardEx(out_handles.size(), + out_handles.data(), + nullptr, + 0, + nullptr, + 0, + 0, + 1, + nullptr, + nullptr), + 0); } grad_arrays.clear(); grad_arrays.reserve(arg_arrays.size()); - for (const auto &array : arg_arrays) { + for (const auto& array : arg_arrays) { NDArrayHandle grad; CHECK_EQ(MXNDArrayGetGrad(array.GetHandle(), &grad), 0); grad_arrays.push_back(NDArray(grad)); @@ -136,9 +159,11 @@ class Executor { // To implement reshape function void Reshape(); /*! - * \brief destructor, free the handle - */ - ~Executor() { MXFreeCachedOp(handle_); } + * \brief destructor, free the handle + */ + ~Executor() { + MXFreeCachedOp(handle_); + } std::vector arg_arrays; std::vector grad_arrays; std::vector aux_arrays; @@ -147,8 +172,8 @@ class Executor { int device_id; bool require_grad; /*! - * \brief arrays store the outputs of forward - */ + * \brief arrays store the outputs of forward + */ std::vector outputs; std::map arg_dict() { return GetDict(symbol_.ListArguments(), arg_arrays); @@ -161,21 +186,19 @@ class Executor { } private: - Executor(const Executor &e); - Executor &operator=(const Executor &e); + Executor(const Executor& e); + Executor& operator=(const Executor& e); CachedOpHandle handle_; Symbol symbol_; - std::map GetDict(const std::vector &names, - const std::vector &arrays) { + std::map GetDict(const std::vector& names, + const std::vector& arrays) { std::map ret; std::set name_set; - for (const auto &s : names) { - CHECK(name_set.find(s) == name_set.end()) << "Duplicate names detected, " - << s; + for (const auto& s : names) { + CHECK(name_set.find(s) == name_set.end()) << "Duplicate names detected, " << s; name_set.insert(s); } - CHECK_EQ(name_set.size(), arrays.size()) - << "names size not equal to arrays size"; + CHECK_EQ(name_set.size(), arrays.size()) << "names size not equal to arrays size"; for (size_t i = 0; i < names.size(); ++i) { ret[names[i]] = arrays[i]; } diff --git a/cpp-package/include/mxnet-cpp/initializer.h b/cpp-package/include/mxnet-cpp/initializer.h index 356591f8bf8e..5f509c2aa5a0 100644 --- a/cpp-package/include/mxnet-cpp/initializer.h +++ b/cpp-package/include/mxnet-cpp/initializer.h @@ -37,16 +37,12 @@ namespace cpp { class Initializer { public: - static bool StringStartWith(const std::string& name, - const std::string& check_str) { - return (name.size() >= check_str.size() && - name.substr(0, check_str.size()) == check_str); + static bool StringStartWith(const std::string& name, const std::string& check_str) { + return (name.size() >= check_str.size() && name.substr(0, check_str.size()) == check_str); } - static bool StringEndWith(const std::string& name, - const std::string& check_str) 
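// Editor's sketch (not part of the original commit): one training step
// against the reformatted Executor interface above. Symbol and optimizer
// setup are elided; exec is assumed to be a ready Executor* created with
// gradients required.
void TrainStep(mxnet::cpp::Executor* exec) {
  exec->Forward(true);      // records the graph for autograd
  exec->Backward();         // empty head_grads: heads are a loss function
  for (auto& grad : exec->grad_arrays) {
    grad.WaitToRead();      // gradients are valid after this point
  }
}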
{ + static bool StringEndWith(const std::string& name, const std::string& check_str) { return (name.size() >= check_str.size() && - name.substr(name.size() - check_str.size(), check_str.size()) == - check_str); + name.substr(name.size() - check_str.size(), check_str.size()) == check_str); } virtual void operator()(const std::string& name, NDArray* arr) { if (StringStartWith(name, "upsampling")) { @@ -84,20 +80,30 @@ class Initializer { virtual void InitBilinear(NDArray* arr) { Shape shape(arr->GetShape()); std::vector weight(shape.Size(), 0); - int f = std::ceil(shape[3] / 2.0); + int f = std::ceil(shape[3] / 2.0); float c = (2 * f - 1 - f % 2) / (2. * f); for (size_t i = 0; i < shape.Size(); ++i) { - int x = i % shape[3]; - int y = (i / shape[3]) % shape[2]; + int x = i % shape[3]; + int y = (i / shape[3]) % shape[2]; weight[i] = (1 - std::abs(x / f - c)) * (1 - std::abs(y / f - c)); } (*arr).SyncCopyFromCPU(weight); } - virtual void InitZero(NDArray* arr) { (*arr) = 0.0f; } - virtual void InitOne(NDArray* arr) { (*arr) = 1.0f; } - virtual void InitBias(NDArray* arr) { (*arr) = 0.0f; } - virtual void InitGamma(NDArray* arr) { (*arr) = 1.0f; } - virtual void InitBeta(NDArray* arr) { (*arr) = 0.0f; } + virtual void InitZero(NDArray* arr) { + (*arr) = 0.0f; + } + virtual void InitOne(NDArray* arr) { + (*arr) = 1.0f; + } + virtual void InitBias(NDArray* arr) { + (*arr) = 0.0f; + } + virtual void InitGamma(NDArray* arr) { + (*arr) = 1.0f; + } + virtual void InitBeta(NDArray* arr) { + (*arr) = 0.0f; + } virtual void InitWeight(NDArray* arr) {} virtual void InitQuantizedWeight(NDArray* arr) { std::default_random_engine generator; @@ -112,32 +118,30 @@ class Initializer { class Constant : public Initializer { public: - explicit Constant(float value) - : value(value) {} - void operator()(const std::string &name, NDArray *arr) override { + explicit Constant(float value) : value(value) {} + void operator()(const std::string& name, NDArray* arr) override { (*arr) = value; } + protected: float value; }; class Zero : public Constant { public: - Zero(): Constant(0.0f) {} + Zero() : Constant(0.0f) {} }; class One : public Constant { public: - One(): Constant(1.0f) {} + One() : Constant(1.0f) {} }; class Uniform : public Initializer { public: - explicit Uniform(float scale) - : Uniform(-scale, scale) {} - Uniform(float begin, float end) - : begin(begin), end(end) {} - void operator()(const std::string &name, NDArray *arr) override { + explicit Uniform(float scale) : Uniform(-scale, scale) {} + Uniform(float begin, float end) : begin(begin), end(end) {} + void operator()(const std::string& name, NDArray* arr) override { if (StringEndWith(name, "weight_quantize")) { InitQuantizedWeight(arr); return; @@ -148,15 +152,15 @@ class Uniform : public Initializer { } NDArray::SampleUniform(begin, end, arr); } + protected: float begin, end; }; class Normal : public Initializer { public: - Normal(float mu, float sigma) - : mu(mu), sigma(sigma) {} - void operator()(const std::string &name, NDArray *arr) override { + Normal(float mu, float sigma) : mu(mu), sigma(sigma) {} + void operator()(const std::string& name, NDArray* arr) override { if (StringEndWith(name, "weight_quantize")) { InitQuantizedWeight(arr); return; @@ -167,6 +171,7 @@ class Normal : public Initializer { } NDArray::SampleGaussian(mu, sigma, arr); } + protected: float mu, sigma; }; @@ -174,7 +179,7 @@ class Normal : public Initializer { class Bilinear : public Initializer { public: Bilinear() {} - void operator()(const std::string &name, NDArray *arr) 
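// Editor's sketch (not part of the original commit): the Initializer
// hierarchy above dispatches on the parameter name, so one object can fill
// weights and fall through to the Init* hooks for bias/gamma/beta names.
// The shape below is illustrative only.
void InitParamsExample() {
  using namespace mxnet::cpp;
  NDArray w(Shape(64, 3, 7, 7), Context::cpu(), /*delay_alloc=*/false);
  Uniform uniform(0.07f);        // samples from [-0.07, 0.07]
  uniform("conv0_weight", &w);
  Normal normal(0.0f, 0.01f);    // mu = 0, sigma = 0.01
  normal("fc1_weight", &w);
  Zero zero;                     // Constant(0.0f)
  zero("fc1_bias", &w);
}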
override { + void operator()(const std::string& name, NDArray* arr) override { if (StringEndWith(name, "weight_quantize")) { InitQuantizedWeight(arr); return; @@ -189,21 +194,13 @@ class Bilinear : public Initializer { class Xavier : public Initializer { public: - enum RandType { - gaussian, - uniform - } rand_type; - enum FactorType { - avg, - in, - out - } factor_type; + enum RandType { gaussian, uniform } rand_type; + enum FactorType { avg, in, out } factor_type; float magnitude; - Xavier(RandType rand_type = gaussian, FactorType factor_type = avg, - float magnitude = 3) + Xavier(RandType rand_type = gaussian, FactorType factor_type = avg, float magnitude = 3) : rand_type(rand_type), factor_type(factor_type), magnitude(magnitude) {} - void operator()(const std::string &name, NDArray* arr) override { + void operator()(const std::string& name, NDArray* arr) override { if (StringEndWith(name, "weight_quantize")) { InitQuantizedWeight(arr); return; diff --git a/cpp-package/include/mxnet-cpp/io.h b/cpp-package/include/mxnet-cpp/io.h index 09fa8061fef6..72441c84dadb 100644 --- a/cpp-package/include/mxnet-cpp/io.h +++ b/cpp-package/include/mxnet-cpp/io.h @@ -18,10 +18,10 @@ */ /*! -* \file operator.h -* \brief definition of io, such as DataIter -* \author Zhang Chen -*/ + * \file operator.h + * \brief definition of io, such as DataIter + * \author Zhang Chen + */ #ifndef MXNET_CPP_IO_H_ #define MXNET_CPP_IO_H_ @@ -36,9 +36,9 @@ namespace mxnet { namespace cpp { /*! -* \brief Default object for holding a mini-batch of data and related -* information. -*/ + * \brief Default object for holding a mini-batch of data and related + * information. + */ class DataBatch { public: NDArray data; @@ -48,17 +48,19 @@ class DataBatch { }; class DataIter { public: - virtual void BeforeFirst(void) = 0; - virtual bool Next(void) = 0; - virtual NDArray GetData(void) = 0; - virtual NDArray GetLabel(void) = 0; - virtual int GetPadNum(void) = 0; + virtual void BeforeFirst(void) = 0; + virtual bool Next(void) = 0; + virtual NDArray GetData(void) = 0; + virtual NDArray GetLabel(void) = 0; + virtual int GetPadNum(void) = 0; virtual std::vector GetIndex(void) = 0; DataBatch GetDataBatch() { return DataBatch{GetData(), GetLabel(), GetPadNum(), GetIndex()}; } - void Reset() { BeforeFirst(); } + void Reset() { + BeforeFirst(); + } virtual ~DataIter() = default; }; @@ -66,25 +68,29 @@ class DataIter { class MXDataIterMap { public: inline MXDataIterMap() { - mx_uint num_data_iter_creators = 0; - DataIterCreator *data_iter_creators = nullptr; + mx_uint num_data_iter_creators = 0; + DataIterCreator* data_iter_creators = nullptr; int r = MXListDataIters(&num_data_iter_creators, &data_iter_creators); CHECK_EQ(r, 0); for (mx_uint i = 0; i < num_data_iter_creators; i++) { - const char *name; - const char *description; + const char* name; + const char* description; mx_uint num_args; - const char **arg_names; - const char **arg_type_infos; - const char **arg_descriptions; - r = MXDataIterGetIterInfo(data_iter_creators[i], &name, &description, - &num_args, &arg_names, &arg_type_infos, + const char** arg_names; + const char** arg_type_infos; + const char** arg_descriptions; + r = MXDataIterGetIterInfo(data_iter_creators[i], + &name, + &description, + &num_args, + &arg_names, + &arg_type_infos, &arg_descriptions); CHECK_EQ(r, 0); mxdataiter_creators_[name] = data_iter_creators[i]; } } - inline DataIterCreator GetMXDataIterCreator(const std::string &name) { + inline DataIterCreator GetMXDataIterCreator(const std::string& name) { return 
mxdataiter_creators_[name]; } @@ -96,19 +102,21 @@ struct MXDataIterBlob { public: MXDataIterBlob() : handle_(nullptr) {} explicit MXDataIterBlob(DataIterHandle handle) : handle_(handle) {} - ~MXDataIterBlob() { MXDataIterFree(handle_); } + ~MXDataIterBlob() { + MXDataIterFree(handle_); + } DataIterHandle handle_; private: - MXDataIterBlob &operator=(const MXDataIterBlob &); + MXDataIterBlob& operator=(const MXDataIterBlob&); }; class MXDataIter : public DataIter { public: - explicit MXDataIter(const std::string &mxdataiter_type); - MXDataIter(const MXDataIter &other) { - creator_ = other.creator_; - params_ = other.params_; + explicit MXDataIter(const std::string& mxdataiter_type); + MXDataIter(const MXDataIter& other) { + creator_ = other.creator_; + params_ = other.params_; blob_ptr_ = other.blob_ptr_; } void BeforeFirst(); @@ -125,7 +133,7 @@ class MXDataIter : public DataIter { * \return reference of self */ template - MXDataIter &SetParam(const std::string &name, const T &value) { + MXDataIter& SetParam(const std::string& name, const T& value) { std::string value_str; std::stringstream ss; ss << value; @@ -145,4 +153,3 @@ class MXDataIter : public DataIter { } // namespace mxnet #endif // MXNET_CPP_IO_H_ - diff --git a/cpp-package/include/mxnet-cpp/kvstore.h b/cpp-package/include/mxnet-cpp/kvstore.h index 0080be1e7306..20267f73b4f7 100644 --- a/cpp-package/include/mxnet-cpp/kvstore.h +++ b/cpp-package/include/mxnet-cpp/kvstore.h @@ -18,10 +18,10 @@ */ /*! -* \file kvstore.h -* \brief definition of kvstore -* \author Chuntao Hong -*/ + * \file kvstore.h + * \brief definition of kvstore + * \author Chuntao Hong + */ #ifndef MXNET_CPP_KVSTORE_H_ #define MXNET_CPP_KVSTORE_H_ @@ -44,15 +44,17 @@ class KVStore { static void Push(int key, const NDArray& val, int priority = 0); static void Push(const std::string& key, const NDArray& val, int priority = 0); static void Push(const std::vector& keys, - const std::vector& vals, int priority = 0); + const std::vector& vals, + int priority = 0); static void Push(const std::vector& keys, - const std::vector& vals, int priority = 0); + const std::vector& vals, + int priority = 0); static void Pull(int key, NDArray* out, int priority = 0); static void Pull(const std::string& key, NDArray* out, int priority = 0); - static void Pull(const std::vector& keys, - std::vector* outs, int priority = 0); + static void Pull(const std::vector& keys, std::vector* outs, int priority = 0); static void Pull(const std::vector& keys, - std::vector* outs, int priority = 0); + std::vector* outs, + int priority = 0); // TODO(lx): put lr in optimizer or not? static void SetOptimizer(std::unique_ptr optimizer, bool local = false); static std::string GetType(); diff --git a/cpp-package/include/mxnet-cpp/lr_scheduler.h b/cpp-package/include/mxnet-cpp/lr_scheduler.h index b9381a830a88..574472d3b7c4 100644 --- a/cpp-package/include/mxnet-cpp/lr_scheduler.h +++ b/cpp-package/include/mxnet-cpp/lr_scheduler.h @@ -18,9 +18,9 @@ */ /*! -* \file lr_scheduler.h -* \brief Scheduling learning rate -*/ + * \file lr_scheduler.h + * \brief Scheduling learning rate + */ #ifndef MXNET_CPP_LR_SCHEDULER_H_ #define MXNET_CPP_LR_SCHEDULER_H_ @@ -31,28 +31,29 @@ namespace mxnet { namespace cpp { /*! -* \brief lr scheduler interface -*/ + * \brief lr scheduler interface + */ class LRScheduler { public: /*! - * \brief constructor - * \param base_lr the initial learning rate. 
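// Editor's sketch (not part of the original commit): the static KVStore
// push/pull API whose signatures were reformatted above. The element types
// of the key/value vectors were stripped in extraction; std::vector<int>
// keys and std::vector<NDArray> values are assumed.
#include <vector>
#include "mxnet-cpp/MxNetCpp.h"

void SyncGradients(const std::vector<int>& keys,
                   const std::vector<mxnet::cpp::NDArray>& grads,
                   std::vector<mxnet::cpp::NDArray>* weights) {
  mxnet::cpp::KVStore::Push(keys, grads, 0);    // aggregate gradients
  mxnet::cpp::KVStore::Pull(keys, weights, 0);  // fetch updated weights
}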
- */ - explicit LRScheduler(float base_lr = 0.01) - : base_lr_(base_lr) {} + * \brief constructor + * \param base_lr the initial learning rate. + */ + explicit LRScheduler(float base_lr = 0.01) : base_lr_(base_lr) {} /*! - * \brief set base lr - * \param lr learning rate from optimizer - */ - void SetLR(const float lr) { base_lr_ = lr; } + * \brief set base lr + * \param lr learning rate from optimizer + */ + void SetLR(const float lr) { + base_lr_ = lr; + } /*! - * \brief get a new learning rate - */ + * \brief get a new learning rate + */ virtual float GetLR(unsigned num_update) = 0; /*! - * \brief destructor - */ + * \brief destructor + */ virtual ~LRScheduler() {} protected: @@ -63,8 +64,8 @@ class FactorScheduler : public LRScheduler { public: explicit FactorScheduler(int step, float factor = 1, float stop_factor_lr = 1e-8) : LRScheduler() { - step_ = step; - factor_ = factor; + step_ = step; + factor_ = factor; stop_factor_lr_ = stop_factor_lr; } @@ -74,8 +75,8 @@ class FactorScheduler : public LRScheduler { base_lr_ *= factor_; if (base_lr_ < stop_factor_lr_) { base_lr_ = stop_factor_lr_; - LG << "Update[" << num_update << "]: now learning rate arrived at " \ - << base_lr_ << ", will not change in the future"; + LG << "Update[" << num_update << "]: now learning rate arrived at " << base_lr_ + << ", will not change in the future"; } else { LG << "Update[" << num_update << "]: Change learning rate to " << base_lr_; } diff --git a/cpp-package/include/mxnet-cpp/metric.h b/cpp-package/include/mxnet-cpp/metric.h index 6dbb197dae49..7e3f39e65b96 100644 --- a/cpp-package/include/mxnet-cpp/metric.h +++ b/cpp-package/include/mxnet-cpp/metric.h @@ -18,10 +18,10 @@ */ /*! -* \file base.h -* \brief metrics defined -* \author Zhang Chen -*/ + * \file base.h + * \brief metrics defined + * \author Zhang Chen + */ #ifndef MXNET_CPP_METRIC_H_ #define MXNET_CPP_METRIC_H_ @@ -38,24 +38,24 @@ namespace cpp { class EvalMetric { public: - explicit EvalMetric(const std::string& name, int num = 0) - : name(name), num(num) {} + explicit EvalMetric(const std::string& name, int num = 0) : name(name), num(num) {} virtual void Update(NDArray labels, NDArray preds) = 0; void Reset() { - num_inst = 0; + num_inst = 0; sum_metric = 0.0f; } - float Get() { return sum_metric / num_inst; } + float Get() { + return sum_metric / num_inst; + } void GetNameValue(); protected: std::string name; int num; float sum_metric = 0.0f; - int num_inst = 0; + int num_inst = 0; - static void CheckLabelShapes(NDArray labels, NDArray preds, - bool strict = false) { + static void CheckLabelShapes(NDArray labels, NDArray preds, bool strict = false) { if (strict) { CHECK_EQ(Shape(labels.GetShape()), Shape(preds.GetShape())); } else { @@ -88,15 +88,14 @@ class LogLoss : public EvalMetric { void Update(NDArray labels, NDArray preds) override { static const float epsilon = 1e-15; - mx_uint len = labels.GetShape()[0]; - mx_uint m = preds.GetShape()[1]; + mx_uint len = labels.GetShape()[0]; + mx_uint m = preds.GetShape()[1]; std::vector pred_data(len * m); std::vector label_data(len); preds.SyncCopyToCPU(&pred_data, pred_data.size()); labels.SyncCopyToCPU(&label_data, len); for (mx_uint i = 0; i < len; ++i) { - sum_metric += - -std::log(std::max(pred_data[i * m + label_data[i]], epsilon)); + sum_metric += -std::log(std::max(pred_data[i * m + label_data[i]], epsilon)); num_inst += 1; } } @@ -114,7 +113,7 @@ class MAE : public EvalMetric { std::vector label_data; labels.SyncCopyToCPU(&label_data); - size_t len = preds.Size(); + size_t len = 
preds.Size(); mx_float sum = 0; for (size_t i = 0; i < len; ++i) { sum += std::abs(pred_data[i] - label_data[i]); @@ -136,7 +135,7 @@ class MSE : public EvalMetric { std::vector label_data; labels.SyncCopyToCPU(&label_data); - size_t len = preds.Size(); + size_t len = preds.Size(); mx_float sum = 0; for (size_t i = 0; i < len; ++i) { mx_float diff = pred_data[i] - label_data[i]; @@ -159,7 +158,7 @@ class RMSE : public EvalMetric { std::vector label_data; labels.SyncCopyToCPU(&label_data); - size_t len = preds.Size(); + size_t len = preds.Size(); mx_float sum = 0; for (size_t i = 0; i < len; ++i) { mx_float diff = pred_data[i] - label_data[i]; @@ -172,8 +171,7 @@ class RMSE : public EvalMetric { class PSNR : public EvalMetric { public: - PSNR() : EvalMetric("psnr") { - } + PSNR() : EvalMetric("psnr") {} void Update(NDArray labels, NDArray preds) override { CheckLabelShapes(labels, preds); @@ -183,7 +181,7 @@ class PSNR : public EvalMetric { std::vector label_data; labels.SyncCopyToCPU(&label_data); - size_t len = preds.Size(); + size_t len = preds.Size(); mx_float sum = 0; for (size_t i = 0; i < len; ++i) { mx_float diff = pred_data[i] - label_data[i]; @@ -206,4 +204,3 @@ class PSNR : public EvalMetric { } // namespace mxnet #endif // MXNET_CPP_METRIC_H_ - diff --git a/cpp-package/include/mxnet-cpp/model.h b/cpp-package/include/mxnet-cpp/model.h index c8af6a476a52..8ca718d0ed83 100644 --- a/cpp-package/include/mxnet-cpp/model.h +++ b/cpp-package/include/mxnet-cpp/model.h @@ -18,10 +18,10 @@ */ /*! -* \file model.h -* \brief MXNET.cpp model module -* \author Zhang Chen -*/ + * \file model.h + * \brief MXNET.cpp model module + * \author Zhang Chen + */ #ifndef MXNET_CPP_MODEL_H_ #define MXNET_CPP_MODEL_H_ @@ -38,9 +38,9 @@ namespace cpp { struct FeedForwardConfig { Symbol symbol; std::vector ctx = {Context::cpu()}; - int num_epoch = 0; - int epoch_size = 0; - std::string optimizer = "sgd"; + int num_epoch = 0; + int epoch_size = 0; + std::string optimizer = "sgd"; // TODO(zhangchen-qinyinghua) More implement // initializer=Uniform(0.01), // numpy_batch_size=128, @@ -48,12 +48,12 @@ struct FeedForwardConfig { // allow_extra_params=False, // begin_epoch=0, // **kwargs): - FeedForwardConfig(const FeedForwardConfig &other) {} + FeedForwardConfig(const FeedForwardConfig& other) {} FeedForwardConfig() {} }; class FeedForward { public: - explicit FeedForward(const FeedForwardConfig &conf) : conf_(conf) {} + explicit FeedForward(const FeedForwardConfig& conf) : conf_(conf) {} void Predict(); void Score(); void Fit(); @@ -73,4 +73,3 @@ class FeedForward { } // namespace mxnet #endif // MXNET_CPP_MODEL_H_ - diff --git a/cpp-package/include/mxnet-cpp/ndarray.h b/cpp-package/include/mxnet-cpp/ndarray.h index 793f0e87d9dd..60c30957a4cc 100644 --- a/cpp-package/include/mxnet-cpp/ndarray.h +++ b/cpp-package/include/mxnet-cpp/ndarray.h @@ -18,10 +18,10 @@ */ /*! -* \file ndarray.h -* \brief definition of ndarray -* \author Chuntao Hong, Zhang Chen -*/ + * \file ndarray.h + * \brief definition of ndarray + * \author Chuntao Hong, Zhang Chen + */ #ifndef MXNET_CPP_NDARRAY_H_ #define MXNET_CPP_NDARRAY_H_ @@ -37,31 +37,31 @@ namespace mxnet { namespace cpp { -enum DeviceType { - kCPU = 1, - kGPU = 2, - kCPUPinned = 3 -}; +enum DeviceType { kCPU = 1, kGPU = 2, kCPUPinned = 3 }; /*! -* \brief Context interface -*/ + * \brief Context interface + */ class Context { public: /*! 
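// Editor's sketch (not part of the original commit): the EvalMetric
// workflow shown above -- Update accumulates, Get averages, Reset clears.
// MSE is assumed to be default-constructible like its siblings, and
// labels/preds are assumed to be same-shape CPU NDArrays.
void Evaluate(mxnet::cpp::MSE* metric,
              const std::vector<mxnet::cpp::NDArray>& labels,
              const std::vector<mxnet::cpp::NDArray>& preds) {
  metric->Reset();
  for (size_t i = 0; i < labels.size(); ++i) {
    metric->Update(labels[i], preds[i]);  // adds per-element squared error
  }
  LG << "mse = " << metric->Get();        // sum_metric / num_inst
}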
- * \brief Context constructor - * \param type type of the device - * \param id id of the device - */ - Context(const DeviceType &type, int id) : type_(type), id_(id) {} + * \brief Context constructor + * \param type type of the device + * \param id id of the device + */ + Context(const DeviceType& type, int id) : type_(type), id_(id) {} /*! - * \return the type of the device - */ - DeviceType GetDeviceType() const { return type_; } + * \return the type of the device + */ + DeviceType GetDeviceType() const { + return type_; + } /*! - * \return the id of the device - */ - int GetDeviceId() const { return id_; } + * \return the id of the device + */ + int GetDeviceId() const { + return id_; + } /*! * \brief Return a GPU context @@ -87,229 +87,231 @@ class Context { }; /*! -* \brief struct to store NDArrayHandle -*/ + * \brief struct to store NDArrayHandle + */ struct NDBlob { public: /*! - * \brief default constructor - */ + * \brief default constructor + */ NDBlob() : handle_(nullptr) {} /*! - * \brief construct with a NDArrayHandle - * \param handle NDArrayHandle to store - */ + * \brief construct with a NDArrayHandle + * \param handle NDArrayHandle to store + */ explicit NDBlob(NDArrayHandle handle) : handle_(handle) {} /*! - * \brief destructor, free the NDArrayHandle - */ - ~NDBlob() { MXNDArrayFree(handle_); } + * \brief destructor, free the NDArrayHandle + */ + ~NDBlob() { + MXNDArrayFree(handle_); + } /*! - * \brief the NDArrayHandle - */ + * \brief the NDArrayHandle + */ NDArrayHandle handle_; private: - NDBlob(const NDBlob &); - NDBlob &operator=(const NDBlob &); + NDBlob(const NDBlob&); + NDBlob& operator=(const NDBlob&); }; /*! -* \brief NDArray interface -*/ + * \brief NDArray interface + */ class NDArray { public: /*! - * \brief construct with a none handle - */ + * \brief construct with a none handle + */ NDArray(); /*! - * \brief construct with a NDArrayHandle - */ - explicit NDArray(const NDArrayHandle &handle); - /*! - * \brief construct a new dynamic NDArray - * \param shape the shape of array - * \param context context of NDArray - * \param delay_alloc whether delay the allocation - * \param dtype data type of NDArray - */ - NDArray(const std::vector &shape, const Context &context, - bool delay_alloc = true, int dtype = 0); - /*! - * \brief construct a new dynamic NDArray - * \param shape the shape of array - * \param constext context of NDArray - * \param delay_alloc whether delay the allocation - * \param dtype data type of NDArray - */ - NDArray(const Shape &shape, const Context &context, - bool delay_alloc = true, int dtype = 0); - NDArray(const mx_float *data, size_t size); - /*! - * \brief construct a new dynamic NDArray - * \param data the data to create NDArray from - * \param shape the shape of array - * \param constext context of NDArray - */ - NDArray(const mx_float *data, const Shape &shape, const Context &context); - /*! - * \brief construct a new dynamic NDArray - * \param data the data to create NDArray from - * \param shape the shape of array - * \param constext context of NDArray - */ - NDArray(const std::vector &data, const Shape &shape, - const Context &context); - explicit NDArray(const std::vector &data); + * \brief construct with a NDArrayHandle + */ + explicit NDArray(const NDArrayHandle& handle); + /*! 
+ * \brief construct a new dynamic NDArray + * \param shape the shape of array + * \param context context of NDArray + * \param delay_alloc whether delay the allocation + * \param dtype data type of NDArray + */ + NDArray(const std::vector& shape, + const Context& context, + bool delay_alloc = true, + int dtype = 0); + /*! + * \brief construct a new dynamic NDArray + * \param shape the shape of array + * \param constext context of NDArray + * \param delay_alloc whether delay the allocation + * \param dtype data type of NDArray + */ + NDArray(const Shape& shape, const Context& context, bool delay_alloc = true, int dtype = 0); + NDArray(const mx_float* data, size_t size); + /*! + * \brief construct a new dynamic NDArray + * \param data the data to create NDArray from + * \param shape the shape of array + * \param constext context of NDArray + */ + NDArray(const mx_float* data, const Shape& shape, const Context& context); + /*! + * \brief construct a new dynamic NDArray + * \param data the data to create NDArray from + * \param shape the shape of array + * \param constext context of NDArray + */ + NDArray(const std::vector& data, const Shape& shape, const Context& context); + explicit NDArray(const std::vector& data); NDArray operator+(mx_float scalar); NDArray operator-(mx_float scalar); NDArray operator*(mx_float scalar); NDArray operator/(mx_float scalar); NDArray operator%(mx_float scalar); - NDArray operator+(const NDArray &); - NDArray operator-(const NDArray &); - NDArray operator*(const NDArray &); - NDArray operator/(const NDArray &); - NDArray operator%(const NDArray &); - /*! - * \brief set all the elements in ndarray to be scalar - * \param scalar the scalar to set - * \return reference of self - */ - NDArray &operator=(mx_float scalar); - /*! - * \brief elementwise add to current space - * this mutate the current NDArray - * \param scalar the data to add - * \return reference of self - */ - NDArray &operator+=(mx_float scalar); - /*! - * \brief elementwise subtract from current ndarray - * this mutate the current NDArray - * \param scalar the data to subtract - * \return reference of self - */ - NDArray &operator-=(mx_float scalar); - /*! - * \brief elementwise multiplication to current ndarray - * this mutate the current NDArray - * \param scalar the data to subtract - * \return reference of self - */ - NDArray &operator*=(mx_float scalar); - /*! - * \brief elementwise division from current ndarray - * this mutate the current NDArray - * \param scalar the data to subtract - * \return reference of self - */ - NDArray &operator/=(mx_float scalar); - /*! - * \brief elementwise modulo from current ndarray - * this mutate the current NDArray - * \param scalar the data to subtract - * \return reference of self - */ - NDArray &operator%=(mx_float scalar); - /*! - * \brief elementwise add to current space - * this mutate the current NDArray - * \param src the data to add - * \return reference of self - */ - NDArray &operator+=(const NDArray &src); - /*! - * \brief elementwise subtract from current ndarray - * this mutate the current NDArray - * \param src the data to subtract - * \return reference of self - */ - NDArray &operator-=(const NDArray &src); - /*! - * \brief elementwise multiplication to current ndarray - * this mutate the current NDArray - * \param src the data to subtract - * \return reference of self - */ - NDArray &operator*=(const NDArray &src); - /*! 
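// Editor's sketch (not part of the original commit): the value semantics
// documented above -- binary operators return new arrays, the compound
// forms mutate in place, and reads must be synchronized with the engine.
void ArithmeticExample() {
  using namespace mxnet::cpp;
  NDArray a(Shape(2, 3), Context::cpu(), /*delay_alloc=*/false);
  NDArray b(Shape(2, 3), Context::cpu(), /*delay_alloc=*/false);
  a = 1.0f;               // operator=(mx_float) fills every element
  b = 2.0f;
  NDArray c = a + b;      // new array; a and b are unchanged
  c *= 2.0f;              // operator*= mutates c in place
  c.WaitToRead();         // block until the async engine has written c
}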
- * \brief elementwise division from current ndarray - * this mutate the current NDArray - * \param src the data to subtract - * \return reference of self - */ - NDArray &operator/=(const NDArray &src); - /*! - * \brief elementwise modulo from current ndarray - * this mutate the current NDArray - * \param src the data to subtract - * \return reference of self - */ - NDArray &operator%=(const NDArray &src); + NDArray operator+(const NDArray&); + NDArray operator-(const NDArray&); + NDArray operator*(const NDArray&); + NDArray operator/(const NDArray&); + NDArray operator%(const NDArray&); + /*! + * \brief set all the elements in ndarray to be scalar + * \param scalar the scalar to set + * \return reference of self + */ + NDArray& operator=(mx_float scalar); + /*! + * \brief elementwise add to current space + * this mutate the current NDArray + * \param scalar the data to add + * \return reference of self + */ + NDArray& operator+=(mx_float scalar); + /*! + * \brief elementwise subtract from current ndarray + * this mutate the current NDArray + * \param scalar the data to subtract + * \return reference of self + */ + NDArray& operator-=(mx_float scalar); + /*! + * \brief elementwise multiplication to current ndarray + * this mutate the current NDArray + * \param scalar the data to subtract + * \return reference of self + */ + NDArray& operator*=(mx_float scalar); + /*! + * \brief elementwise division from current ndarray + * this mutate the current NDArray + * \param scalar the data to subtract + * \return reference of self + */ + NDArray& operator/=(mx_float scalar); + /*! + * \brief elementwise modulo from current ndarray + * this mutate the current NDArray + * \param scalar the data to subtract + * \return reference of self + */ + NDArray& operator%=(mx_float scalar); + /*! + * \brief elementwise add to current space + * this mutate the current NDArray + * \param src the data to add + * \return reference of self + */ + NDArray& operator+=(const NDArray& src); + /*! + * \brief elementwise subtract from current ndarray + * this mutate the current NDArray + * \param src the data to subtract + * \return reference of self + */ + NDArray& operator-=(const NDArray& src); + /*! + * \brief elementwise multiplication to current ndarray + * this mutate the current NDArray + * \param src the data to subtract + * \return reference of self + */ + NDArray& operator*=(const NDArray& src); + /*! + * \brief elementwise division from current ndarray + * this mutate the current NDArray + * \param src the data to subtract + * \return reference of self + */ + NDArray& operator/=(const NDArray& src); + /*! + * \brief elementwise modulo from current ndarray + * this mutate the current NDArray + * \param src the data to subtract + * \return reference of self + */ + NDArray& operator%=(const NDArray& src); NDArray ArgmaxChannel(); /*! - * \brief Do a synchronize copy from a contiguous CPU memory region. - * - * This function will call WaitToWrite before the copy is performed. - * This is useful to copy data from existing memory region that are - * not wrapped by NDArray(thus dependency not being tracked). - * - * \param data the data source to copy from. - * \param size the memory size we want to copy from. - */ - void SyncCopyFromCPU(const mx_float *data, size_t size); - /*! - * \brief Do a synchronize copy from a contiguous CPU memory region. - * - * This function will call WaitToWrite before the copy is performed. 
- * This is useful to copy data from existing memory region that are - * not wrapped by NDArray(thus dependency not being tracked). - * - * \param data the data source to copy from, int the form of mx_float vector - */ - void SyncCopyFromCPU(const std::vector &data); - /*! - * \brief Do a synchronize copy to a contiguous CPU memory region. - * - * This function will call WaitToRead before the copy is performed. - * This is useful to copy data from existing memory region that are - * not wrapped by NDArray(thus dependency not being tracked). - * - * \param data the data source to copyinto. - * \param size the memory size we want to copy into. Defualt value is Size() - */ - void SyncCopyToCPU(mx_float *data, size_t size = 0); - /*! - * \brief Do a synchronize copy to a contiguous CPU memory region. - * - * This function will call WaitToRead before the copy is performed. - * This is useful to copy data from existing memory region that are - * not wrapped by NDArray(thus dependency not being tracked). - * - * \param data the data source to copyinto. - * \param size the memory size we want to copy into. Defualt value is Size() - */ - void SyncCopyToCPU(std::vector *data, size_t size = 0); - /*! - * \brief copy the content of current array to a target array. - * \param other the target NDArray - * \return the target NDarray - */ - NDArray CopyTo(NDArray * other) const; - /*! - * \brief return a new copy to this NDArray - * \param Context the new context of this NDArray - * \return the new copy - */ - NDArray Copy(const Context &) const; - /*! - * \brief return offset of the element at (h, w) - * \param h height position - * \param w width position - * \return offset of two dimensions array - */ + * \brief Do a synchronize copy from a contiguous CPU memory region. + * + * This function will call WaitToWrite before the copy is performed. + * This is useful to copy data from existing memory region that are + * not wrapped by NDArray(thus dependency not being tracked). + * + * \param data the data source to copy from. + * \param size the memory size we want to copy from. + */ + void SyncCopyFromCPU(const mx_float* data, size_t size); + /*! + * \brief Do a synchronize copy from a contiguous CPU memory region. + * + * This function will call WaitToWrite before the copy is performed. + * This is useful to copy data from existing memory region that are + * not wrapped by NDArray(thus dependency not being tracked). + * + * \param data the data source to copy from, int the form of mx_float vector + */ + void SyncCopyFromCPU(const std::vector& data); + /*! + * \brief Do a synchronize copy to a contiguous CPU memory region. + * + * This function will call WaitToRead before the copy is performed. + * This is useful to copy data from existing memory region that are + * not wrapped by NDArray(thus dependency not being tracked). + * + * \param data the data source to copyinto. + * \param size the memory size we want to copy into. Defualt value is Size() + */ + void SyncCopyToCPU(mx_float* data, size_t size = 0); + /*! + * \brief Do a synchronize copy to a contiguous CPU memory region. + * + * This function will call WaitToRead before the copy is performed. + * This is useful to copy data from existing memory region that are + * not wrapped by NDArray(thus dependency not being tracked). + * + * \param data the data source to copyinto. + * \param size the memory size we want to copy into. Defualt value is Size() + */ + void SyncCopyToCPU(std::vector* data, size_t size = 0); + /*! 
+ * \brief copy the content of current array to a target array. + * \param other the target NDArray + * \return the target NDarray + */ + NDArray CopyTo(NDArray* other) const; + /*! + * \brief return a new copy to this NDArray + * \param Context the new context of this NDArray + * \return the new copy + */ + NDArray Copy(const Context&) const; + /*! + * \brief return offset of the element at (h, w) + * \param h height position + * \param w width position + * \return offset of two dimensions array + */ size_t Offset(size_t h = 0, size_t w = 0) const; /*! * \brief return offset of three dimensions array @@ -320,17 +322,17 @@ class NDArray { */ size_t Offset(size_t c, size_t h, size_t w) const; /*! - * \brief return value of the element at (index) - * \param index position - * \return value of one dimensions array - */ + * \brief return value of the element at (index) + * \param index position + * \return value of one dimensions array + */ mx_float At(size_t index) const; /*! - * \brief return value of the element at (h, w) - * \param h height position - * \param w width position - * \return value of two dimensions array - */ + * \brief return value of the element at (h, w) + * \param h height position + * \param w width position + * \return value of two dimensions array + */ mx_float At(size_t h, size_t w) const; /*! * \brief return value of three dimensions array @@ -341,143 +343,144 @@ class NDArray { */ mx_float At(size_t c, size_t h, size_t w) const; /*! - * \brief Slice a NDArray - * \param begin begin index in first dim - * \param end end index in first dim - * \return sliced NDArray - */ + * \brief Slice a NDArray + * \param begin begin index in first dim + * \param end end index in first dim + * \return sliced NDArray + */ NDArray Slice(mx_uint begin, mx_uint end) const; /*! - * \brief Return a reshaped NDArray that shares memory with current one - * \param new_shape the new shape - * \return reshaped NDarray - */ - NDArray Reshape(const Shape &new_shape) const; + * \brief Return a reshaped NDArray that shares memory with current one + * \param new_shape the new shape + * \return reshaped NDarray + */ + NDArray Reshape(const Shape& new_shape) const; /*! - * \brief Block until all the pending write operations with respect - * to current NDArray are finished, and read can be performed. - */ + * \brief Block until all the pending write operations with respect + * to current NDArray are finished, and read can be performed. + */ void WaitToRead() const; /*! - * \brief Block until all the pending read/write operations with respect - * to current NDArray are finished, and write can be performed. - */ + * \brief Block until all the pending read/write operations with respect + * to current NDArray are finished, and write can be performed. + */ void WaitToWrite(); /*! - * \brief Block until all the pending read/write operations with respect - * to current NDArray are finished, and read/write can be performed. - */ + * \brief Block until all the pending read/write operations with respect + * to current NDArray are finished, and read/write can be performed. + */ static void WaitAll(); /*! - * \brief Sample gaussian distribution for each elements of out. - * \param mu mean of gaussian distribution. - * \param sigma standard deviation of gaussian distribution. - * \param out output NDArray. - */ - static void SampleGaussian(mx_float mu, mx_float sigma, NDArray *out); - /*! - * \brief Sample uniform distribution for each elements of out. - * \param begin lower bound of distribution. 
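// Editor's sketch (not part of the original commit): a round trip through
// the synchronous copy helpers documented above. SyncCopyFromCPU waits for
// pending writes and SyncCopyToCPU waits for pending reads, so no explicit
// WaitAll() is needed around them.
void RoundTripExample() {
  using namespace mxnet::cpp;
  std::vector<mx_float> src = {1.f, 2.f, 3.f, 4.f};
  NDArray arr(Shape(2, 2), Context::cpu(), /*delay_alloc=*/false);
  arr.SyncCopyFromCPU(src.data(), src.size());
  std::vector<mx_float> dst;
  arr.SyncCopyToCPU(&dst);  // size 0 defaults to arr.Size()
}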
- * \param end upper bound of distribution. - * \param out output NDArray. - */ - static void SampleUniform(mx_float begin, mx_float end, NDArray *out); - /*! - * \brief Load NDArrays from binary file. - * \param file_name name of the binary file. - * \param array_list a list of NDArrays returned, do not fill the list if - * nullptr is given. - * \param array_map a map from names to NDArrays returned, do not fill the map - * if nullptr is given or no names is stored in binary file. - */ - static void Load(const std::string &file_name, - std::vector *array_list = nullptr, - std::map *array_map = nullptr); - /*! - * \brief Load map of NDArrays from binary file. - * \param file_name name of the binary file. - * \return a list of NDArrays. - */ - static std::map LoadToMap(const std::string &file_name); - /*! - * \brief Load list of NDArrays from binary file. - * \param file_name name of the binary file. - * \return a map from names to NDArrays. - */ - static std::vector LoadToList(const std::string &file_name); - /*! - * \brief Load NDArrays from buffer. - * \param buffer Pointer to buffer. (ie contents of param file) - * \param size Size of buffer - * \param array_list a list of NDArrays returned, do not fill the list if - * nullptr is given. - * \param array_map a map from names to NDArrays returned, do not fill the map - * if nullptr is given or no names is stored in binary file. - */ - static void LoadFromBuffer(const void *buffer, size_t size, - std::vector *array_list = nullptr, - std::map *array_map = nullptr); - /*! - * \brief Load map of NDArrays from buffer. - * \param buffer Pointer to buffer. (ie contents of param file) - * \param size Size of buffer - * \return a list of NDArrays. - */ - static std::map LoadFromBufferToMap(const void *buffer, size_t size); - /*! - * \brief Load list of NDArrays from buffer. - * \param buffer Pointer to buffer. (ie contents of param file) - * \param size Size of buffer - * \return a map from names to NDArrays. - */ - static std::vector LoadFromBufferToList(const void *buffer, size_t size); - /*! - * \brief save a map of string->NDArray to binary file. - * \param file_name name of the binary file. - * \param array_map a map from names to NDArrays. - */ - static void Save(const std::string &file_name, - const std::map &array_map); - /*! - * \brief save a list of NDArrays to binary file. - * \param file_name name of the binary file. - * \param array_list a list of NDArrays. - */ - static void Save(const std::string &file_name, - const std::vector &array_list); - /*! - * \return the size of current NDArray, a.k.a. the production of all shape dims - */ + * \brief Sample gaussian distribution for each elements of out. + * \param mu mean of gaussian distribution. + * \param sigma standard deviation of gaussian distribution. + * \param out output NDArray. + */ + static void SampleGaussian(mx_float mu, mx_float sigma, NDArray* out); + /*! + * \brief Sample uniform distribution for each elements of out. + * \param begin lower bound of distribution. + * \param end upper bound of distribution. + * \param out output NDArray. + */ + static void SampleUniform(mx_float begin, mx_float end, NDArray* out); + /*! + * \brief Load NDArrays from binary file. + * \param file_name name of the binary file. + * \param array_list a list of NDArrays returned, do not fill the list if + * nullptr is given. + * \param array_map a map from names to NDArrays returned, do not fill the map + * if nullptr is given or no names is stored in binary file. 
+ */ + static void Load(const std::string& file_name, + std::vector* array_list = nullptr, + std::map* array_map = nullptr); + /*! + * \brief Load map of NDArrays from binary file. + * \param file_name name of the binary file. + * \return a list of NDArrays. + */ + static std::map LoadToMap(const std::string& file_name); + /*! + * \brief Load list of NDArrays from binary file. + * \param file_name name of the binary file. + * \return a map from names to NDArrays. + */ + static std::vector LoadToList(const std::string& file_name); + /*! + * \brief Load NDArrays from buffer. + * \param buffer Pointer to buffer. (ie contents of param file) + * \param size Size of buffer + * \param array_list a list of NDArrays returned, do not fill the list if + * nullptr is given. + * \param array_map a map from names to NDArrays returned, do not fill the map + * if nullptr is given or no names is stored in binary file. + */ + static void LoadFromBuffer(const void* buffer, + size_t size, + std::vector* array_list = nullptr, + std::map* array_map = nullptr); + /*! + * \brief Load map of NDArrays from buffer. + * \param buffer Pointer to buffer. (ie contents of param file) + * \param size Size of buffer + * \return a list of NDArrays. + */ + static std::map LoadFromBufferToMap(const void* buffer, size_t size); + /*! + * \brief Load list of NDArrays from buffer. + * \param buffer Pointer to buffer. (ie contents of param file) + * \param size Size of buffer + * \return a map from names to NDArrays. + */ + static std::vector LoadFromBufferToList(const void* buffer, size_t size); + /*! + * \brief save a map of string->NDArray to binary file. + * \param file_name name of the binary file. + * \param array_map a map from names to NDArrays. + */ + static void Save(const std::string& file_name, const std::map& array_map); + /*! + * \brief save a list of NDArrays to binary file. + * \param file_name name of the binary file. + * \param array_list a list of NDArrays. + */ + static void Save(const std::string& file_name, const std::vector& array_list); + /*! + * \return the size of current NDArray, a.k.a. the production of all shape dims + */ size_t Size() const; /*! - * \return the shape of current NDArray, in the form of mx_uint vector - */ + * \return the shape of current NDArray, in the form of mx_uint vector + */ std::vector GetShape() const; /*! - * \return the data type of current NDArray - */ + * \return the data type of current NDArray + */ int GetDType() const; /*! - * \brief Get the pointer to data (IMPORTANT: The ndarray should not be in GPU) - * \return the data pointer to the current NDArray - */ - const mx_float *GetData() const; + * \brief Get the pointer to data (IMPORTANT: The ndarray should not be in GPU) + * \return the data pointer to the current NDArray + */ + const mx_float* GetData() const; /*! - * \return the context of NDArray - */ + * \return the context of NDArray + */ Context GetContext() const; /*! 
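// Editor's sketch (not part of the original commit): a save/load round
// trip over the static helpers listed above. "params.bin" is a placeholder
// file name, and the map value type (stripped in extraction) is assumed to
// be NDArray.
void CheckpointExample(const std::map<std::string, mxnet::cpp::NDArray>& params) {
  mxnet::cpp::NDArray::Save("params.bin", params);
  std::map<std::string, mxnet::cpp::NDArray> loaded =
      mxnet::cpp::NDArray::LoadToMap("params.bin");
}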
- * \return the NDArrayHandle of the current NDArray - */ - NDArrayHandle GetHandle() const { return blob_ptr_->handle_; } + * \return the NDArrayHandle of the current NDArray + */ + NDArrayHandle GetHandle() const { + return blob_ptr_->handle_; + } private: std::shared_ptr blob_ptr_; }; -std::ostream& operator<<(std::ostream& out, const NDArray &ndarray); +std::ostream& operator<<(std::ostream& out, const NDArray& ndarray); } // namespace cpp } // namespace mxnet diff --git a/cpp-package/include/mxnet-cpp/op_map.h b/cpp-package/include/mxnet-cpp/op_map.h index b54cc0ae2c01..fd6944733470 100644 --- a/cpp-package/include/mxnet-cpp/op_map.h +++ b/cpp-package/include/mxnet-cpp/op_map.h @@ -18,10 +18,10 @@ */ /*! -* \file op_map.h -* \brief definition of OpMap -* \author Chuntao Hong -*/ + * \file op_map.h + * \brief definition of OpMap + * \author Chuntao Hong + */ #ifndef MXNET_CPP_OP_MAP_H_ #define MXNET_CPP_OP_MAP_H_ @@ -35,38 +35,42 @@ namespace mxnet { namespace cpp { /*! -* \brief OpMap instance holds a map of all the symbol creators so we can -* get symbol creators by name. -* This is used internally by Symbol and Operator. -*/ + * \brief OpMap instance holds a map of all the symbol creators so we can + * get symbol creators by name. + * This is used internally by Symbol and Operator. + */ class OpMap { public: /*! - * \brief Create an Mxnet instance - */ + * \brief Create an Mxnet instance + */ inline OpMap() { - mx_uint num_symbol_creators = 0; - AtomicSymbolCreator *symbol_creators = nullptr; - int r = - MXSymbolListAtomicSymbolCreators(&num_symbol_creators, &symbol_creators); + mx_uint num_symbol_creators = 0; + AtomicSymbolCreator* symbol_creators = nullptr; + int r = MXSymbolListAtomicSymbolCreators(&num_symbol_creators, &symbol_creators); CHECK_EQ(r, 0); for (mx_uint i = 0; i < num_symbol_creators; i++) { - const char *name; - const char *description; + const char* name; + const char* description; mx_uint num_args; - const char **arg_names; - const char **arg_type_infos; - const char **arg_descriptions; - const char *key_var_num_args; - r = MXSymbolGetAtomicSymbolInfo(symbol_creators[i], &name, &description, - &num_args, &arg_names, &arg_type_infos, - &arg_descriptions, &key_var_num_args); + const char** arg_names; + const char** arg_type_infos; + const char** arg_descriptions; + const char* key_var_num_args; + r = MXSymbolGetAtomicSymbolInfo(symbol_creators[i], + &name, + &description, + &num_args, + &arg_names, + &arg_type_infos, + &arg_descriptions, + &key_var_num_args); CHECK_EQ(r, 0); symbol_creators_[name] = symbol_creators[i]; } nn_uint num_ops; - const char **op_names; + const char** op_names; r = NNListAllOpNames(&num_ops, &op_names); CHECK_EQ(r, 0); for (nn_uint i = 0; i < num_ops; i++) { @@ -78,24 +82,24 @@ class OpMap { } /*! - * \brief Get a symbol creator with its name. - * - * \param name name of the symbol creator - * \return handle to the symbol creator - */ - inline AtomicSymbolCreator GetSymbolCreator(const std::string &name) { + * \brief Get a symbol creator with its name. + * + * \param name name of the symbol creator + * \return handle to the symbol creator + */ + inline AtomicSymbolCreator GetSymbolCreator(const std::string& name) { if (symbol_creators_.count(name) == 0) return GetOpHandle(name); return symbol_creators_[name]; } /*! - * \brief Get an op handle with its name. - * - * \param name name of the op - * \return handle to the op - */ - inline OpHandle GetOpHandle(const std::string &name) { + * \brief Get an op handle with its name. 
+ * + * \param name name of the op + * \return handle to the op + */ + inline OpHandle GetOpHandle(const std::string& name) { return op_handles_[name]; } diff --git a/cpp-package/include/mxnet-cpp/op_suppl.h b/cpp-package/include/mxnet-cpp/op_suppl.h index 52cdae772a68..d72b83c11671 100644 --- a/cpp-package/include/mxnet-cpp/op_suppl.h +++ b/cpp-package/include/mxnet-cpp/op_suppl.h @@ -18,10 +18,10 @@ */ /*! -* \file op_suppl.h -* \brief A supplement and amendment of the operators from op.h -* \author Zhang Chen, zhubuntu, Xin Li -*/ + * \file op_suppl.h + * \brief A supplement and amendment of the operators from op.h + * \author Zhang Chen, zhubuntu, Xin Li + */ #ifndef MXNET_CPP_OP_SUPPL_H_ #define MXNET_CPP_OP_SUPPL_H_ @@ -38,118 +38,85 @@ namespace mxnet { namespace cpp { inline Symbol _Plus(Symbol lhs, Symbol rhs) { - return Operator("_Plus")(lhs, rhs) - .CreateSymbol(); + return Operator("_Plus")(lhs, rhs).CreateSymbol(); } inline Symbol _Mul(Symbol lhs, Symbol rhs) { - return Operator("_Mul")(lhs, rhs) - .CreateSymbol(); + return Operator("_Mul")(lhs, rhs).CreateSymbol(); } inline Symbol _Minus(Symbol lhs, Symbol rhs) { - return Operator("_Minus")(lhs, rhs) - .CreateSymbol(); + return Operator("_Minus")(lhs, rhs).CreateSymbol(); } inline Symbol _Div(Symbol lhs, Symbol rhs) { - return Operator("_Div")(lhs, rhs) - .CreateSymbol(); + return Operator("_Div")(lhs, rhs).CreateSymbol(); } inline Symbol _Mod(Symbol lhs, Symbol rhs) { - return Operator("_Mod")(lhs, rhs) - .CreateSymbol(); + return Operator("_Mod")(lhs, rhs).CreateSymbol(); } inline Symbol _Power(Symbol lhs, Symbol rhs) { - return Operator("_Power")(lhs, rhs) - .CreateSymbol(); + return Operator("_Power")(lhs, rhs).CreateSymbol(); } inline Symbol _Maximum(Symbol lhs, Symbol rhs) { - return Operator("_Maximum")(lhs, rhs) - .CreateSymbol(); + return Operator("_Maximum")(lhs, rhs).CreateSymbol(); } inline Symbol _Minimum(Symbol lhs, Symbol rhs) { - return Operator("_Minimum")(lhs, rhs) - .CreateSymbol(); + return Operator("_Minimum")(lhs, rhs).CreateSymbol(); } inline Symbol _PlusScalar(Symbol lhs, mx_float scalar) { - return Operator("_PlusScalar")(lhs) - .SetParam("scalar", scalar) - .CreateSymbol(); + return Operator("_PlusScalar")(lhs).SetParam("scalar", scalar).CreateSymbol(); } inline Symbol _MinusScalar(Symbol lhs, mx_float scalar) { - return Operator("_MinusScalar")(lhs) - .SetParam("scalar", scalar) - .CreateSymbol(); + return Operator("_MinusScalar")(lhs).SetParam("scalar", scalar).CreateSymbol(); } inline Symbol _RMinusScalar(mx_float scalar, Symbol rhs) { - return Operator("_RMinusScalar")(rhs) - .SetParam("scalar", scalar) - .CreateSymbol(); + return Operator("_RMinusScalar")(rhs).SetParam("scalar", scalar).CreateSymbol(); } inline Symbol _MulScalar(Symbol lhs, mx_float scalar) { - return Operator("_MulScalar")(lhs) - .SetParam("scalar", scalar) - .CreateSymbol(); + return Operator("_MulScalar")(lhs).SetParam("scalar", scalar).CreateSymbol(); } inline Symbol _DivScalar(Symbol lhs, mx_float scalar) { - return Operator("_DivScalar")(lhs) - .SetParam("scalar", scalar) - .CreateSymbol(); + return Operator("_DivScalar")(lhs).SetParam("scalar", scalar).CreateSymbol(); } inline Symbol _RDivScalar(mx_float scalar, Symbol rhs) { - return Operator("_RDivScalar")(rhs) - .SetParam("scalar", scalar) - .CreateSymbol(); + return Operator("_RDivScalar")(rhs).SetParam("scalar", scalar).CreateSymbol(); } inline Symbol _ModScalar(Symbol lhs, mx_float scalar) { - return Operator("_ModScalar")(lhs) - .SetParam("scalar", scalar) - 
.CreateSymbol(); + return Operator("_ModScalar")(lhs).SetParam("scalar", scalar).CreateSymbol(); } inline Symbol _RModScalar(mx_float scalar, Symbol rhs) { - return Operator("_RModScalar")(rhs) - .SetParam("scalar", scalar) - .CreateSymbol(); + return Operator("_RModScalar")(rhs).SetParam("scalar", scalar).CreateSymbol(); } inline Symbol _PowerScalar(Symbol lhs, mx_float scalar) { - return Operator("_PowerScalar")(lhs) - .SetParam("scalar", scalar) - .CreateSymbol(); + return Operator("_PowerScalar")(lhs).SetParam("scalar", scalar).CreateSymbol(); } inline Symbol _RPowerScalar(mx_float scalar, Symbol rhs) { - return Operator("_RPowerScalar")(rhs) - .SetParam("scalar", scalar) - .CreateSymbol(); + return Operator("_RPowerScalar")(rhs).SetParam("scalar", scalar).CreateSymbol(); } inline Symbol _MaximumScalar(Symbol lhs, mx_float scalar) { - return Operator("_MaximumScalar")(lhs) - .SetParam("scalar", scalar) - .CreateSymbol(); + return Operator("_MaximumScalar")(lhs).SetParam("scalar", scalar).CreateSymbol(); } inline Symbol _MinimumScalar(Symbol lhs, mx_float scalar) { - return Operator("_MinimumScalar")(lhs) - .SetParam("scalar", scalar) - .CreateSymbol(); + return Operator("_MinimumScalar")(lhs).SetParam("scalar", scalar).CreateSymbol(); } // TODO(zhangcheng-qinyinghua) // make crop function run in op.h // This function is due to [zhubuntu](https://github.com/zhubuntu) inline Symbol Crop(const std::string& symbol_name, - int num_args, - Symbol data, - Symbol crop_like, - Shape offset = Shape(0, 0), - Shape h_w = Shape(0, 0), - bool center_crop = false) { + int num_args, + Symbol data, + Symbol crop_like, + Shape offset = Shape(0, 0), + Shape h_w = Shape(0, 0), + bool center_crop = false) { return Operator("Crop") - .SetParam("num_args", num_args) - .SetParam("offset", offset) - .SetParam("h_w", h_w) - .SetParam("center_crop", center_crop) - .SetInput("arg0", data) - .SetInput("arg1", crop_like) - .CreateSymbol(symbol_name); + .SetParam("num_args", num_args) + .SetParam("offset", offset) + .SetParam("h_w", h_w) + .SetParam("center_crop", center_crop) + .SetInput("arg0", data) + .SetInput("arg1", crop_like) + .CreateSymbol(symbol_name); } - /*! * \brief Apply activation function to input. * Softmax Activation is only available with CUDNN on GPUand will be @@ -159,21 +126,16 @@ inline Symbol Crop(const std::string& symbol_name, * \param act_type Activation function to be applied. * \return new symbol */ -inline Symbol Activation(const std::string& symbol_name, - Symbol data, - const std::string& act_type) { - assert(act_type == "relu" || - act_type == "sigmoid" || - act_type == "softrelu" || +inline Symbol Activation(const std::string& symbol_name, Symbol data, const std::string& act_type) { + assert(act_type == "relu" || act_type == "sigmoid" || act_type == "softrelu" || act_type == "tanh"); return Operator("Activation") - .SetParam("act_type", act_type.c_str()) - .SetInput("data", data) - .CreateSymbol(symbol_name); + .SetParam("act_type", act_type.c_str()) + .SetInput("data", data) + .CreateSymbol(symbol_name); } } // namespace cpp } // namespace mxnet #endif // MXNET_CPP_OP_SUPPL_H_ - diff --git a/cpp-package/include/mxnet-cpp/op_util.h b/cpp-package/include/mxnet-cpp/op_util.h index 20e06a851814..616bbbb44886 100644 --- a/cpp-package/include/mxnet-cpp/op_util.h +++ b/cpp-package/include/mxnet-cpp/op_util.h @@ -18,10 +18,10 @@ */ /*! 
-* \file op_util.h -* \brief operator helper functions -* \author Chris Olivier -*/ + * \file op_util.h + * \brief operator helper functions + * \author Chris Olivier + */ #ifndef MXNET_CPP_OP_UTIL_H_ #define MXNET_CPP_OP_UTIL_H_ @@ -45,12 +45,12 @@ inline ::caffe::LayerParameter textToCaffeLayerParameter(const std::string& text return ::caffe::LayerParameter(np.layer(0)); } -template -inline StreamType& operator << (StreamType& os, const ::caffe::LayerParameter& op) { +template +inline StreamType& operator<<(StreamType& os, const ::caffe::LayerParameter& op) { std::string s; caffe::NetParameter np; // Avoid wasting time making a copy -- just push in out default object's pointer - np.mutable_layer()->AddAllocated(const_cast<::caffe::LayerParameter *>(&op)); + np.mutable_layer()->AddAllocated(const_cast<::caffe::LayerParameter*>(&op)); google::protobuf::TextFormat::PrintToString(np, &s); np.mutable_layer()->ReleaseLast(); os << s; diff --git a/cpp-package/include/mxnet-cpp/operator.h b/cpp-package/include/mxnet-cpp/operator.h index e8dad12d6053..64c283c3d497 100644 --- a/cpp-package/include/mxnet-cpp/operator.h +++ b/cpp-package/include/mxnet-cpp/operator.h @@ -18,10 +18,10 @@ */ /*! -* \file operator.h -* \brief definition of operator -* \author Chuntao Hong, Zhang Chen -*/ + * \file operator.h + * \brief definition of operator + * \author Chuntao Hong, Zhang Chen + */ #ifndef MXNET_CPP_OPERATOR_H_ #define MXNET_CPP_OPERATOR_H_ @@ -37,24 +37,24 @@ namespace mxnet { namespace cpp { class Mxnet; /*! -* \brief Operator interface -*/ + * \brief Operator interface + */ class Operator { public: /*! - * \brief Operator constructor - * \param operator_name type of the operator - */ - explicit Operator(const std::string &operator_name); - Operator &operator=(const Operator &rhs); - /*! - * \brief set config parameters - * \param name name of the config parameter - * \param value value of the config parameter - * \return reference of self - */ + * \brief Operator constructor + * \param operator_name type of the operator + */ + explicit Operator(const std::string& operator_name); + Operator& operator=(const Operator& rhs); + /*! + * \brief set config parameters + * \param name name of the config parameter + * \param value value of the config parameter + * \return reference of self + */ template - Operator &SetParam(const std::string &name, const T &value) { + Operator& SetParam(const std::string& name, const T& value) { std::string value_str; std::stringstream ss; ss << value; @@ -64,13 +64,13 @@ class Operator { return *this; } /*! - * \brief set config parameters from positional inputs - * \param pos the position of parameter - * \param value value of the config parameter - * \return reference of self - */ + * \brief set config parameters from positional inputs + * \param pos the position of parameter + * \param value value of the config parameter + * \return reference of self + */ template - Operator &SetParam(int pos, const T &value) { + Operator& SetParam(int pos, const T& value) { std::string value_str; std::stringstream ss; ss << value; @@ -80,117 +80,119 @@ class Operator { return *this; } /*! - * \brief add an input symbol - * \param name name of the input symbol - * \param symbol the input symbol - * \return reference of self - */ - Operator &SetInput(const std::string &name, const Symbol &symbol); - /*! 
- * \brief add an input symbol - * \param symbol the input symbol - */ - template - void PushInput(const Symbol &symbol) { + * \brief add an input symbol + * \param name name of the input symbol + * \param symbol the input symbol + * \return reference of self + */ + Operator& SetInput(const std::string& name, const Symbol& symbol); + /*! + * \brief add an input symbol + * \param symbol the input symbol + */ + template + void PushInput(const Symbol& symbol) { input_symbols_.push_back(symbol.GetHandle()); } /*! - * \brief add input symbols - * \return reference of self - */ - Operator &operator()() { return *this; } + * \brief add input symbols + * \return reference of self + */ + Operator& operator()() { + return *this; + } /*! - * \brief add input symbols - * \param symbol the input symbol - * \return reference of self - */ - Operator &operator()(const Symbol &symbol) { + * \brief add input symbols + * \param symbol the input symbol + * \return reference of self + */ + Operator& operator()(const Symbol& symbol) { input_symbols_.push_back(symbol.GetHandle()); return *this; } /*! - * \brief add a list of input symbols - * \param symbols the vector of the input symbols - * \return reference of self - */ - Operator &operator()(const std::vector &symbols) { - for (auto &s : symbols) { + * \brief add a list of input symbols + * \param symbols the vector of the input symbols + * \return reference of self + */ + Operator& operator()(const std::vector& symbols) { + for (auto& s : symbols) { input_symbols_.push_back(s.GetHandle()); } return *this; } /*! - * \brief create a Symbol from the current operator - * \param name the name of the operator - * \return the operator Symbol - */ - Symbol CreateSymbol(const std::string &name = ""); + * \brief create a Symbol from the current operator + * \param name the name of the operator + * \return the operator Symbol + */ + Symbol CreateSymbol(const std::string& name = ""); /*! - * \brief add an input ndarray - * \param name name of the input ndarray - * \param ndarray the input ndarray - * \return reference of self - */ - Operator &SetInput(const std::string &name, const NDArray &ndarray); - /*! - * \brief add an input ndarray - * \param ndarray the input ndarray - */ - template - Operator &PushInput(const NDArray &ndarray) { + * \brief add an input ndarray + * \param name name of the input ndarray + * \param ndarray the input ndarray + * \return reference of self + */ + Operator& SetInput(const std::string& name, const NDArray& ndarray); + /*! + * \brief add an input ndarray + * \param ndarray the input ndarray + */ + template + Operator& PushInput(const NDArray& ndarray) { input_ndarrays_.push_back(ndarray.GetHandle()); return *this; } /*! - * \brief add positional inputs - */ + * \brief add positional inputs + */ template - Operator &PushInput(const T &t, Args... args) { + Operator& PushInput(const T& t, Args... args) { SetParam(N, t); - PushInput(args...); + PushInput(args...); return *this; } /*! - * \brief add the last positional input - */ + * \brief add the last positional input + */ template - Operator &PushInput(const T &t) { + Operator& PushInput(const T& t) { SetParam(N, t); return *this; } /*! 
- * \brief add input ndarrays
- * \param ndarray the input ndarray
- * \return reference of self
- */
- Operator &operator()(const NDArray &ndarray) {
+ * \brief add input ndarrays
+ * \param ndarray the input ndarray
+ * \return reference of self
+ */
+ Operator& operator()(const NDArray& ndarray) {
 input_ndarrays_.push_back(ndarray.GetHandle());
 return *this;
 }
 /*!
- * \brief add a list of input ndarrays
- * \param ndarrays the vector of the input ndarrays
- * \return reference of self
- */
- Operator &operator()(const std::vector<NDArray> &ndarrays) {
- for (auto &s : ndarrays) {
+ * \brief add a list of input ndarrays
+ * \param ndarrays the vector of the input ndarrays
+ * \return reference of self
+ */
+ Operator& operator()(const std::vector<NDArray>& ndarrays) {
+ for (auto& s : ndarrays) {
 input_ndarrays_.push_back(s.GetHandle());
 }
 return *this;
 }
 /*!
- * \brief add input ndarrays
- * \return reference of self
- */
+ * \brief add input ndarrays
+ * \return reference of self
+ */
 template <typename... Args>
- Operator &operator()(Args... args) {
+ Operator& operator()(Args... args) {
 PushInput(args...);
 return *this;
 }
 std::vector<NDArray> Invoke();
- void Invoke(NDArray &output);
- void Invoke(std::vector<NDArray> &outputs);
+ void Invoke(NDArray& output);
+ void Invoke(std::vector<NDArray>& outputs);

 private:
 std::map<std::string, std::string> params_desc_;
diff --git a/cpp-package/include/mxnet-cpp/optimizer.h b/cpp-package/include/mxnet-cpp/optimizer.h
index 118c10ae12d9..b853703c5f6b 100644
--- a/cpp-package/include/mxnet-cpp/optimizer.h
+++ b/cpp-package/include/mxnet-cpp/optimizer.h
@@ -18,10 +18,10 @@
  */
 /*!
-* \file optimizer.h
-* \brief definition of optimizer
-* \author Chuntao Hong, Zhang Chen
-*/
+ * \file optimizer.h
+ * \brief definition of optimizer
+ * \author Chuntao Hong, Zhang Chen
+ */
 #ifndef MXNET_CPP_OPTIMIZER_H_
 #define MXNET_CPP_OPTIMIZER_H_
@@ -42,32 +42,32 @@ namespace mxnet {
 namespace cpp {
 /*!
-* \brief Optimizer interface
-*/
+ * \brief Optimizer interface
+ */
 class Optimizer {
  public:
 /*!
- * \brief constructor
- * \param beign_num_update The initial number of updates
- */
+ * \brief constructor
+ * \param begin_num_update The initial number of updates
+ */
 explicit Optimizer(unsigned begin_num_update);
 /*!
- * \brief get optimizer type
- * \return string of optimizer type
- */
+ * \brief get optimizer type
+ * \return string of optimizer type
+ */
 virtual std::string GetType() const = 0;
 /*!
- * \brief destructor
- */
+ * \brief destructor
+ */
 virtual ~Optimizer();
 /*!
- * \brief set config parameters
- * \param name name of the config parameter
- * \param value value of the config parameter
- * \return reference of self
- */
+ * \brief set config parameters
+ * \param name name of the config parameter
+ * \param value value of the config parameter
+ * \return reference of self
+ */
 template <typename T>
- Optimizer *SetParam(const std::string &name, const T &value) {
+ Optimizer* SetParam(const std::string& name, const T& value) {
 std::string value_str;
 std::stringstream ss;
 ss << value;
@@ -77,22 +77,22 @@ class Optimizer {
 return this;
 }
 /*!
- * \bried set the lr scheduler
- * \param lrScheduler lr scheduler used for this optimizer
- * \return reference if self
- */
- Optimizer *SetLRScheduler(std::unique_ptr<LRScheduler> lrScheduler) {
+ * \brief set the lr scheduler
+ * \param lrScheduler lr scheduler used for this optimizer
+ * \return reference of self
+ */
+ Optimizer* SetLRScheduler(std::unique_ptr<LRScheduler> lrScheduler) {
 CHECK(lrScheduler);
 lrScheduler_ = std::move(lrScheduler);
 lrScheduler_->SetLR(dmlc::stof(params_["lr"]));
 return this;
 }
 /*!
- * \brief Update a weight with gradient. - * \param index the unique index for the weight. - * \param weight the weight to update. - * \param grad gradient for the weight. - */ + * \brief Update a weight with gradient. + * \param index the unique index for the weight. + * \param weight the weight to update. + * \param grad gradient for the weight. + */ virtual void Update(int index, NDArray weight, NDArray grad) = 0; // TODO(zhangcheng-qinyinghua) // implement Update a list of arrays, maybe in the form of map @@ -100,9 +100,9 @@ class Optimizer { // grad, mx_float lr); /*! - * \brief Serialize the optimizer parameters to a string. - * \return serialization - */ + * \brief Serialize the optimizer parameters to a string. + * \return serialization + */ std::string Serialize() const; protected: @@ -125,19 +125,21 @@ class OptimizerRegistry { public: static Optimizer* Find(const std::string& name); static int __REGISTER__(const std::string& name, OptimizerCreator creator); + private: static std::map& cmap(); - OptimizerRegistry() = delete; + OptimizerRegistry() = delete; ~OptimizerRegistry() = delete; }; -#define MXNETCPP_REGISTER_OPTIMIZER(Name, OptimizerType)\ - OptimizerRegistry::__REGISTER__(#Name, [](){return new OptimizerType();}) +#define MXNETCPP_REGISTER_OPTIMIZER(Name, OptimizerType) \ + OptimizerRegistry::__REGISTER__(#Name, []() { return new OptimizerType(); }) class SGDOptimizer : public Optimizer { public: explicit SGDOptimizer(unsigned begin_num_update = 0); std::string GetType() const override; void Update(int index, NDArray weight, NDArray grad) override; + private: virtual ~SGDOptimizer(); void CreateState_(int index, NDArray weight) override; @@ -151,6 +153,7 @@ class SignumOptimizer : public Optimizer { explicit SignumOptimizer(unsigned begin_num_update = 0); std::string GetType() const override; void Update(int index, NDArray weight, NDArray grad) override; + private: virtual ~SignumOptimizer(); void CreateState_(int index, NDArray weight) override; @@ -159,12 +162,12 @@ class SignumOptimizer : public Optimizer { AtomicSymbolCreator mom_update_handle_; }; - class RMSPropOptimizer : public Optimizer { public: explicit RMSPropOptimizer(unsigned begin_num_update = 0); std::string GetType() const override; void Update(int index, NDArray weight, NDArray grad) override; + private: virtual ~RMSPropOptimizer(); void CreateState_(int index, NDArray weight) override; @@ -178,6 +181,7 @@ class AdamOptimizer : public Optimizer { explicit AdamOptimizer(unsigned begin_num_update = 0); std::string GetType() const override; void Update(int index, NDArray weight, NDArray grad) override; + private: virtual ~AdamOptimizer(); void CreateState_(int index, NDArray weight) override; @@ -191,6 +195,7 @@ class AdaGradOptimizer : public Optimizer { explicit AdaGradOptimizer(unsigned begin_num_update = 0); std::string GetType() const override; void Update(int index, NDArray weight, NDArray grad) override; + private: virtual ~AdaGradOptimizer(); void CreateState_(int index, NDArray weight) override; @@ -202,6 +207,7 @@ class AdaDeltaOptimizer : public Optimizer { explicit AdaDeltaOptimizer(unsigned begin_num_update = 0); std::string GetType() const override; void Update(int index, NDArray weight, NDArray grad) override; + private: virtual ~AdaDeltaOptimizer(); void CreateState_(int index, NDArray weight) override; diff --git a/cpp-package/include/mxnet-cpp/shape.h b/cpp-package/include/mxnet-cpp/shape.h index 44a10828a366..6d70862a09c3 100644 --- a/cpp-package/include/mxnet-cpp/shape.h +++ 
b/cpp-package/include/mxnet-cpp/shape.h
@@ -18,10 +18,10 @@
  */
 /*!
-* \file shape.h
-* \brief definition of shape
-* \author Chuntao Hong, Zhang Chen
-*/
+ * \file shape.h
+ * \brief definition of shape
+ * \author Chuntao Hong, Zhang Chen
+ */
 #ifndef MXNET_CPP_SHAPE_H_
 #define MXNET_CPP_SHAPE_H_
@@ -36,167 +36,155 @@ namespace mxnet {
 namespace cpp {
 /*!
-* \brief dynamic shape class that can hold shape
-* of arbirary dimension
-*/
+ * \brief dynamic shape class that can hold shape
+ * of arbitrary dimension
+ */
 struct Shape {
  public:
 /*! \brief constructor */
- Shape()
- : ndim_(0),
- num_heap_allocated_(0),
- data_heap_(nullptr) {}
+ Shape() : ndim_(0), num_heap_allocated_(0), data_heap_(nullptr) {}
 /*!
- * \brief constructor from a vector of index_t
- * \param v the vector
- */
- explicit Shape(const std::vector<index_t> &v)
- : ndim_(v.size()) {
+ * \brief constructor from a vector of index_t
+ * \param v the vector
+ */
+ explicit Shape(const std::vector<index_t>& v) : ndim_(v.size()) {
 if (ndim_ <= kStackCache) {
- data_heap_ = nullptr;
+ data_heap_ = nullptr;
 num_heap_allocated_ = 0;
 std::copy(v.begin(), v.end(), data_stack_);
 } else {
- data_heap_ = new index_t[ndim_];
+ data_heap_ = new index_t[ndim_];
 num_heap_allocated_ = ndim_;
 std::copy(v.begin(), v.end(), data_heap_);
 }
 }
 /*!
- * \brief constructor one dimmension shape
- * \param s1 size of the first dimmension
- */
- explicit Shape(index_t s1)
- : ndim_(1) {
+ * \brief constructor one dimension shape
+ * \param s1 size of the first dimension
+ */
+ explicit Shape(index_t s1) : ndim_(1) {
 if (ndim_ <= kStackCache) {
- data_heap_ = nullptr;
+ data_heap_ = nullptr;
 num_heap_allocated_ = 0;
- data_stack_[0] = s1;
+ data_stack_[0] = s1;
 } else {
- data_heap_ = new index_t[ndim_];
+ data_heap_ = new index_t[ndim_];
 num_heap_allocated_ = ndim_;
- data_heap_[0] = s1;
+ data_heap_[0] = s1;
 }
 }
 /*!
- * \brief constructor two dimmension shape
- * \param s1 size of the first dimmension
- * \param s2 size of the second dimmension
- */
- Shape(index_t s1, index_t s2)
- : ndim_(2) {
+ * \brief constructor two dimension shape
+ * \param s1 size of the first dimension
+ * \param s2 size of the second dimension
+ */
+ Shape(index_t s1, index_t s2) : ndim_(2) {
 if (ndim_ <= kStackCache) {
- data_heap_ = nullptr;
+ data_heap_ = nullptr;
 num_heap_allocated_ = 0;
- data_stack_[0] = s1;
- data_stack_[1] = s2;
+ data_stack_[0] = s1;
+ data_stack_[1] = s2;
 } else {
- data_heap_ = new index_t[ndim_];
+ data_heap_ = new index_t[ndim_];
 num_heap_allocated_ = ndim_;
- data_heap_[0] = s1;
- data_heap_[1] = s2;
+ data_heap_[0] = s1;
+ data_heap_[1] = s2;
 }
 }
 /*!
- * \brief constructor three dimmension shape
- * \param s1 size of the first dimmension
- * \param s2 size of the second dimmension
- * \param s3 size of the third dimmension
- */
- Shape(index_t s1, index_t s2, index_t s3)
- : ndim_(3) {
+ * \brief constructor three dimension shape
+ * \param s1 size of the first dimension
+ * \param s2 size of the second dimension
+ * \param s3 size of the third dimension
+ */
+ Shape(index_t s1, index_t s2, index_t s3) : ndim_(3) {
 if (ndim_ <= kStackCache) {
- data_heap_ = nullptr;
+ data_heap_ = nullptr;
 num_heap_allocated_ = 0;
- data_stack_[0] = s1;
- data_stack_[1] = s2;
- data_stack_[2] = s3;
+ data_stack_[0] = s1;
+ data_stack_[1] = s2;
+ data_stack_[2] = s3;
 } else {
- data_heap_ = new index_t[ndim_];
+ data_heap_ = new index_t[ndim_];
 num_heap_allocated_ = ndim_;
- data_heap_[0] = s1;
- data_heap_[1] = s2;
- data_heap_[2] = s3;
+ data_heap_[0] = s1;
+ data_heap_[1] = s2;
+ data_heap_[2] = s3;
 }
 }
 /*!
- * \brief constructor four dimmension shape
- * \param s1 size of the first dimmension
- * \param s2 size of the second dimmension
- * \param s3 size of the third dimmension
- * \param s4 size of the fourth dimmension
- */
- Shape(index_t s1, index_t s2, index_t s3, index_t s4)
- : ndim_(4) {
+ * \brief constructor four dimension shape
+ * \param s1 size of the first dimension
+ * \param s2 size of the second dimension
+ * \param s3 size of the third dimension
+ * \param s4 size of the fourth dimension
+ */
+ Shape(index_t s1, index_t s2, index_t s3, index_t s4) : ndim_(4) {
 if (ndim_ <= kStackCache) {
- data_heap_ = nullptr;
+ data_heap_ = nullptr;
 num_heap_allocated_ = 0;
- data_stack_[0] = s1;
- data_stack_[1] = s2;
- data_stack_[2] = s3;
- data_stack_[3] = s4;
+ data_stack_[0] = s1;
+ data_stack_[1] = s2;
+ data_stack_[2] = s3;
+ data_stack_[3] = s4;
 } else {
- data_heap_ = new index_t[ndim_];
+ data_heap_ = new index_t[ndim_];
 num_heap_allocated_ = ndim_;
- data_heap_[0] = s1;
- data_heap_[1] = s2;
- data_heap_[2] = s3;
- data_heap_[3] = s4;
+ data_heap_[0] = s1;
+ data_heap_[1] = s2;
+ data_heap_[2] = s3;
+ data_heap_[3] = s4;
 }
 }
 /*!
- * \brief constructor five dimmension shape
- * \param s1 size of the first dimmension
- * \param s2 size of the second dimmension
- * \param s3 size of the third dimmension
- * \param s4 size of the fourth dimmension
- * \param s5 size of the fifth dimmension
- */
- Shape(index_t s1, index_t s2, index_t s3, index_t s4, index_t s5)
- : ndim_(5) {
+ * \brief constructor five dimension shape
+ * \param s1 size of the first dimension
+ * \param s2 size of the second dimension
+ * \param s3 size of the third dimension
+ * \param s4 size of the fourth dimension
+ * \param s5 size of the fifth dimension
+ */
+ Shape(index_t s1, index_t s2, index_t s3, index_t s4, index_t s5) : ndim_(5) {
 if (ndim_ <= kStackCache) {
- data_heap_ = nullptr;
+ data_heap_ = nullptr;
 num_heap_allocated_ = 0;
- data_stack_[0] = s1;
- data_stack_[1] = s2;
- data_stack_[2] = s3;
- data_stack_[3] = s4;
- data_stack_[4] = s5;
+ data_stack_[0] = s1;
+ data_stack_[1] = s2;
+ data_stack_[2] = s3;
+ data_stack_[3] = s4;
+ data_stack_[4] = s5;
 } else {
- data_heap_ = new index_t[ndim_];
+ data_heap_ = new index_t[ndim_];
 num_heap_allocated_ = ndim_;
- data_heap_[0] = s1;
- data_heap_[1] = s2;
- data_heap_[2] = s3;
- data_heap_[3] = s4;
- data_heap_[4] = s5;
+ data_heap_[0] = s1;
+ data_heap_[1] = s2;
+ data_heap_[2] = s3;
+ data_heap_[3] = s4;
+ data_heap_[4] = s5;
 }
 }
 /*!
- * \brief constructor from Shape
- * \param s the source shape
- */
- Shape(const Shape &s)
- : ndim_(s.ndim_) {
+ * \brief constructor from Shape
+ * \param s the source shape
+ */
+ Shape(const Shape& s) : ndim_(s.ndim_) {
 if (ndim_ <= kStackCache) {
- data_heap_ = nullptr;
+ data_heap_ = nullptr;
 num_heap_allocated_ = 0;
 std::copy(s.data_stack_, s.data_stack_ + ndim_, data_stack_);
 } else {
- data_heap_ = new index_t[ndim_];
+ data_heap_ = new index_t[ndim_];
 num_heap_allocated_ = ndim_;
 std::copy(s.data_heap_, s.data_heap_ + ndim_, data_heap_);
 }
 }
 #if MSHADOW_IN_CXX11
 /*!
- * \brief move constructor from Shape
- * \param s the source shape
- */
- Shape(Shape &&s)
- : ndim_(s.ndim_),
- num_heap_allocated_(s.num_heap_allocated_),
- data_heap_(s.data_heap_) {
+ * \brief move constructor from Shape
+ * \param s the source shape
+ */
+ Shape(Shape&& s)
+ : ndim_(s.ndim_), num_heap_allocated_(s.num_heap_allocated_), data_heap_(s.data_heap_) {
 if (ndim_ <= kStackCache) {
 std::copy(s.data_stack_, s.data_stack_ + ndim_, data_stack_);
 }
@@ -210,43 +198,42 @@ struct Shape {
 delete[] data_heap_;
 }
 /*!
- * \brief copy shape from content betwen two iterators
- * \param begin the beginning of iterator
- * \param end the end of the iterator
- * \tparam RandomAccessIterator iterator type
- */
- template <typename RandomAccessIterator>
- inline void CopyFrom(RandomAccessIterator begin,
- RandomAccessIterator end) {
+ * \brief copy shape from content between two iterators
+ * \param begin the beginning of iterator
+ * \param end the end of the iterator
+ * \tparam RandomAccessIterator iterator type
+ */
+ template <typename RandomAccessIterator>
+ inline void CopyFrom(RandomAccessIterator begin, RandomAccessIterator end) {
 this->SetDim(end - begin);
 std::copy(begin, end, data());
 }
 /*!
- * \brief assignment from shape
- * \param shape source shape
- * \return reference of self
- */
- inline Shape &operator=(const Shape &shape) {
+ * \brief assignment from shape
+ * \param shape source shape
+ * \return reference of self
+ */
+ inline Shape& operator=(const Shape& shape) {
 this->SetDim(shape.ndim_);
- const index_t *src = shape.data();
+ const index_t* src = shape.data();
 std::copy(src, src + ndim_, data());
 return *this;
 }
 /*!
- * \brief assignment from vector
- * \param shape source shape
- * \return reference of self
- */
- inline Shape &operator=(const std::vector<index_t> &shape) {
+ * \brief assignment from vector
+ * \param shape source shape
+ * \return reference of self
+ */
+ inline Shape& operator=(const std::vector<index_t>& shape) {
 this->CopyFrom(shape.begin(), shape.end());
 return *this;
 }
 /*! \return the data content of the shape */
- inline const index_t *data() const {
+ inline const index_t* data() const {
 return ndim_ <= kStackCache ? data_stack_ : data_heap_;
 }
 /*! \return the data content of the shape */
- inline index_t *data() {
+ inline index_t* data() {
 return ndim_ <= kStackCache ? data_stack_ : data_heap_;
 }
 /*! \brief return number of dimension of the tensor inside */
@@ -254,57 +241,60 @@ struct Shape {
 return ndim_;
 }
 /*!
- * \brief get corresponding index
- * \param i dimension index
- * \return the corresponding dimension size
- */
- inline index_t &operator[](index_t i) {
+ * \brief get corresponding index
+ * \param i dimension index
+ * \return the corresponding dimension size
+ */
+ inline index_t& operator[](index_t i) {
 return data()[i];
 }
 /*!
- * \brief get corresponding index - * \param i dimension index - * \return the corresponding dimension size - */ - inline const index_t &operator[](index_t i) const { + * \brief get corresponding index + * \param i dimension index + * \return the corresponding dimension size + */ + inline const index_t& operator[](index_t i) const { return data()[i]; } /*! \brief total number of elements in the tensor */ inline size_t Size(void) const { - size_t size = 1; - const index_t *d = this->data(); + size_t size = 1; + const index_t* d = this->data(); for (index_t i = 0; i < ndim_; ++i) { size *= d[i]; } return size; } /*! - * \return whether two shape equals - * \param s the shape to compare against - */ - inline bool operator==(const Shape &s) const { - if (ndim_ != s.ndim_) return false; + * \return whether two shape equals + * \param s the shape to compare against + */ + inline bool operator==(const Shape& s) const { + if (ndim_ != s.ndim_) + return false; if (ndim_ <= kStackCache) { for (index_t i = 0; i < ndim_; ++i) { - if (data_stack_[i] != s.data_stack_[i]) return false; + if (data_stack_[i] != s.data_stack_[i]) + return false; } } else { for (index_t i = 0; i < ndim_; ++i) { - if (data_heap_[i] != s.data_heap_[i]) return false; + if (data_heap_[i] != s.data_heap_[i]) + return false; } } return true; } /*! - * \return whether two shape not equals - * \param s the shape to compare against - */ - inline bool operator!=(const Shape &s) const { + * \return whether two shape not equals + * \param s the shape to compare against + */ + inline bool operator!=(const Shape& s) const { return !(*this == s); } - friend std::ostream &operator<<(std::ostream &os, const Shape &shape); - friend std::istream &operator>>(std::istream &is, Shape &shape); + friend std::ostream& operator<<(std::ostream& os, const Shape& shape); + friend std::istream& operator>>(std::istream& is, Shape& shape); private: // the shape will be stored in data_stack_ @@ -319,17 +309,16 @@ struct Shape { /*! \brief in stack space used to store shape when it is small */ index_t data_stack_[kStackCache]; /*! \brief space to store shape when dimension is big*/ - index_t *data_heap_; + index_t* data_heap_; /*! - * \brief internal function to set the dimension - * \param dim the dimension of the shape - */ + * \brief internal function to set the dimension + * \param dim the dimension of the shape + */ inline void SetDim(index_t dim) { - if (dim > kStackCache && - dim > num_heap_allocated_) { + if (dim > kStackCache && dim > num_heap_allocated_) { // data_heap_ can be nullptr delete[] data_heap_; - data_heap_ = new index_t[dim]; + data_heap_ = new index_t[dim]; num_heap_allocated_ = dim; } ndim_ = dim; @@ -337,34 +326,37 @@ struct Shape { }; /*! -* \brief allow string printing of the shape -* \param os the output stream -* \param shape the shape -* \return the ostream -*/ -inline std::ostream &operator<<(std::ostream &os, const Shape &shape) { + * \brief allow string printing of the shape + * \param os the output stream + * \param shape the shape + * \return the ostream + */ +inline std::ostream& operator<<(std::ostream& os, const Shape& shape) { os << '('; for (index_t i = 0; i < shape.ndim(); ++i) { - if (i != 0) os << ','; + if (i != 0) + os << ','; os << static_cast(shape[i]); // Supports negative Shape 'special codes' for inferring } // python style tuple - if (shape.ndim() == 1) os << ','; + if (shape.ndim() == 1) + os << ','; os << ')'; return os; } /*! 
-* \brief read shape from the istream -* \param is the input stream -* \param shape the shape -* \return the istream -*/ -inline std::istream &operator>>(std::istream &is, Shape &shape) { + * \brief read shape from the istream + * \param is the input stream + * \param shape the shape + * \return the istream + */ +inline std::istream& operator>>(std::istream& is, Shape& shape) { // get ( while (true) { char ch = is.get(); - if (ch == '(') break; + if (ch == '(') + break; if (!isspace(ch)) { is.setstate(std::ios::failbit); return is; @@ -382,14 +374,17 @@ inline std::istream &operator>>(std::istream &is, Shape &shape) { while (true) { ch = is.peek(); if (isspace(ch)) { - is.get(); continue; + is.get(); + continue; } if (ch == ')') { - is.get(); break; + is.get(); + break; } break; } - if (ch == ')') break; + if (ch == ')') + break; } else if (ch == ')') { break; } else { diff --git a/cpp-package/include/mxnet-cpp/symbol.h b/cpp-package/include/mxnet-cpp/symbol.h index 8e94637908be..6d9e57471154 100644 --- a/cpp-package/include/mxnet-cpp/symbol.h +++ b/cpp-package/include/mxnet-cpp/symbol.h @@ -18,10 +18,10 @@ */ /*! -* \file symbol.h -* \brief definition of symbol -* \author Chuntao Hong, Zhang Chen -*/ + * \file symbol.h + * \brief definition of symbol + * \author Chuntao Hong, Zhang Chen + */ #ifndef MXNET_CPP_SYMBOL_H_ #define MXNET_CPP_SYMBOL_H_ @@ -39,58 +39,60 @@ namespace cpp { class Executor; /*! -* \brief struct to store SymbolHandle -*/ + * \brief struct to store SymbolHandle + */ struct SymBlob { public: /*! - * \brief default constructor - */ + * \brief default constructor + */ SymBlob() : handle_(nullptr) {} /*! - * \brief construct with SymbolHandle to store - */ + * \brief construct with SymbolHandle to store + */ explicit SymBlob(SymbolHandle handle) : handle_(handle) {} /*! - * \brief destructor, free the SymbolHandle - */ - ~SymBlob() { MXSymbolFree(handle_); } + * \brief destructor, free the SymbolHandle + */ + ~SymBlob() { + MXSymbolFree(handle_); + } /*! - * \brief the SymbolHandle to store - */ + * \brief the SymbolHandle to store + */ SymbolHandle handle_; private: - SymBlob(const SymBlob &); - SymBlob &operator=(const SymBlob &); + SymBlob(const SymBlob&); + SymBlob& operator=(const SymBlob&); }; /*! -* \brief Symbol interface -*/ + * \brief Symbol interface + */ class Symbol { public: Symbol() {} /*! - * \brief construct a Symbol with SymbolHandle - * \param handle the given SymbolHandle - */ + * \brief construct a Symbol with SymbolHandle + * \param handle the given SymbolHandle + */ explicit Symbol(SymbolHandle handle); /*! - * \brief construct a variable Symbol - * \param name the name of the variable - */ - explicit Symbol(const char *name); + * \brief construct a variable Symbol + * \param name the name of the variable + */ + explicit Symbol(const char* name); /*! 
- * \brief construct a variable Symbol
- * \param name the name of the variable
- */
- explicit Symbol(const std::string &name);
- Symbol operator+(const Symbol &rhs) const;
- Symbol operator-(const Symbol &rhs) const;
- Symbol operator*(const Symbol &rhs) const;
- Symbol operator/(const Symbol &rhs) const;
- Symbol operator%(const Symbol &rhs) const;
+ * \brief construct a variable Symbol
+ * \param name the name of the variable
+ */
+ explicit Symbol(const std::string& name);
+ Symbol operator+(const Symbol& rhs) const;
+ Symbol operator-(const Symbol& rhs) const;
+ Symbol operator*(const Symbol& rhs) const;
+ Symbol operator/(const Symbol& rhs) const;
+ Symbol operator%(const Symbol& rhs) const;
 Symbol operator+(mx_float scalar) const;
 Symbol operator-(mx_float scalar) const;
@@ -99,79 +101,81 @@ class Symbol {
 Symbol operator%(mx_float scalar) const;
 Symbol Copy() const;
 /*!
- * \brief construct a variable Symbol
- * \param name the name of the variable
- */
- static Symbol Variable(const std::string &name = "");
+ * \brief construct a variable Symbol
+ * \param name the name of the variable
+ */
+ static Symbol Variable(const std::string& name = "");
 Symbol operator[](int index);
- Symbol operator[](const std::string &index);
+ Symbol operator[](const std::string& index);
 /*!
- * \brief Create a symbol that groups symbols together
- * \param symbols List of symbols to be groupe
- */
- static Symbol Group(const std::vector<Symbol> &symbols);
+ * \brief Create a symbol that groups symbols together
+ * \param symbols List of symbols to be grouped
+ */
+ static Symbol Group(const std::vector<Symbol>& symbols);
 /*!
- * \brief load Symbol from a JSON file
- * \param file_name the name of the file
- */
- static Symbol Load(const std::string &file_name);
+ * \brief load Symbol from a JSON file
+ * \param file_name the name of the file
+ */
+ static Symbol Load(const std::string& file_name);
 /*!
- * \brief load Symbol from a JSON string
- * \param json_str the JSON string
- */
- static Symbol LoadJSON(const std::string &json_str);
+ * \brief load Symbol from a JSON string
+ * \param json_str the JSON string
+ */
+ static Symbol LoadJSON(const std::string& json_str);
 /*!
- * \brief save Symbol to a file
- * \param file_name the name of the file
- */
- void Save(const std::string &file_name) const;
+ * \brief save Symbol to a file
+ * \param file_name the name of the file
+ */
+ void Save(const std::string& file_name) const;
 /*!
- * \brief save Symbol into a JSON string
- */
+ * \brief save Symbol into a JSON string
+ */
 std::string ToJSON() const;
 /*!
- * \brief save Symbol into a JSON string
- * \retutrn the symbol whose outputs are all the internals.
- */
+ * \brief get the internals of the symbol
+ * \return the symbol whose outputs are all the internals.
+ */
 Symbol GetInternals() const;
 /*!
- * \return the SymbolHandle
- */
- SymbolHandle GetHandle() const { return (blob_ptr_) ? blob_ptr_->handle_: nullptr; }
+ * \return the SymbolHandle
+ */
+ SymbolHandle GetHandle() const {
+ return (blob_ptr_) ? blob_ptr_->handle_ : nullptr;
+ }
 /*!
- * \brief construct an operator Symbol, with given input Symbol and config
- * \param name the name of the Symbol
- * \param input_keys the vector of keys of the input
- * \param input_values the vector of the intput Symbols
- * \param config_keys the vector of keys of the config
- * \param config_values the vecotr of values of the config
- */
- Symbol(const std::string &operator_name, const std::string &name,
- std::vector<const char *> input_keys,
+ * \brief construct an operator Symbol, with given input Symbol and config
+ * \param name the name of the Symbol
+ * \param input_keys the vector of keys of the input
+ * \param input_values the vector of the input Symbols
+ * \param config_keys the vector of keys of the config
+ * \param config_values the vector of values of the config
+ */
+ Symbol(const std::string& operator_name,
+ const std::string& name,
+ std::vector<const char*> input_keys,
 std::vector<SymbolHandle> input_values,
- std::vector<const char *> config_keys,
- std::vector<const char *> config_values);
+ std::vector<const char*> config_keys,
+ std::vector<const char*> config_values);
 /*!
- * \brief infer the shapes by providing shapes of known argument shapes.
- * \param arg_shapes map of argument name to shape of arguments with known
- * shapes.
- * \param in_shapes used to store infered shapes of input arguments.
- * \param out_shapes used to store infered shapes of outputs.
- * \param aux_shapes use to store the infered shapes of auxiliary states
- */
- void InferShape(
- const std::map<std::string, std::vector<mx_uint> > &arg_shapes,
- std::vector<std::vector<mx_uint> > *in_shape,
- std::vector<std::vector<mx_uint> > *aux_shape,
- std::vector<std::vector<mx_uint> > *out_shape) const;
+ * \brief infer the shapes by providing shapes of known argument shapes.
+ * \param arg_shapes map of argument name to shape of arguments with known
+ * shapes.
+ * \param in_shapes used to store inferred shapes of input arguments.
+ * \param out_shapes used to store inferred shapes of outputs.
+ * \param aux_shapes used to store the inferred shapes of auxiliary states
+ */
+ void InferShape(const std::map<std::string, std::vector<mx_uint> >& arg_shapes,
+ std::vector<std::vector<mx_uint> >* in_shape,
+ std::vector<std::vector<mx_uint> >* aux_shape,
+ std::vector<std::vector<mx_uint> >* out_shape) const;
 /*!
- * \brief List the arguments names.
- *
- * The position of the returned list also corresponds to calling position in
- *operator()
- * \return the arguments list of this symbol, they can be either named or
- *unnamed (empty string).
- */
+ * \brief List the arguments names.
+ *
+ * The position of the returned list also corresponds to calling position in
+ *operator()
+ * \return the arguments list of this symbol, they can be either named or
+ *unnamed (empty string).
+ */
 std::vector<std::string> ListArguments() const;
 /*! \return lists all argument names and aux states of the symbol */
 std::vector<std::string> ListInputs() const;
@@ -199,101 +203,99 @@ class Symbol {
 /*! \return get the name of the symbol */
 std::string GetName() const;
 /*!
- * \brief infer and construct all the arrays to bind to executor by providing
- * some known arrays.
- * \param context the context of all the infered arrays
- * \param arg_arrays infered input arguments arrays.
- * \param arad_arrays infered arrays to store the gradient output of the input
- * arguments.
- * \param aux_arrays infered arrays that is used as internal state in op.
- * \param args_map map of some given arguments arrays.
- * \param args_grad_store map of some gradient given store arrays.
- * \param args_req_type map of some given type of gradient saving. Can only be
- * in {kNullOp, kAddTo, kWriteTo}.
- * \param aux_map NDArray that stores the internal state in op
- */
+ * \brief infer and construct all the arrays to bind to executor by providing
+ * some known arrays.
+ * \param context the context of all the inferred arrays
+ * \param arg_arrays inferred input arguments arrays.
+ * \param grad_arrays inferred arrays to store the gradient output of the input
+ * arguments.
+ * \param aux_arrays inferred arrays that are used as internal state in op.
+ * \param args_map map of some given arguments arrays.
+ * \param args_grad_store map of some given gradient store arrays.
+ * \param args_req_type map of some given type of gradient saving. Can only be
+ * in {kNullOp, kAddTo, kWriteTo}.
+ * \param aux_map NDArray that stores the internal state in op
+ */
 void InferExecutorArrays(
- const Context &context, std::vector<NDArray> *arg_arrays,
- std::vector<NDArray> *grad_arrays, std::vector<OpReqType> *grad_reqs,
- std::vector<NDArray> *aux_arrays,
- const std::map<std::string, NDArray> &args_map,
- const std::map<std::string, NDArray> &arg_grad_store =
- std::map<std::string, NDArray>(),
- const std::map<std::string, OpReqType> &grad_req_type =
- std::map<std::string, OpReqType>(),
- const std::map<std::string, NDArray> &aux_map =
- std::map<std::string, NDArray>()) const;
+ const Context& context,
+ std::vector<NDArray>* arg_arrays,
+ std::vector<NDArray>* grad_arrays,
+ std::vector<OpReqType>* grad_reqs,
+ std::vector<NDArray>* aux_arrays,
+ const std::map<std::string, NDArray>& args_map,
+ const std::map<std::string, NDArray>& arg_grad_store = std::map<std::string, NDArray>(),
+ const std::map<std::string, OpReqType>& grad_req_type = std::map<std::string, OpReqType>(),
+ const std::map<std::string, NDArray>& aux_map = std::map<std::string, NDArray>()) const;
 /*!
- * \brief infer and construct all the input arguments arrays to bind to
- * executor by providing some known arguments arrays.
- * \param context the context of all the infered arrays.
- * \param args_map map of all the infered input arguments arrays.
- * \param known_args map of some given arguments arrays.
- */
- void InferArgsMap(const Context &context,
- std::map<std::string, NDArray> *args_map,
- const std::map<std::string, NDArray> &known_args) const;
+ * \brief infer and construct all the input arguments arrays to bind to
+ * executor by providing some known arguments arrays.
+ * \param context the context of all the inferred arrays.
+ * \param args_map map of all the inferred input arguments arrays.
+ * \param known_args map of some given arguments arrays.
+ */
+ void InferArgsMap(const Context& context,
+ std::map<std::string, NDArray>* args_map,
+ const std::map<std::string, NDArray>& known_args) const;
 /*!
- * \brief Create an executor by bind symbol with context and arguments.
- * If user do not want to compute the gradients of i-th argument,
- *grad_req_type[i] can be kNullOp.
- * The input arrays in the given maps should have the same name with the input
- *symbol.
- * Only need some of the necessary arrays, and the other arrays can be infered
- *automatically.
- *
- * \param context the context of binding.
- * \param args_map the NDArray that stores the input arguments to the symbol.
- * \param arg_grad_store NDArray that is used to store the gradient output of
- *the input arguments.
- * \param grad_req_type requirment type of gradient saving. Can only be in
- *{kNullOp, kAddTo, kWriteTo}.
- * \param aux_map NDArray that stores the internal state in op
- * \return a new executor, which need to be free manually.
- */
- Executor *SimpleBind(const Context &context,
- const std::map<std::string, NDArray> &args_map,
- const std::map<std::string, NDArray> &arg_grad_store =
- std::map<std::string, NDArray>(),
- const std::map<std::string, OpReqType> &grad_req_type =
- std::map<std::string, OpReqType>(),
- const std::map<std::string, NDArray> &aux_map =
- std::map<std::string, NDArray>());
+ * \brief Create an executor by binding the symbol with context and arguments.
+ * If the user does not want to compute the gradients of the i-th argument,
+ *grad_req_type[i] can be kNullOp.
+ * The input arrays in the given maps should have the same names as the input
+ *symbol.
+ * Only some of the necessary arrays are needed; the other arrays can be inferred
+ *automatically.
+ *
+ * \param context the context of binding.
+ * \param args_map the NDArray that stores the input arguments to the symbol.
+ * \param arg_grad_store NDArray that is used to store the gradient output of
+ *the input arguments.
+ * \param grad_req_type requirement type of gradient saving. Can only be in
+ *{kNullOp, kAddTo, kWriteTo}.
+ * \param aux_map NDArray that stores the internal state in op
+ * \return a new executor, which needs to be freed manually.
+ */
+ Executor* SimpleBind(
+ const Context& context,
+ const std::map<std::string, NDArray>& args_map,
+ const std::map<std::string, NDArray>& arg_grad_store = std::map<std::string, NDArray>(),
+ const std::map<std::string, OpReqType>& grad_req_type = std::map<std::string, OpReqType>(),
+ const std::map<std::string, NDArray>& aux_map = std::map<std::string, NDArray>());
 /*!
- * \brief Create an executor by bind symbol with context and arguments.
- * If user do not want to compute the gradients of i-th argument,
- *grad_req_type[i] can be kNullOp.
- *
- * \param context the context of binding.
- * \param arg_arrays the NDArray that stores the input arguments to the symbol.
- * \param grad_arrays NDArray that is used to store the gradient output of the
- *input arguments.
- * \param grad_reqs requirment type of gradient saving. Can only be in
- *{kNullOp, kAddTo, kWriteTo}.
- * \param aux_arrays NDArray that is used as internal state in op
- * \param group_to_ctx dict of string to mx.Context
- * \param shared_exec Executor to share memory with. This is intended for
- *runtime reshaping, variable length sequencesn etc. The returned executor
- *shares state with shared_exec, and should not be used in parallel with it.
- * \return a new executor, which need to be free manually.
- */
- Executor *Bind(const Context &context, const std::vector<NDArray> &arg_arrays,
- const std::vector<NDArray> &grad_arrays,
- const std::vector<OpReqType> &grad_reqs,
- const std::vector<NDArray> &aux_arrays,
- const std::map<std::string, Context> &group_to_ctx =
- std::map<std::string, Context>(),
- Executor *shared_exec = nullptr);
+ * \brief Create an executor by binding the symbol with context and arguments.
+ * If the user does not want to compute the gradients of the i-th argument,
+ *grad_req_type[i] can be kNullOp.
+ *
+ * \param context the context of binding.
+ * \param arg_arrays the NDArray that stores the input arguments to the symbol.
+ * \param grad_arrays NDArray that is used to store the gradient output of the
+ *input arguments.
+ * \param grad_reqs requirement type of gradient saving. Can only be in
+ *{kNullOp, kAddTo, kWriteTo}.
+ * \param aux_arrays NDArray that is used as internal state in op
+ * \param group_to_ctx dict of string to mx.Context
+ * \param shared_exec Executor to share memory with. This is intended for
+ *runtime reshaping, variable length sequences, etc. The returned executor
+ *shares state with shared_exec, and should not be used in parallel with it.
+ * \return a new executor, which needs to be freed manually.
+ */ + Executor* Bind( + const Context& context, + const std::vector& arg_arrays, + const std::vector& grad_arrays, + const std::vector& grad_reqs, + const std::vector& aux_arrays, + const std::map& group_to_ctx = std::map(), + Executor* shared_exec = nullptr); private: std::shared_ptr blob_ptr_; static OpMap*& op_map(); }; -Symbol operator+(mx_float lhs, const Symbol &rhs); -Symbol operator-(mx_float lhs, const Symbol &rhs); -Symbol operator*(mx_float lhs, const Symbol &rhs); -Symbol operator/(mx_float lhs, const Symbol &rhs); -Symbol operator%(mx_float lhs, const Symbol &rhs); +Symbol operator+(mx_float lhs, const Symbol& rhs); +Symbol operator-(mx_float lhs, const Symbol& rhs); +Symbol operator*(mx_float lhs, const Symbol& rhs); +Symbol operator/(mx_float lhs, const Symbol& rhs); +Symbol operator%(mx_float lhs, const Symbol& rhs); } // namespace cpp } // namespace mxnet #endif // MXNET_CPP_SYMBOL_H_ From c89c0786f78552f9b8ee88cd463002b1bfec20a5 Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Thu, 4 Nov 2021 09:01:48 +0100 Subject: [PATCH 05/10] [EXAMPLE] Re-format .cc .h files --- example/extensions/lib_api/libtest.cc | 8 +- example/extensions/lib_custom_op/gemm_lib.cc | 68 +++---- example/extensions/lib_custom_op/relu_lib.cc | 67 ++++--- example/extensions/lib_custom_op/relu_lib.cu | 59 +++--- example/extensions/lib_custom_op/relu_lib.h | 44 ++--- .../lib_custom_op/transposecsr_lib.cc | 101 +++++----- .../lib_custom_op/transposerowsp_lib.cc | 98 +++++----- .../extensions/lib_external_ops/min_ex-inl.h | 19 +- example/extensions/lib_external_ops/min_ex.cc | 16 +- example/extensions/lib_external_ops/min_ex.cu | 3 +- example/extensions/lib_pass/pass_lib.cc | 5 +- .../extensions/lib_subgraph/subgraph_lib.cc | 181 +++++++++--------- 12 files changed, 339 insertions(+), 330 deletions(-) diff --git a/example/extensions/lib_api/libtest.cc b/example/extensions/lib_api/libtest.cc index 0b2c6f64789c..fda2d96bc767 100644 --- a/example/extensions/lib_api/libtest.cc +++ b/example/extensions/lib_api/libtest.cc @@ -42,8 +42,8 @@ int main(void) { HINSTANCE handle; handle = LoadLibrary(TEXT("libinit_lib.dll")); #else - void *handle; - handle = dlopen("libinit_lib.so", RTLD_LAZY); + void* handle; + handle = dlopen("libinit_lib.so", RTLD_LAZY); #endif if (!handle) { @@ -54,9 +54,9 @@ int main(void) { // get initialize function address from the library initialize_t init_lib; #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - init_lib = (initialize_t) GetProcAddress(handle, MXLIB_INITIALIZE_STR); + init_lib = (initialize_t)GetProcAddress(handle, MXLIB_INITIALIZE_STR); #else - init_lib = (initialize_t) dlsym(handle, MXLIB_INITIALIZE_STR); + init_lib = (initialize_t)dlsym(handle, MXLIB_INITIALIZE_STR); #endif if (!init_lib) { diff --git a/example/extensions/lib_custom_op/gemm_lib.cc b/example/extensions/lib_custom_op/gemm_lib.cc index 4a6a337a91df..a4e518409ea7 100644 --- a/example/extensions/lib_custom_op/gemm_lib.cc +++ b/example/extensions/lib_custom_op/gemm_lib.cc @@ -30,14 +30,18 @@ using namespace mxnet::ext; // main matrix multiplication routine -void gemm(const float* A, const float* B, float* C, - const unsigned n, const unsigned k, const unsigned m) { +void gemm(const float* A, + const float* B, + float* C, + const unsigned n, + const unsigned k, + const unsigned m) { unsigned i, j, kk; for (i = 0; i < n; i++) { for (j = 0; j < m; j++) { - C[i*m+j] = 0; + C[i * m + j] = 0; for (kk = 0; kk < k; kk++) { - C[i*m+j] += A[i*k+kk] * B[kk*m+j]; + C[i * m + j] += A[i * k + kk] * B[kk * m + j]; 
} } } @@ -47,7 +51,7 @@ void transpose(const float* A, float* At, const unsigned n, const unsigned m) { unsigned i, j; for (i = 0; i < n; i++) { for (j = 0; j < m; j++) { - At[i*m+j] = A[j*n+i]; + At[i * m + j] = A[j * n + i]; } } } @@ -96,8 +100,8 @@ MXReturnValue backward(const std::unordered_map& attrs const OpResource& res) { // extract data pointers from tensors float* dC = inputs->at(0).data(); - float* A = inputs->at(1).data(); - float* B = inputs->at(2).data(); + float* A = inputs->at(1).data(); + float* B = inputs->at(2).data(); float* dA = outputs->at(0).data(); float* dB = outputs->at(1).data(); // set tensor shapes @@ -106,9 +110,9 @@ MXReturnValue backward(const std::unordered_map& attrs unsigned m = inputs->at(2).shape[1]; // allocate temporary workspace memory through resource manager // for multiple arrays better to request a big memory pool - void *workspace = res.alloc_cpu((k*n + m*k) * sizeof(float)); - float *At = static_cast(workspace); - float *Bt = static_cast(workspace) + (k*n); + void* workspace = res.alloc_cpu((k * n + m * k) * sizeof(float)); + float* At = static_cast(workspace); + float* Bt = static_cast(workspace) + (k * n); transpose(A, At, k, n); transpose(B, Bt, m, k); @@ -119,15 +123,16 @@ MXReturnValue backward(const std::unordered_map& attrs } MXReturnValue parseAttrs(const std::unordered_map& attrs, - int* num_in, int* num_out) { - *num_in = 2; + int* num_in, + int* num_out) { + *num_in = 2; *num_out = 1; return MX_SUCCESS; } MXReturnValue inferType(const std::unordered_map& attrs, - std::vector *intypes, - std::vector *outtypes) { + std::vector* intypes, + std::vector* outtypes) { // validate inputs if (intypes->size() != 2) { MX_ERROR_MSG << "Expected 2 inputs to inferType"; @@ -157,10 +162,10 @@ MXReturnValue inferShape(const std::unordered_map& att return MX_FAIL; } - unsigned n = inshapes->at(0)[0]; - unsigned k = inshapes->at(0)[1]; + unsigned n = inshapes->at(0)[0]; + unsigned k = inshapes->at(0)[1]; unsigned kk = inshapes->at(1)[0]; - unsigned m = inshapes->at(1)[1]; + unsigned m = inshapes->at(1)[1]; if (k != kk) { MX_ERROR_MSG << "Exected first input axis 1 equals to second input axis 0"; return MX_FAIL; @@ -171,24 +176,23 @@ MXReturnValue inferShape(const std::unordered_map& att } REGISTER_OP(my_gemm) -.setForward(forward, "cpu") -.setBackward(backward, "cpu") -.setParseAttrs(parseAttrs) -.setInferType(inferType) -.setInferShape(inferShape); + .setForward(forward, "cpu") + .setBackward(backward, "cpu") + .setParseAttrs(parseAttrs) + .setInferType(inferType) + .setInferShape(inferShape); /* ------------------------------------------------------------------------- */ class MyStatefulGemm : public CustomStatefulOp { public: - explicit MyStatefulGemm(int count, - std::unordered_map attrs) - : count(count), attrs_(std::move(attrs)) {} + explicit MyStatefulGemm(int count, std::unordered_map attrs) + : count(count), attrs_(std::move(attrs)) {} ~MyStatefulGemm() override { std::cout << "Info: destructing MyStatefulGemm" << std::endl; } - + MXReturnValue Forward(std::vector* inputs, std::vector* outputs, const OpResource& op_res) override { @@ -209,7 +213,7 @@ class MyStatefulGemm : public CustomStatefulOp { MXReturnValue createOpState(const std::unordered_map& attrs, const MXContext& ctx, - const std::vector >& in_shapes, + const std::vector>& in_shapes, const std::vector in_types, CustomStatefulOp** op_inst) { // testing passing of keyword arguments @@ -227,11 +231,11 @@ MXReturnValue mutateInputs(const std::unordered_map& a } REGISTER_OP(state_gemm) 
-.setParseAttrs(parseAttrs) -.setInferType(inferType) -.setInferShape(inferShape) -.setMutateInputs(mutateInputs) -.setCreateOpState(createOpState, "cpu"); + .setParseAttrs(parseAttrs) + .setInferType(inferType) + .setInferShape(inferShape) + .setMutateInputs(mutateInputs) + .setCreateOpState(createOpState, "cpu"); MXReturnValue initialize(int version) { if (version >= 10700) { diff --git a/example/extensions/lib_custom_op/relu_lib.cc b/example/extensions/lib_custom_op/relu_lib.cc index 8bbb99f61a54..ff2041b9ff04 100644 --- a/example/extensions/lib_custom_op/relu_lib.cc +++ b/example/extensions/lib_custom_op/relu_lib.cc @@ -29,8 +29,9 @@ using namespace mxnet::ext; MXReturnValue parseAttrs(const std::unordered_map& attrs, - int* num_in, int* num_out) { - *num_in = 1; + int* num_in, + int* num_out) { + *num_in = 1; *num_out = 1; return MX_SUCCESS; } @@ -53,9 +54,9 @@ MXReturnValue forwardCPU(const std::unordered_map& att std::vector* inputs, std::vector* outputs, const OpResource& res) { - float* in_data = inputs->at(0).data(); + float* in_data = inputs->at(0).data(); float* out_data = outputs->at(0).data(); - for (int i=0; iat(0).size(); i++) { + for (int i = 0; i < inputs->at(0).size(); i++) { out_data[i] = in_data[i] > 0 ? in_data[i] : 0; } return MX_SUCCESS; @@ -66,26 +67,25 @@ MXReturnValue backwardCPU(const std::unordered_map& at std::vector* outputs, const OpResource& res) { float* out_grad = inputs->at(0).data(); - float* in_data = inputs->at(1).data(); - float* in_grad = outputs->at(0).data(); - for (int i=0; iat(1).size(); i++) { + float* in_data = inputs->at(1).data(); + float* in_grad = outputs->at(0).data(); + for (int i = 0; i < inputs->at(1).size(); i++) { in_grad[i] = in_data[i] > 0 ? 1 * out_grad[i] : 0; } return MX_SUCCESS; } REGISTER_OP(my_relu) -.setParseAttrs(parseAttrs) -.setInferType(inferType) -.setInferShape(inferShape) -.setForward(forwardCPU, "cpu") -.setForward(forwardGPU, "gpu") -.setBackward(backwardCPU, "cpu") -.setBackward(backwardGPU, "gpu"); - + .setParseAttrs(parseAttrs) + .setInferType(inferType) + .setInferShape(inferShape) + .setForward(forwardCPU, "cpu") + .setForward(forwardGPU, "gpu") + .setBackward(backwardCPU, "cpu") + .setBackward(backwardGPU, "gpu"); MyStatefulReluCPU::MyStatefulReluCPU(const std::unordered_map& attrs) - : attrs_(attrs) {} + : attrs_(attrs) {} MXReturnValue MyStatefulReluCPU::Forward(std::vector* inputs, std::vector* outputs, @@ -100,7 +100,7 @@ MXReturnValue MyStatefulReluCPU::Backward(std::vector* inputs, } MyStatefulReluGPU::MyStatefulReluGPU(const std::unordered_map& attrs) - : attrs_(attrs) {} + : attrs_(attrs) {} MXReturnValue MyStatefulReluGPU::Forward(std::vector* inputs, std::vector* outputs, @@ -114,10 +114,9 @@ MXReturnValue MyStatefulReluGPU::Backward(std::vector* inputs, return backwardGPU(attrs_, inputs, outputs, op_res); } - MXReturnValue createOpStateCPU(const std::unordered_map& attrs, const MXContext& ctx, - const std::vector >& in_shapes, + const std::vector>& in_shapes, const std::vector in_types, CustomStatefulOp** op_inst) { *op_inst = new MyStatefulReluCPU(attrs); @@ -126,7 +125,7 @@ MXReturnValue createOpStateCPU(const std::unordered_map& attrs, const MXContext& ctx, - const std::vector >& in_shapes, + const std::vector>& in_shapes, const std::vector in_types, CustomStatefulOp** op_inst) { *op_inst = new MyStatefulReluGPU(attrs); @@ -134,23 +133,23 @@ MXReturnValue createOpStateGPU(const std::unordered_map& attrs, std::vector* inputs, std::vector* outputs, const OpResource& res) { - float* in_data = 
inputs->at(0).data(); + float* in_data = inputs->at(0).data(); float* out_data = outputs->at(0).data(); mx_cpu_rand_t* states = res.get_cpu_rand_states(); std::normal_distribution dist_normal; - for (int i=0; iat(0).size(); ++i) { + for (int i = 0; i < inputs->at(0).size(); ++i) { float noise = dist_normal(*states); out_data[i] = in_data[i] + noise > 0 ? in_data[i] + noise : 0; } @@ -158,13 +157,13 @@ MXReturnValue noisyForwardCPU(const std::unordered_map } REGISTER_OP(my_noisy_relu) -.setParseAttrs(parseAttrs) -.setInferType(inferType) -.setInferShape(inferShape) -.setForward(noisyForwardCPU, "cpu") -.setForward(noisyForwardGPU, "gpu") -.setBackward(backwardCPU, "cpu") -.setBackward(backwardGPU, "gpu"); + .setParseAttrs(parseAttrs) + .setInferType(inferType) + .setInferShape(inferShape) + .setForward(noisyForwardCPU, "cpu") + .setForward(noisyForwardGPU, "gpu") + .setBackward(backwardCPU, "cpu") + .setBackward(backwardGPU, "gpu"); MXReturnValue initialize(int version) { if (version >= 20000) { diff --git a/example/extensions/lib_custom_op/relu_lib.cu b/example/extensions/lib_custom_op/relu_lib.cu index c309274e61c6..d9643cd68ab4 100644 --- a/example/extensions/lib_custom_op/relu_lib.cu +++ b/example/extensions/lib_custom_op/relu_lib.cu @@ -28,7 +28,7 @@ using namespace mxnet::ext; -__global__ void relu_gpu_forward(float *out, float *in, int64_t N) { +__global__ void relu_gpu_forward(float* out, float* in, int64_t N) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < N) out[tid] = in[tid] > 0 ? in[tid] : 0; @@ -38,19 +38,19 @@ MXReturnValue forwardGPU(const std::unordered_map& att std::vector* inputs, std::vector* outputs, const OpResource& res) { - float* in_data = inputs->at(0).data(); + float* in_data = inputs->at(0).data(); float* out_data = outputs->at(0).data(); mx_stream_t cuda_stream = res.get_cuda_stream(); - int64_t N = inputs->at(0).size(); - int num_block = (N + NumThreadPerBlock - 1) / NumThreadPerBlock; + int64_t N = inputs->at(0).size(); + int num_block = (N + NumThreadPerBlock - 1) / NumThreadPerBlock; - relu_gpu_forward<<>>(out_data, in_data, N); + relu_gpu_forward<<>>(out_data, in_data, N); return MX_SUCCESS; } -__global__ void relu_gpu_backward(float *ingrad, float *outgrad, float *indata, int64_t N) { +__global__ void relu_gpu_backward(float* ingrad, float* outgrad, float* indata, int64_t N) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < N) ingrad[tid] = indata[tid] > 0 ? 
1 * outgrad[tid] : 0; @@ -61,40 +61,45 @@ MXReturnValue backwardGPU(const std::unordered_map& at std::vector* outputs, const OpResource& res) { float* out_grad = inputs->at(0).data(); - float* in_data = inputs->at(1).data(); - float* in_grad = outputs->at(0).data(); + float* in_data = inputs->at(1).data(); + float* in_grad = outputs->at(0).data(); mx_stream_t cuda_stream = res.get_cuda_stream(); - int64_t N = inputs->at(0).size(); - int num_block = (N + NumThreadPerBlock - 1) / NumThreadPerBlock; - relu_gpu_backward<<>>(in_grad, out_grad, in_data, N); + int64_t N = inputs->at(0).size(); + int num_block = (N + NumThreadPerBlock - 1) / NumThreadPerBlock; + relu_gpu_backward<<>>( + in_grad, out_grad, in_data, N); return MX_SUCCESS; } -__global__ void noisy_relu_gpu_forward(float *out, float *in, int64_t N, mx_gpu_rand_t* states, int step) { - // the launcher logic ensures tid less than NumGPURandomStates - int tid = blockIdx.x * blockDim.x + threadIdx.x; - // each thread generates unique sequence of random numbers - mx_gpu_rand_t thread_state = states[tid]; - // each thread works on number of calculation - int start = tid * step; - int end = start + step; - for (int i=start; i 0 ? in[i] + noise : 0; - } +__global__ void noisy_relu_gpu_forward(float* out, + float* in, + int64_t N, + mx_gpu_rand_t* states, + int step) { + // the launcher logic ensures tid less than NumGPURandomStates + int tid = blockIdx.x * blockDim.x + threadIdx.x; + // each thread generates unique sequence of random numbers + mx_gpu_rand_t thread_state = states[tid]; + // each thread works on number of calculation + int start = tid * step; + int end = start + step; + for (int i = start; i < end && i < N; ++i) { + float noise = curand_normal(&thread_state); + out[i] = in[i] + noise > 0 ? 
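// noisy_relu_gpu_forward splits random number generation the way the MXNet
// workflow comments below describe: each thread takes one persistent curand
// state from res.get_gpu_rand_states() and produces `step` consecutive
// outputs, so no two threads ever share a state. Worked sizing example,
// assuming the NumGPURandomStates cap referenced in the kernel comment:
//
//   N = 1'000'000, NumRandomPerThread = 64
//   nthread         = (N + 63) / 64 = 15625 threads wanted
//   num_thread_need = min(nthread, NumGPURandomStates)
//   step            = (N + num_thread_need - 1) / num_thread_need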
in[i] + noise : 0; + } } MXReturnValue noisyForwardGPU(const std::unordered_map& attrs, std::vector* inputs, std::vector* outputs, const OpResource& res) { - float* in_data = inputs->at(0).data(); + float* in_data = inputs->at(0).data(); float* out_data = outputs->at(0).data(); mx_stream_t cuda_stream = res.get_cuda_stream(); - int64_t N = inputs->at(0).size(); + int64_t N = inputs->at(0).size(); // below is mxnet recommended workflow to parallel random number generating int nthread = (N + NumRandomPerThread - 1) / NumRandomPerThread; @@ -105,8 +110,8 @@ MXReturnValue noisyForwardGPU(const std::unordered_map // this can ensure number of parallel threads less than mxnet supported random number states int num_block = (num_thread_need + NumThreadPerBlock - 1) / NumThreadPerBlock; - noisy_relu_gpu_forward<<>>( - out_data, in_data, N, res.get_gpu_rand_states(), step); + noisy_relu_gpu_forward<<>>( + out_data, in_data, N, res.get_gpu_rand_states(), step); return MX_SUCCESS; } diff --git a/example/extensions/lib_custom_op/relu_lib.h b/example/extensions/lib_custom_op/relu_lib.h index 5aadfe930340..c0e250f340e5 100644 --- a/example/extensions/lib_custom_op/relu_lib.h +++ b/example/extensions/lib_custom_op/relu_lib.h @@ -31,38 +31,38 @@ using namespace mxnet::ext; -#define NumThreadPerBlock 256 // mxnet recommended cuda thread number per block -#define NumRandomPerThread 64 // mxnet recommended random numbers generated per thread +#define NumThreadPerBlock 256 // mxnet recommended cuda thread number per block +#define NumRandomPerThread 64 // mxnet recommended random numbers generated per thread class MyStatefulReluCPU : public CustomStatefulOp { - public: - explicit MyStatefulReluCPU(const std::unordered_map& attrs); + public: + explicit MyStatefulReluCPU(const std::unordered_map& attrs); - MXReturnValue Forward(std::vector* inputs, + MXReturnValue Forward(std::vector* inputs, + std::vector* outputs, + const OpResource& op_res); + MXReturnValue Backward(std::vector* inputs, std::vector* outputs, const OpResource& op_res); - MXReturnValue Backward(std::vector* inputs, - std::vector* outputs, - const OpResource& op_res); - private: - const std::unordered_map attrs_; + private: + const std::unordered_map attrs_; }; class MyStatefulReluGPU : public CustomStatefulOp { - public: - explicit MyStatefulReluGPU(const std::unordered_map& attrs); + public: + explicit MyStatefulReluGPU(const std::unordered_map& attrs); - MXReturnValue Forward(std::vector* inputs, - std::vector* outputs, - const OpResource& op_res); - - MXReturnValue Backward(std::vector* inputs, - std::vector* outputs, - const OpResource& op_res); - - private: - const std::unordered_map attrs_; + MXReturnValue Forward(std::vector* inputs, + std::vector* outputs, + const OpResource& op_res); + + MXReturnValue Backward(std::vector* inputs, + std::vector* outputs, + const OpResource& op_res); + + private: + const std::unordered_map attrs_; }; MXReturnValue forwardGPU(const std::unordered_map& attrs, diff --git a/example/extensions/lib_custom_op/transposecsr_lib.cc b/example/extensions/lib_custom_op/transposecsr_lib.cc index e8a8bb7a3ee1..97c0153b63d2 100644 --- a/example/extensions/lib_custom_op/transposecsr_lib.cc +++ b/example/extensions/lib_custom_op/transposecsr_lib.cc @@ -30,35 +30,35 @@ using namespace mxnet::ext; void transpose(MXTensor& src, MXTensor& dst, const OpResource& res) { - MXSparse* A = src.data(); - MXSparse* B = dst.data(); + MXSparse* A = src.data(); + MXSparse* B = dst.data(); std::vector shape = src.shape; - int64_t h = 
shape[0]; - int64_t w = shape[1]; - if(src.stype == kCSRStorage) { - float *Aval = (float*) (A->data); + int64_t h = shape[0]; + int64_t w = shape[1]; + if (src.stype == kCSRStorage) { + float* Aval = (float*)(A->data); // Here we need one more element to help calculate index(line 57). std::vector rowPtr(w + 2, 0); // count column - for(int i = 0; i < A->data_len; i++) { + for (int i = 0; i < A->data_len; i++) { rowPtr[A->indices[i] + 2]++; } - // Accumulated sum. After this for loop, rowPtr[1:w+2) stores the correct + // Accumulated sum. After this for loop, rowPtr[1:w+2) stores the correct // result of transposed rowPtr. - for(int i = 2; i < rowPtr.size(); i++) { + for (int i = 2; i < rowPtr.size(); i++) { rowPtr[i] += rowPtr[i - 1]; } - + // Alloc memory for sparse data, where 0 is the index // of B in output vector. res.alloc_sparse(B, 0, A->data_len, w + 1); - float *Bval = (float*) (B->data); - for(int i = 0; i < h; i++) { - for(int j = A->indptr[i]; j < A->indptr[i + 1]; j++) { - // Helps calculate index and after that rowPtr[0:w+1) stores the + float* Bval = (float*)(B->data); + for (int i = 0; i < h; i++) { + for (int j = A->indptr[i]; j < A->indptr[i + 1]; j++) { + // Helps calculate index and after that rowPtr[0:w+1) stores the // correct result of transposed rowPtr. - int index = rowPtr[A->indices[j] + 1]++; - Bval[index] = Aval[j]; + int index = rowPtr[A->indices[j] + 1]++; + Bval[index] = Aval[j]; B->indices[index] = i; } } @@ -70,10 +70,9 @@ MXReturnValue forward(const std::unordered_map& attrs, std::vector* inputs, std::vector* outputs, const OpResource& res) { - // The data types and storage types of inputs and outputs should be the same. - if(inputs->at(0).dtype != outputs->at(0).dtype || - inputs->at(0).stype != outputs->at(0).stype) { - MX_ERROR_MSG << "Error! Expected all inputs and outputs to be the same type." + // The data types and storage types of inputs and outputs should be the same. + if (inputs->at(0).dtype != outputs->at(0).dtype || inputs->at(0).stype != outputs->at(0).stype) { + MX_ERROR_MSG << "Error! Expected all inputs and outputs to be the same type." 
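// Worked example of the two-pass CSR transpose above, for the 2 x 3 matrix
// [[a, 0, b], [0, c, 0]] stored as data = {a, b, c}, indices = {0, 2, 1},
// indptr = {0, 2, 3}:
//
//   counting pass:    rowPtr = {0, 0, 1, 1, 1}    (rowPtr[col + 2]++)
//   accumulate pass:  rowPtr = {0, 0, 1, 2, 3}
//   scatter pass:     B.data = {a, c, b}, B.indices = {0, 1, 0},
//                     and rowPtr[0..w] becomes B's indptr {0, 1, 2, 3}
//
// Column j of A becomes row j of the 3 x 2 result with no sorting required.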
<< "Found input storage type:" << inputs->at(0).stype << " Found output storage type:" << outputs->at(0).stype << " Found input data type:" << inputs->at(0).dtype @@ -93,8 +92,9 @@ MXReturnValue backward(const std::unordered_map& attrs } MXReturnValue parseAttrs(const std::unordered_map& attrs, - int* num_in, int* num_out) { - *num_in = 1; + int* num_in, + int* num_out) { + *num_in = 1; *num_out = 1; return MX_SUCCESS; } @@ -142,42 +142,41 @@ MXReturnValue inferShape(const std::unordered_map& att } REGISTER_OP(my_transposecsr) -.setForward(forward, "cpu") -.setBackward(backward, "cpu") -.setParseAttrs(parseAttrs) -.setInferType(inferType) -.setInferSType(inferSType) -.setInferShape(inferShape); + .setForward(forward, "cpu") + .setBackward(backward, "cpu") + .setParseAttrs(parseAttrs) + .setInferType(inferType) + .setInferSType(inferSType) + .setInferShape(inferShape); /* ------------------------------------------------------------------------- */ class MyStatefulTransposeCSR : public CustomStatefulOp { - public: - explicit MyStatefulTransposeCSR(int count, - std::unordered_map attrs) + public: + explicit MyStatefulTransposeCSR(int count, std::unordered_map attrs) : count(count), attrs_(std::move(attrs)) {} - MXReturnValue Forward(std::vector* inputs, - std::vector* outputs, - const OpResource& op_res) override { - std::cout << "Info: keyword + number of forward: " << ++count << std::endl; - return forward(attrs_, inputs, outputs, op_res); - } + MXReturnValue Forward(std::vector* inputs, + std::vector* outputs, + const OpResource& op_res) override { + std::cout << "Info: keyword + number of forward: " << ++count << std::endl; + return forward(attrs_, inputs, outputs, op_res); + } - MXReturnValue Backward(std::vector* inputs, - std::vector* outputs, - const OpResource& op_res) override { - return backward(attrs_, inputs, outputs, op_res); - } + MXReturnValue Backward(std::vector* inputs, + std::vector* outputs, + const OpResource& op_res) override { + return backward(attrs_, inputs, outputs, op_res); + } - private: - int count; - const std::unordered_map attrs_; + private: + int count; + const std::unordered_map attrs_; }; MXReturnValue createOpState(const std::unordered_map& attrs, const MXContext& ctx, - const std::vector >& in_shapes, + const std::vector>& in_shapes, const std::vector in_types, CustomStatefulOp** op_inst) { // testing passing of keyword arguments @@ -189,11 +188,11 @@ MXReturnValue createOpState(const std::unordered_map& } REGISTER_OP(my_state_transposecsr) -.setParseAttrs(parseAttrs) -.setInferType(inferType) -.setInferSType(inferSType) -.setInferShape(inferShape) -.setCreateOpState(createOpState, "cpu"); + .setParseAttrs(parseAttrs) + .setInferType(inferType) + .setInferSType(inferSType) + .setInferShape(inferShape) + .setCreateOpState(createOpState, "cpu"); MXReturnValue initialize(int version) { if (version >= 10700) { diff --git a/example/extensions/lib_custom_op/transposerowsp_lib.cc b/example/extensions/lib_custom_op/transposerowsp_lib.cc index ffb43db16dbc..6255fab78801 100644 --- a/example/extensions/lib_custom_op/transposerowsp_lib.cc +++ b/example/extensions/lib_custom_op/transposerowsp_lib.cc @@ -31,26 +31,25 @@ using namespace mxnet::ext; void transpose(MXTensor& src, MXTensor& dst, const OpResource& res) { MXSparse* A = src.data(); - MXSparse* B = dst.data(); + MXSparse* B = dst.data(); std::vector shape = src.shape; - int64_t h = shape[0]; - int64_t w = shape[1]; - if(src.stype == kRowSparseStorage) { + int64_t h = shape[0]; + int64_t w = shape[1]; + if 
(src.stype == kRowSparseStorage) { // Keys of the map is the row index of transposed tensors. - // Values of the map is the rows which have non-zero elements. + // Values of the map is the rows which have non-zero elements. std::map> mp; - float *Aval = (float*) (A->data); - for(int i = 0; i < A->data_len; i++) { + float* Aval = (float*)(A->data); + for (int i = 0; i < A->data_len; i++) { int row = i / w; int col = i % w; - row = A->indices[row]; - if(Aval[i] != 0) { - if(mp.find(col) == mp.end()) { - mp[col] = std::vector(h, 0); + row = A->indices[row]; + if (Aval[i] != 0) { + if (mp.find(col) == mp.end()) { + mp[col] = std::vector(h, 0); mp[col][row] = Aval[i]; - } - else { + } else { mp[col][row] = Aval[i]; } } @@ -58,11 +57,11 @@ void transpose(MXTensor& src, MXTensor& dst, const OpResource& res) { // Alloc memory for output tensors. res.alloc_sparse(B, 0, mp.size()); - float *Bval = (float*) (B->data); + float* Bval = (float*)(B->data); int didx = 0, iidx = 0; - for(const auto& i : mp) { + for (const auto& i : mp) { B->indices[iidx++] = i.first; - for(auto j : i.second) { + for (auto j : i.second) { Bval[didx++] = j; } } @@ -74,8 +73,7 @@ MXReturnValue forward(const std::unordered_map& attrs, std::vector* outputs, const OpResource& res) { // The data types and storage types of inputs and outputs should be the same. - if(inputs->at(0).dtype != outputs->at(0).dtype || - inputs->at(0).stype != outputs->at(0).stype) { + if (inputs->at(0).dtype != outputs->at(0).dtype || inputs->at(0).stype != outputs->at(0).stype) { MX_ERROR_MSG << "Error! Expected all inputs and outputs to be the same type." << "Found input storage type:" << inputs->at(0).stype << " Found output storage type:" << outputs->at(0).stype @@ -95,8 +93,9 @@ MXReturnValue backward(const std::unordered_map& attrs } MXReturnValue parseAttrs(const std::unordered_map& attrs, - int* num_in, int* num_out) { - *num_in = 1; + int* num_in, + int* num_out) { + *num_in = 1; *num_out = 1; return MX_SUCCESS; } @@ -144,59 +143,58 @@ MXReturnValue inferShape(const std::unordered_map& att } REGISTER_OP(my_transposerowsp) -.setForward(forward, "cpu") -.setBackward(backward, "cpu") -.setParseAttrs(parseAttrs) -.setInferType(inferType) -.setInferSType(inferSType) -.setInferShape(inferShape); + .setForward(forward, "cpu") + .setBackward(backward, "cpu") + .setParseAttrs(parseAttrs) + .setInferType(inferType) + .setInferSType(inferSType) + .setInferShape(inferShape); /* ------------------------------------------------------------------------- */ class MyStatefulTransposeRowSP : public CustomStatefulOp { - public: - explicit MyStatefulTransposeRowSP(int count, - std::unordered_map attrs) + public: + explicit MyStatefulTransposeRowSP(int count, std::unordered_map attrs) : count(count), attrs_(std::move(attrs)) {} - MXReturnValue Forward(std::vector* inputs, - std::vector* outputs, - const OpResource& op_res) override { - std::cout << "Info: keyword + number of forward: " << ++count << std::endl; - return forward(attrs_, inputs, outputs, op_res); - } + MXReturnValue Forward(std::vector* inputs, + std::vector* outputs, + const OpResource& op_res) override { + std::cout << "Info: keyword + number of forward: " << ++count << std::endl; + return forward(attrs_, inputs, outputs, op_res); + } - MXReturnValue Backward(std::vector* inputs, - std::vector* outputs, - const OpResource& op_res) override { - return backward(attrs_, inputs, outputs, op_res); - } + MXReturnValue Backward(std::vector* inputs, + std::vector* outputs, + const OpResource& op_res) 
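// Row-sparse storage, as handled by transpose() above, keeps only the rows
// that contain non-zeros: `indices` lists the stored row ids and `data`
// holds those rows densely. For w = 3, indices = {0, 2}, and
// data = {a, 0, b, 0, c, 0}, the logical 3 x 3 tensor is
//
//   row 0: [a, 0, b]
//   row 1: [0, 0, 0]   (not stored)
//   row 2: [0, c, 0]
//
// which is why the loop recovers element i's logical position as
// row = A->indices[i / w], col = i % w before bucketing values by column.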
override { + return backward(attrs_, inputs, outputs, op_res); + } - private: - int count; - const std::unordered_map attrs_; + private: + int count; + const std::unordered_map attrs_; }; MXReturnValue createOpState(const std::unordered_map& attrs, const MXContext& ctx, - const std::vector >& in_shapes, + const std::vector>& in_shapes, const std::vector in_types, CustomStatefulOp** op_inst) { // testing passing of keyword arguments int count = attrs.count("test_kw") > 0 ? std::stoi(attrs.at("test_kw")) : 0; // creating stateful operator instance - *op_inst = new MyStatefulTransposeRowSP(count, attrs); + *op_inst = new MyStatefulTransposeRowSP(count, attrs); (*op_inst)->ignore_warn = true; std::cout << "Info: stateful operator created" << std::endl; return MX_SUCCESS; } REGISTER_OP(my_state_transposerowsp) -.setParseAttrs(parseAttrs) -.setInferType(inferType) -.setInferSType(inferSType) -.setInferShape(inferShape) -.setCreateOpState(createOpState, "cpu"); + .setParseAttrs(parseAttrs) + .setInferType(inferType) + .setInferSType(inferSType) + .setInferShape(inferShape) + .setCreateOpState(createOpState, "cpu"); MXReturnValue initialize(int version) { if (version >= 10700) { diff --git a/example/extensions/lib_external_ops/min_ex-inl.h b/example/extensions/lib_external_ops/min_ex-inl.h index 79ce5d407890..a3f1d9e45932 100644 --- a/example/extensions/lib_external_ops/min_ex-inl.h +++ b/example/extensions/lib_external_ops/min_ex-inl.h @@ -36,31 +36,30 @@ namespace mxnet { namespace op { -template +template void MinExForward(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - //do nothing + // do nothing } - inline bool MinExOpShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_attrs, mxnet::ShapeVector* out_attrs) { - //do nothing - return true; + // do nothing + return true; } inline bool MinExOpType(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - //do nothing + std::vector* in_attrs, + std::vector* out_attrs) { + // do nothing return true; } -} // namespace op -} // namespace mxnet +} // namespace op +} // namespace mxnet #endif // MXNET_OPERATOR_TENSOR_MIN_EX_OP_INL_H_ diff --git a/example/extensions/lib_external_ops/min_ex.cc b/example/extensions/lib_external_ops/min_ex.cc index cb9f6dda8b1e..eb6d5bd6fda1 100644 --- a/example/extensions/lib_external_ops/min_ex.cc +++ b/example/extensions/lib_external_ops/min_ex.cc @@ -29,12 +29,12 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(min_ex) -.describe("some description") -.set_num_inputs(0) -.set_num_outputs(0) -.set_attr("FInferShape", MinExOpShape) -.set_attr("FInferType", MinExOpType) -.set_attr("FCompute", MinExForward); + .describe("some description") + .set_num_inputs(0) + .set_num_outputs(0) + .set_attr("FInferShape", MinExOpShape) + .set_attr("FInferType", MinExOpType) + .set_attr("FCompute", MinExForward); -} // namespace op -} // namespace mxnet +} // namespace op +} // namespace mxnet diff --git a/example/extensions/lib_external_ops/min_ex.cu b/example/extensions/lib_external_ops/min_ex.cu index 6257ea703ba3..c65d9b67ccf6 100644 --- a/example/extensions/lib_external_ops/min_ex.cu +++ b/example/extensions/lib_external_ops/min_ex.cu @@ -28,8 +28,7 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(min_ex) -.set_attr("FCompute", MinExForward); +NNVM_REGISTER_OP(min_ex).set_attr("FCompute", MinExForward); } // namespace op } // namespace mxnet diff --git a/example/extensions/lib_pass/pass_lib.cc 
b/example/extensions/lib_pass/pass_lib.cc index fb9a2d42f8d3..2e90c9f0b3e7 100644 --- a/example/extensions/lib_pass/pass_lib.cc +++ b/example/extensions/lib_pass/pass_lib.cc @@ -31,7 +31,7 @@ using namespace mxnet::ext; /* \brief a basic pass that prints out the options and the graph */ -MXReturnValue myPass(mxnet::ext::Graph *g, +MXReturnValue myPass(mxnet::ext::Graph* g, const std::unordered_map& options) { for (auto kv : options) { std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl; @@ -40,8 +40,7 @@ MXReturnValue myPass(mxnet::ext::Graph *g, return MX_SUCCESS; } -REGISTER_PASS(myPass) -.setBody(myPass); +REGISTER_PASS(myPass).setBody(myPass); MXReturnValue initialize(int version) { if (version >= 10700) { diff --git a/example/extensions/lib_subgraph/subgraph_lib.cc b/example/extensions/lib_subgraph/subgraph_lib.cc index 9345b6a13ab4..96b2a1adf0d9 100644 --- a/example/extensions/lib_subgraph/subgraph_lib.cc +++ b/example/extensions/lib_subgraph/subgraph_lib.cc @@ -32,17 +32,17 @@ using namespace mxnet::ext; /* function to execute log operator on floats */ -void myLog(MXTensor *in, MXTensor *out) { - float* inp = in->data(); +void myLog(MXTensor* in, MXTensor* out) { + float* inp = in->data(); float* outp = out->data(); for (int64_t i = 0; i < in->size(); i++) { outp[i] = logf(inp[i]); } } /* function to execute exp operator on floats */ -void myExp(MXTensor *in, MXTensor *out) { - float* inp = in->data(); - float* outp =out->data(); +void myExp(MXTensor* in, MXTensor* out) { + float* inp = in->data(); + float* outp = out->data(); for (int64_t i = 0; i < in->size(); i++) { outp[i] = expf(inp[i]); } @@ -51,15 +51,15 @@ void myExp(MXTensor *in, MXTensor *out) { /* function to execute ops in subgraph * In MXNet, subgraphs are sorted in topological order * so all we need to do is go through the ops in order - * and execute each op. + * and execute each op. 
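 *
 * A sketch of that contract, in terms of the Graph/Node API used below:
 * topological order guarantees every input of a node is computed before
 * the node itself is visited, so a single pass suffices:
 *
 *   for each node n in subgraph (in order):
 *     if n->op == "null":  bind the next subgraph input tensor to n
 *     else:                read n->inputs[*].node->tensor, run the op,
 *                          and store the result in n->tensor for consumers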
*/ MXReturnValue myExecutor(std::vector* inputs, std::vector* outputs, - mxnet::ext::Graph *subgraph) { + mxnet::ext::Graph* subgraph) { std::cout << "Info: subgraph is: " << std::endl; subgraph->print(); - //counter for inputs + // counter for inputs int input_cnt = 0; // temporary tensor storage std::vector data; @@ -67,7 +67,7 @@ MXReturnValue myExecutor(std::vector* inputs, std::vector to_free; // loop over nodes - for(int i=0; isize(); i++) { + for (int i = 0; i < subgraph->size(); i++) { mxnet::ext::Node* node = subgraph->getNode(i); // handle each op type if (node->op.compare("null") == 0) { @@ -75,26 +75,36 @@ MXReturnValue myExecutor(std::vector* inputs, node->tensor = &inputs->at(input_cnt++); } else if (node->op.compare("log") == 0) { // get input tensor based on node ID inputs from data storage - MXTensor *input = node->inputs.at(0).node->tensor; + MXTensor* input = node->inputs.at(0).node->tensor; // create temporary storage - MXTensor tmp(malloc(input->size()*4), input->shape, input->dtype, 0, MXContext::CPU(0), kDefaultStorage); // NOLINT + MXTensor tmp(malloc(input->size() * 4), + input->shape, + input->dtype, + 0, + MXContext::CPU(0), + kDefaultStorage); // NOLINT // save allocated ptr to free later to_free.push_back(tmp.data_ptr); // execute log operator - myLog(input,&tmp); + myLog(input, &tmp); // add output tensor to data storage data.push_back(tmp); // set tensor for this node so we can read it later node->tensor = &data.back(); } else if (node->op.compare("exp") == 0) { // get input tensor based on node ID inputs from data storage - MXTensor *input = node->inputs.at(0).node->tensor; + MXTensor* input = node->inputs.at(0).node->tensor; // create temporary storage - MXTensor tmp(malloc(input->size()*4), input->shape, input->dtype, 0, MXContext::CPU(0), kDefaultStorage); // NOLINT + MXTensor tmp(malloc(input->size() * 4), + input->shape, + input->dtype, + 0, + MXContext::CPU(0), + kDefaultStorage); // NOLINT // save allocated ptr to free later to_free.push_back(tmp.data_ptr); - // execute exp operator - myExp(input,&tmp); + // execute exp operator + myExp(input, &tmp); // add output tensor to data storage data.push_back(tmp); // set tensor for this node so we can read it later @@ -107,15 +117,15 @@ MXReturnValue myExecutor(std::vector* inputs, return MX_FAIL; } } - + // copy all operator results to outputs of subgraph for (int j = 0; j < subgraph->outputs.size(); j++) { // get computed result - MXTensor *result = subgraph->outputs[j].node->tensor; + MXTensor* result = subgraph->outputs[j].node->tensor; // get output tensor to pass to MX - MXTensor &out = outputs->at(j); - float *out_data = out.data(); - float *res_data = result->data(); + MXTensor& out = outputs->at(j); + float* out_data = out.data(); + float* res_data = result->data(); // loop and copy data for (int64_t i = 0; i < result->size(); i++) { out_data[i] = res_data[i]; @@ -126,16 +136,15 @@ MXReturnValue myExecutor(std::vector* inputs, for (void* ptr : to_free) { free(ptr); // NOLINT } - + return MX_SUCCESS; } class MyStatefulOp : public CustomStatefulOp { public: - explicit MyStatefulOp(std::string json, - const std::unordered_map& attrs) - : attrs_(attrs) { - for (const auto &kv : attrs) { + explicit MyStatefulOp(std::string json, const std::unordered_map& attrs) + : attrs_(attrs) { + for (const auto& kv : attrs) { std::cout << "subgraphOp attributes: " << kv.first << " ==> " << kv.second << std::endl; } subgraph_ = mxnet::ext::Graph::fromString(json); @@ -144,14 +153,14 @@ class MyStatefulOp : public 
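// One detail of myExecutor() worth noting: the temporary buffers above are
// sized as input->size() * 4 because every tensor in these examples is
// float32 (4 bytes per element). A dtype-agnostic variant would scale by
// the element width instead; `bytes_per_element` below is a hypothetical
// helper, not part of the extension API:
//
//   void* buf = malloc(input->size() * bytes_per_element(input->dtype));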
CustomStatefulOp { MXReturnValue Forward(std::vector* inputs, std::vector* outputs, const OpResource& op_res) override { - if(attrs_.count(MX_STR_EXTRA_INPUTS) > 0 && std::stoi(attrs_.at(MX_STR_EXTRA_INPUTS)) > 0) + if (attrs_.count(MX_STR_EXTRA_INPUTS) > 0 && std::stoi(attrs_.at(MX_STR_EXTRA_INPUTS)) > 0) std::cout << "forward::extra_inputs(" << attrs_.at(MX_STR_EXTRA_INPUTS) << ")::inputs [" - << inputs->size() << "]" << std::endl; + << inputs->size() << "]" << std::endl; return myExecutor(inputs, outputs, subgraph_); } private: - mxnet::ext::Graph *subgraph_; + mxnet::ext::Graph* subgraph_; const std::unordered_map attrs_; }; @@ -172,11 +181,9 @@ MXReturnValue createOpState(const std::unordered_map& return MX_SUCCESS; } -REGISTER_OP(_custom_subgraph_op) -.setIsSubgraphOp() -.setCreateOpState(createOpState, "cpu"); +REGISTER_OP(_custom_subgraph_op).setIsSubgraphOp().setCreateOpState(createOpState, "cpu"); -const std::vector op_names({"exp","log"}); +const std::vector op_names({"exp", "log"}); MXReturnValue mySupportedOps(const mxnet::ext::Graph* graph, std::vector* ids, @@ -185,22 +192,22 @@ MXReturnValue mySupportedOps(const mxnet::ext::Graph* graph, std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl; } - //loop over nodes - for(int i=0; isize(); i++) { - const mxnet::ext::Node *node = graph->getNode(i); + // loop over nodes + for (int i = 0; i < graph->size(); i++) { + const mxnet::ext::Node* node = graph->getNode(i); - //get shape/type if available + // get shape/type if available std::string shape; int dtype = -1; - if(node->attrs.count("shape") > 0) + if (node->attrs.count("shape") > 0) shape = node->attrs.at("shape"); - if(node->attrs.count("dtype") > 0) + if (node->attrs.count("dtype") > 0) dtype = std::stoi(node->attrs.at("dtype")); - //check if op dtype is float, and if option was specified to require float types - if((dtype == kFloat32 && options.count("reqFloat") > 0) || options.count("reqFloat") == 0) { - //check if op is in allowlist - if(std::find(op_names.begin(),op_names.end(),node->op.c_str()) != op_names.end()) { + // check if op dtype is float, and if option was specified to require float types + if ((dtype == kFloat32 && options.count("reqFloat") > 0) || options.count("reqFloat") == 0) { + // check if op is in allowlist + if (std::find(op_names.begin(), op_names.end(), node->op.c_str()) != op_names.end()) { // found op in allowlist, set value to -1 to include op in any subgraph ids->at(i) = -1; } @@ -209,7 +216,9 @@ MXReturnValue mySupportedOps(const mxnet::ext::Graph* graph, return MX_SUCCESS; } -MXReturnValue myReviewSubgraph(const mxnet::ext::Graph *subgraph, int subgraph_id, bool* accept, +MXReturnValue myReviewSubgraph(const mxnet::ext::Graph* subgraph, + int subgraph_id, + bool* accept, const std::unordered_map& options, std::unordered_map* attrs) { for (auto kv : options) { @@ -221,7 +230,7 @@ MXReturnValue myReviewSubgraph(const mxnet::ext::Graph *subgraph, int subgraph_i std::cout << sg << std::endl; // check if option `reject` was specified, and if so check if value is 'True' - if(options.count("reject") > 0 && options.at("reject").compare("True") == 0) { + if (options.count("reject") > 0 && options.at("reject").compare("True") == 0) { // if specified, reject the subgraph. 
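// The partitioning flow these callbacks implement: mySupportedOps() marks
// the node ids it can handle (-1 = free to join any subgraph), MXNet then
// groups the marked nodes into candidate subgraphs, and myReviewSubgraph()
// gets a final look at each candidate: it may veto it through *accept, and
// any attributes it adds (the "myKey"/"myVal" pair below) are handed to the
// subgraph op's attrs at execution time.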
this is only used for testing *accept = false; std::cout << "rejecting subgraph" << std::endl; @@ -230,43 +239,42 @@ MXReturnValue myReviewSubgraph(const mxnet::ext::Graph *subgraph, int subgraph_i std::cout << "accepting subgraph" << std::endl; } - attrs->emplace("myKey","myVal"); + attrs->emplace("myKey", "myVal"); return MX_SUCCESS; } REGISTER_PARTITIONER(myProp) -.addStrategy("strategy1", "_custom_subgraph_op") -.setSupportedOps("strategy1", mySupportedOps) -.setReviewSubgraph("strategy1", myReviewSubgraph); + .addStrategy("strategy1", "_custom_subgraph_op") + .setSupportedOps("strategy1", mySupportedOps) + .setReviewSubgraph("strategy1", myReviewSubgraph); class MySelector : public CustomOpSelector { public: - MySelector(const mxnet::ext::Graph *graph, - const std::unordered_map& options) : - graph_(graph), options_(options) { + MySelector(const mxnet::ext::Graph* graph, + const std::unordered_map& options) + : graph_(graph), options_(options) { for (auto kv : options) { - std::cout << "selector options: " << kv.first - << " ==> " << kv.second << std::endl; + std::cout << "selector options: " << kv.first << " ==> " << kv.second << std::endl; } } bool chooseNode(int nodeID) { - const mxnet::ext::Node *node = graph_->getNode(nodeID); + const mxnet::ext::Node* node = graph_->getNode(nodeID); - //get shape/type if available + // get shape/type if available std::string shape; int dtype = -1; - if(node->attrs.count("shape") > 0) + if (node->attrs.count("shape") > 0) shape = node->attrs.at("shape"); - if(node->attrs.count("dtype") > 0) + if (node->attrs.count("dtype") > 0) dtype = std::stoi(node->attrs.at("dtype")); - //check if op dtype is float, and if option was specified to require float types - if((dtype == kFloat32 && options_.count("reqFloat") > 0) || options_.count("reqFloat") == 0) { - //check if op is in allowlist - if(std::find(op_names.begin(),op_names.end(),node->op.c_str()) != op_names.end()) { + // check if op dtype is float, and if option was specified to require float types + if ((dtype == kFloat32 && options_.count("reqFloat") > 0) || options_.count("reqFloat") == 0) { + // check if op is in allowlist + if (std::find(op_names.begin(), op_names.end(), node->op.c_str()) != op_names.end()) { // found op in allowlist, return true to include op subgraph - return true; + return true; } } return false; @@ -280,17 +288,18 @@ class MySelector : public CustomOpSelector { bool SelectOutput(int nodeID, int output_nodeID) override { return chooseNode(output_nodeID); } - virtual void Filter(std::vector& candidates, - std::vector& keep) { + virtual void Filter(std::vector& candidates, std::vector& keep) { keep.insert(keep.end(), candidates.begin(), candidates.end()); } void Reset() override {} + private: - const mxnet::ext::Graph *graph_; + const mxnet::ext::Graph* graph_; const std::unordered_map options_; }; -MXReturnValue createSelector(const mxnet::ext::Graph *graph, CustomOpSelector** sel_inst, +MXReturnValue createSelector(const mxnet::ext::Graph* graph, + CustomOpSelector** sel_inst, const std::unordered_map& options) { *sel_inst = new MySelector(graph, options); std::cout << "Info: selector created" << std::endl; @@ -298,39 +307,37 @@ MXReturnValue createSelector(const mxnet::ext::Graph *graph, CustomOpSelector** } REGISTER_PARTITIONER(mySelect) -.addStrategy("strategy1", "_custom_subgraph_op") -.setCreateSelector("strategy1", createSelector) -.setReviewSubgraph("strategy1", myReviewSubgraph); + .addStrategy("strategy1", "_custom_subgraph_op") + .setCreateSelector("strategy1", 
createSelector) + .setReviewSubgraph("strategy1", myReviewSubgraph); /* \brief a basic pass that adds a new input for subgraph ops */ -MXReturnValue addInputPass(mxnet::ext::Graph *graph, - const std::unordered_map& options) { - //find node with '_custom_subgraph_op' op type - for(int i=0; isize(); i++) { +MXReturnValue addInputPass(mxnet::ext::Graph* graph, + const std::unordered_map& options) { + // find node with '_custom_subgraph_op' op type + for (int i = 0; i < graph->size(); i++) { mxnet::ext::Node* n = graph->getNode(i); - if(n->op.compare("_custom_subgraph_op") == 0) { - //set extra input + if (n->op.compare("_custom_subgraph_op") == 0) { + // set extra input n->attrs[MX_STR_EXTRA_INPUTS] = std::to_string(1); - - //create a new input Node + + // create a new input Node Node* input = graph->addNode(n->name + "_input", "null"); - //set this node as an input in the graph + // set this node as an input in the graph graph->inputs.push_back(input); - //connect new input to node - input->outputs.push_back({n,(int)(n->inputs.size())}); - //connect node to new input - n->inputs.push_back({input,0}); + // connect new input to node + input->outputs.push_back({n, (int)(n->inputs.size())}); + // connect node to new input + n->inputs.push_back({input, 0}); // add a corresponding tensor for this input - input->alloc_arg({1},MXContext::CPU(0),kFloat32); + input->alloc_arg({1}, MXContext::CPU(0), kFloat32); } } return MX_SUCCESS; } -REGISTER_PASS(addInputPass) -.setBody(addInputPass); - +REGISTER_PASS(addInputPass).setBody(addInputPass); MXReturnValue initialize(int version) { if (version >= 10700) { From 00e8dfb9102c340b8990313ccf3c86ca8c1f86be Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Thu, 4 Nov 2021 09:01:56 +0100 Subject: [PATCH 06/10] [PLUGIN] Re-format .cc .h files --- plugin/opencv/cv_api.cc | 150 +++++++++++++++------------- plugin/opencv/cv_api.h | 37 ++++--- plugin/sframe/iter_sframe.cc | 98 +++++++++---------- plugin/torch/torch_base.cc | 8 +- plugin/torch/torch_base.h | 53 +++++----- plugin/torch/torch_criterion-inl.h | 85 ++++++++-------- plugin/torch/torch_criterion.cc | 12 +-- plugin/torch/torch_criterion.cu | 6 +- plugin/torch/torch_function.cc | 55 +++++------ plugin/torch/torch_function.h | 136 ++++++++++++++------------ plugin/torch/torch_module-inl.h | 136 +++++++++++++------------- plugin/torch/torch_module.cc | 12 +-- plugin/torch/torch_module.cu | 6 +- plugin/warpctc/warpctc-inl.h | 151 +++++++++++++---------------- plugin/warpctc/warpctc.cc | 16 +-- plugin/warpctc/warpctc.cu | 6 +- 16 files changed, 487 insertions(+), 480 deletions(-) diff --git a/plugin/opencv/cv_api.cc b/plugin/opencv/cv_api.cc index b0915fd40579..bbbe96c3226b 100644 --- a/plugin/opencv/cv_api.cc +++ b/plugin/opencv/cv_api.cc @@ -30,35 +30,37 @@ #include "cv_api.h" #include "../../src/c_api/c_api_common.h" - using namespace mxnet; // http://www.64lines.com/jpeg-width-height -// Gets the JPEG size from the array of data passed to the function, file reference: http://www.obrador.com/essentialjpeg/headerinfo.htm -bool get_jpeg_size(const unsigned char* data, mx_uint data_size, mx_uint *width, mx_uint *height) { +// Gets the JPEG size from the array of data passed to the function, file reference: +// http://www.obrador.com/essentialjpeg/headerinfo.htm +bool get_jpeg_size(const unsigned char* data, mx_uint data_size, mx_uint* width, mx_uint* height) { // Check for valid JPEG image mx_uint i = 0; // Keeps track of the position within the file - if (data[i] == 0xFF && data[i+1] == 0xD8 && data[i+2] == 
0xFF && data[i+3] == 0xE0) { + if (data[i] == 0xFF && data[i + 1] == 0xD8 && data[i + 2] == 0xFF && data[i + 3] == 0xE0) { i += 4; // Check for valid JPEG header (null terminated JFIF) - if (data[i+2] == 'J' && data[i+3] == 'F' && data[i+4] == 'I' - && data[i+5] == 'F' && data[i+6] == 0x00) { + if (data[i + 2] == 'J' && data[i + 3] == 'F' && data[i + 4] == 'I' && data[i + 5] == 'F' && + data[i + 6] == 0x00) { // Retrieve the block length of the first block since // the first block will not contain the size of file - uint16_t block_length = data[i] * 256 + data[i+1]; + uint16_t block_length = data[i] * 256 + data[i + 1]; while (i < data_size) { - i+=block_length; // Increase the file index to get to the next block - if (i >= data_size) return false; // Check to protect against segmentation faults - if (data[i] != 0xFF) return false; // Check that we are truly at the start of another block - if (data[i+1] == 0xC0) { + i += block_length; // Increase the file index to get to the next block + if (i >= data_size) + return false; // Check to protect against segmentation faults + if (data[i] != 0xFF) + return false; // Check that we are truly at the start of another block + if (data[i + 1] == 0xC0) { // 0xFFC0 is the "Start of frame" marker which contains the file size // The structure of the 0xFFC0 block is quite simple // [0xFFC0][ushort length][uchar precision][ushort x][ushort y] - *height = data[i+5]*256 + data[i+6]; - *width = data[i+7]*256 + data[i+8]; + *height = data[i + 5] * 256 + data[i + 6]; + *width = data[i + 7] * 256 + data[i + 8]; return true; } else { - i+=2; // Skip the block marker - block_length = data[i] * 256 + data[i+1]; // Go to the next block + i += 2; // Skip the block marker + block_length = data[i] * 256 + data[i + 1]; // Go to the next block } } return false; // If this point is reached then no size was found @@ -70,53 +72,61 @@ bool get_jpeg_size(const unsigned char* data, mx_uint data_size, mx_uint *width, } } -bool get_png_size(const unsigned char* data, mx_uint data_size, mx_uint *width, mx_uint *height) { - if (data[0] == 0x89 && data[1] == 0x50 && data[2] ==0x4E && data[3] == 0x47) { +bool get_png_size(const unsigned char* data, mx_uint data_size, mx_uint* width, mx_uint* height) { + if (data[0] == 0x89 && data[1] == 0x50 && data[2] == 0x4E && data[3] == 0x47) { unsigned char const* p = data + 16; - *width = ((p[0]*256 + p[1])*256 + p[2])*256 + p[3]; + *width = ((p[0] * 256 + p[1]) * 256 + p[2]) * 256 + p[3]; p += 4; - *height = ((p[0]*256 + p[1])*256 + p[2])*256 + p[3]; + *height = ((p[0] * 256 + p[1]) * 256 + p[2]) * 256 + p[3]; return true; } else { return false; } } -MXNET_DLL int MXCVImdecode(const unsigned char *img, const mx_uint len, - const int flag, NDArrayHandle *out) { +MXNET_DLL int MXCVImdecode(const unsigned char* img, + const mx_uint len, + const int flag, + NDArrayHandle* out) { API_BEGIN(); mx_uint dims[3]; CHECK_GE(flag, 0) << "flag must be 0 (grayscale) or 1 (colored)."; dims[2] = flag == 0 ? 
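// get_png_size() above relies on the fixed PNG layout: an 8-byte signature
// followed by the IHDR chunk's 4-byte length and 4-byte type, so the
// big-endian width starts at byte 16 and the height at byte 20, exactly
// the two 4-byte reads made from `data + 16`. The same big-endian decode,
// factored out as a sketch:
//
//   static mx_uint read_be32(const unsigned char* p) {
//     return ((p[0] * 256u + p[1]) * 256u + p[2]) * 256u + p[3];
//   }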
1 : 3; - if (get_jpeg_size(img, len, dims+1, dims)) { - } else if (get_png_size(img, len, dims+1, dims)) { + if (get_jpeg_size(img, len, dims + 1, dims)) { + } else if (get_png_size(img, len, dims + 1, dims)) { } else { LOG(FATAL) << "Only supports png and jpg."; } - NDArray ndout(mxnet::TShape(dims, dims+3), Context::CPU(), true, mshadow::kUint8); - unsigned char *img_cpy = new unsigned char[len]; - memcpy(img_cpy, img, sizeof(unsigned char)*len); - Engine::Get()->PushSync([=](RunContext ctx){ - ndout.CheckAndAlloc(); - cv::Mat buf(1, len, CV_8U, img_cpy); - cv::Mat dst(dims[0], dims[1], flag == 0 ? CV_8U : CV_8UC3, ndout.data().dptr_); + NDArray ndout(mxnet::TShape(dims, dims + 3), Context::CPU(), true, mshadow::kUint8); + unsigned char* img_cpy = new unsigned char[len]; + memcpy(img_cpy, img, sizeof(unsigned char) * len); + Engine::Get()->PushSync( + [=](RunContext ctx) { + ndout.CheckAndAlloc(); + cv::Mat buf(1, len, CV_8U, img_cpy); + cv::Mat dst(dims[0], dims[1], flag == 0 ? CV_8U : CV_8UC3, ndout.data().dptr_); #if (CV_MAJOR_VERSION > 3 || (CV_MAJOR_VERSION == 3 && CV_MINOR_VERSION >= 3)) - cv::imdecode(buf, flag | cv::IMREAD_IGNORE_ORIENTATION, &dst); + cv::imdecode(buf, flag | cv::IMREAD_IGNORE_ORIENTATION, &dst); #else - cv::imdecode(buf, flag, &dst); + cv::imdecode(buf, flag, &dst); #endif - CHECK(!dst.empty()); - delete[] img_cpy; - }, ndout.ctx(), {}, {ndout.var()}); - NDArray *tmp = new NDArray(); - *tmp = ndout; - *out = tmp; + CHECK(!dst.empty()); + delete[] img_cpy; + }, + ndout.ctx(), + {}, + {ndout.var()}); + NDArray* tmp = new NDArray(); + *tmp = ndout; + *out = tmp; API_END(); } - -MXNET_DLL int MXCVResize(NDArrayHandle src, const mx_uint w, const mx_uint h, - const int interpolation, NDArrayHandle *out) { +MXNET_DLL int MXCVResize(NDArrayHandle src, + const mx_uint w, + const mx_uint h, + const int interpolation, + NDArrayHandle* out) { API_BEGIN(); NDArray ndsrc = *static_cast(src); CHECK_EQ(ndsrc.shape().ndim(), 3); @@ -124,19 +134,23 @@ MXNET_DLL int MXCVResize(NDArrayHandle src, const mx_uint w, const mx_uint h, CHECK_EQ(ndsrc.dtype(), mshadow::kUint8); mx_uint dims[3] = {h, w, ndsrc.shape()[2]}; - NDArray ndout(mxnet::TShape(dims, dims+3), Context::CPU(), true, mshadow::kUint8); + NDArray ndout(mxnet::TShape(dims, dims + 3), Context::CPU(), true, mshadow::kUint8); - Engine::Get()->PushSync([=](RunContext ctx){ - ndout.CheckAndAlloc(); - cv::Mat buf(ndsrc.shape()[0], ndsrc.shape()[1], - dims[2] == 3 ? CV_8UC3 : CV_8U, ndsrc.data().dptr_); - cv::Mat dst(h, w, dims[2] == 3 ? CV_8UC3 : CV_8U, ndout.data().dptr_); - cv::resize(buf, dst, cv::Size(w, h), 0, 0, interpolation); - CHECK(!dst.empty()); - }, ndout.ctx(), {ndsrc.var()}, {ndout.var()}); - NDArray *tmp = new NDArray(); - *tmp = ndout; - *out = tmp; + Engine::Get()->PushSync( + [=](RunContext ctx) { + ndout.CheckAndAlloc(); + cv::Mat buf( + ndsrc.shape()[0], ndsrc.shape()[1], dims[2] == 3 ? CV_8UC3 : CV_8U, ndsrc.data().dptr_); + cv::Mat dst(h, w, dims[2] == 3 ? 
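// Every entry point in cv_api.cc follows the same dependency-engine
// pattern: the OpenCV work is wrapped in a lambda and handed to
// Engine::PushSync() together with the NDArray variables the lambda reads
// and the ones it mutates, so the engine can order it against other
// pending operations on the same arrays:
//
//   Engine::Get()->PushSync(
//       [=](RunContext ctx) { /* fill ndout, possibly reading ndsrc */ },
//       ndout.ctx(),
//       {ndsrc.var()},   // read dependencies (empty for MXCVImdecode)
//       {ndout.var()});  // write dependencies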
CV_8UC3 : CV_8U, ndout.data().dptr_); + cv::resize(buf, dst, cv::Size(w, h), 0, 0, interpolation); + CHECK(!dst.empty()); + }, + ndout.ctx(), + {ndsrc.var()}, + {ndout.var()}); + NDArray* tmp = new NDArray(); + *tmp = ndout; + *out = tmp; API_END(); } @@ -147,7 +161,7 @@ MXNET_DLL int MXCVcopyMakeBorder(NDArrayHandle src, const int right, const int type, const double value, - NDArrayHandle *out) { + NDArrayHandle* out) { API_BEGIN(); NDArray ndsrc = *static_cast(src); CHECK_EQ(ndsrc.shape().ndim(), 3); @@ -155,18 +169,22 @@ MXNET_DLL int MXCVcopyMakeBorder(NDArrayHandle src, CHECK_EQ(ndsrc.dtype(), mshadow::kUint8); int h = ndsrc.shape()[0], w = ndsrc.shape()[1], c = ndsrc.shape()[2]; - mx_uint dims[3] = {top+h+bot, left+w+right, c}; - NDArray ndout(mxnet::TShape(dims, dims+3), Context::CPU(), true, mshadow::kUint8); + mx_uint dims[3] = {top + h + bot, left + w + right, c}; + NDArray ndout(mxnet::TShape(dims, dims + 3), Context::CPU(), true, mshadow::kUint8); - Engine::Get()->PushSync([=](RunContext ctx){ - ndout.CheckAndAlloc(); - cv::Mat buf(h, w, c == 3 ? CV_8UC3 : CV_8U, ndsrc.data().dptr_); - cv::Mat dst(top+h+bot, left+w+right, c == 3 ? CV_8UC3 : CV_8U, ndout.data().dptr_); - cv::copyMakeBorder(buf, dst, top, bot, left, right, type, cv::Scalar(value)); - CHECK(!dst.empty()); - }, ndout.ctx(), {ndsrc.var()}, {ndout.var()}); - NDArray *tmp = new NDArray(); - *tmp = ndout; - *out = tmp; + Engine::Get()->PushSync( + [=](RunContext ctx) { + ndout.CheckAndAlloc(); + cv::Mat buf(h, w, c == 3 ? CV_8UC3 : CV_8U, ndsrc.data().dptr_); + cv::Mat dst(top + h + bot, left + w + right, c == 3 ? CV_8UC3 : CV_8U, ndout.data().dptr_); + cv::copyMakeBorder(buf, dst, top, bot, left, right, type, cv::Scalar(value)); + CHECK(!dst.empty()); + }, + ndout.ctx(), + {ndsrc.var()}, + {ndout.var()}); + NDArray* tmp = new NDArray(); + *tmp = ndout; + *out = tmp; API_END(); } diff --git a/plugin/opencv/cv_api.h b/plugin/opencv/cv_api.h index b318041eb6b9..c8ab701e0521 100644 --- a/plugin/opencv/cv_api.h +++ b/plugin/opencv/cv_api.h @@ -28,27 +28,24 @@ #include -MXNET_DLL int MXCVImdecode( - const unsigned char *img, - const mx_uint len, - const int flag, - NDArrayHandle *out); +MXNET_DLL int MXCVImdecode(const unsigned char* img, + const mx_uint len, + const int flag, + NDArrayHandle* out); -MXNET_DLL int MXCVResize( - NDArrayHandle src, - const mx_uint w, - const mx_uint h, - const int interpolation, - NDArrayHandle *out); +MXNET_DLL int MXCVResize(NDArrayHandle src, + const mx_uint w, + const mx_uint h, + const int interpolation, + NDArrayHandle* out); -MXNET_DLL int MXCVcopyMakeBorder( - NDArrayHandle src, - const int top, - const int bot, - const int left, - const int right, - const int type, - const double value, - NDArrayHandle *out); +MXNET_DLL int MXCVcopyMakeBorder(NDArrayHandle src, + const int top, + const int bot, + const int left, + const int right, + const int type, + const double value, + NDArrayHandle* out); #endif // PLUGIN_OPENCV_CV_API_H_ diff --git a/plugin/sframe/iter_sframe.cc b/plugin/sframe/iter_sframe.cc index 6a6b03f9c2fb..8834e8872ab4 100644 --- a/plugin/sframe/iter_sframe.cc +++ b/plugin/sframe/iter_sframe.cc @@ -22,7 +22,7 @@ * \file iter_sframe_image.cc * \brief * \author Bing Xu -*/ + */ #include #include @@ -53,16 +53,17 @@ struct SFrameParam : public dmlc::Parameter { mxnet::TShape data_shape; mxnet::TShape label_shape; DMLC_DECLARE_PARAMETER(SFrameParam) { - DMLC_DECLARE_FIELD(path_sframe).set_default("") - .describe("Dataset Param: path to image dataset sframe"); - 
DMLC_DECLARE_FIELD(data_field).set_default("data")
-    .describe("Dataset Param: data column in sframe");
-    DMLC_DECLARE_FIELD(label_field).set_default("label")
-    .describe("Dataset Param: label column in sframe");
-    DMLC_DECLARE_FIELD(data_shape)
-    .describe("Dataset Param: input data instance shape");
-    DMLC_DECLARE_FIELD(label_shape)
-    .describe("Dataset Param: input label instance shape");
+    DMLC_DECLARE_FIELD(path_sframe)
+        .set_default("")
+        .describe("Dataset Param: path to image dataset sframe");
+    DMLC_DECLARE_FIELD(data_field)
+        .set_default("data")
+        .describe("Dataset Param: data column in sframe");
+    DMLC_DECLARE_FIELD(label_field)
+        .set_default("label")
+        .describe("Dataset Param: label column in sframe");
+    DMLC_DECLARE_FIELD(data_shape).describe("Dataset Param: input data instance shape");
+    DMLC_DECLARE_FIELD(label_shape).describe("Dataset Param: input label instance shape");
   }
 };  // struct SFrameImageParam
@@ -80,12 +81,12 @@ class SFrameIterBase : public IIterator<DataInst> {
   virtual ~SFrameIterBase() {}
 
   virtual void BeforeFirst() {
-    idx_ = 0;
-    *range_it_ = sframe_.range_iterator();
+    idx_        = 0;
+    *range_it_  = sframe_.range_iterator();
     current_it_ = range_it_->begin();
   }
 
-  virtual const DataInst &Value(void) const {
+  virtual const DataInst& Value(void) const {
     return out_;
   }
@@ -109,8 +110,8 @@ class SFrameIterBase : public IIterator<DataInst> {
  protected:
   /*! \brief copy data */
-  template<int dim>
-  void Copy_(mshadow::Tensor<cpu, dim> tensor, const graphlab::flex_vec &vec) {
+  template <int dim>
+  void Copy_(mshadow::Tensor<cpu, dim> tensor, const graphlab::flex_vec& vec) {
     CHECK_EQ(tensor.shape_.Size(), vec.size());
     CHECK_EQ(tensor.CheckContiguous(), true);
     mshadow::Tensor<cpu, 1> flatten(tensor.dptr_, mshadow::Shape1(tensor.shape_.Size()));
@@ -122,14 +123,12 @@ class SFrameIterBase : public IIterator<DataInst> {
 
 class SFrameImageIter : public SFrameIterBase {
  public:
-  SFrameImageIter() :
-    augmenter_(new ImageAugmenter()), prnd_(new common::RANDOM_ENGINE(8964)) {}
+  SFrameImageIter() : augmenter_(new ImageAugmenter()), prnd_(new common::RANDOM_ENGINE(8964)) {}
 
   void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
     Parent::Init(kwargs);
     augmenter_->Init(kwargs);
-    CHECK_EQ(Parent::param_.data_shape.ndim(), 3)
-      << "Image shape must be (channel, height, width)";
+    CHECK_EQ(Parent::param_.data_shape.ndim(), 3) << "Image shape must be (channel, height, width)";
   }
 
   bool Next(void) override {
@@ -141,21 +140,22 @@ class SFrameImageIter : public SFrameIterBase {
     // TODO(bing): check not decoded
     // TODO(bing): check img shape
     CHECK_EQ(gl_label.size(), Parent::param_.label_shape.Size()) << "Label shape does not match";
-    const unsigned char *raw_data = gl_img.get_image_data();
+    const unsigned char* raw_data = gl_img.get_image_data();
     cv::Mat res;
     cv::Mat buf(1, gl_img.m_image_data_size, CV_8U, const_cast<unsigned char*>(raw_data));
-    res = cv::imdecode(buf, -1);
-    res = augmenter_->Process(res, prnd_.get());
+    res                  = cv::imdecode(buf, -1);
+    res                  = augmenter_->Process(res, prnd_.get());
     const int n_channels = res.channels();
     if (!tmp_.Size()) {
-      tmp_.Push(Parent::idx_++,
-                Parent::param_.data_shape.get<3>(),
-                Parent::param_.label_shape.get<1>());
+      tmp_.Push(
+          Parent::idx_++, Parent::param_.data_shape.get<3>(), Parent::param_.label_shape.get<1>());
     }
     mshadow::Tensor<cpu, 3> data = Parent::tmp_.data().Back();
     std::vector<int> swap_indices;
-    if (n_channels == 1) swap_indices = {0};
-    if (n_channels == 3) swap_indices = {2, 1, 0};
+    if (n_channels == 1)
+      swap_indices = {0};
+    if (n_channels == 3)
+      swap_indices = {2, 1, 0};
     for (int i = 0; i < res.rows; ++i) {
       uchar* im_data = 
res.ptr(i); for (int j = 0; j < res.cols; ++j) { @@ -188,14 +188,13 @@ class SFrameDataIter : public SFrameIterBase { if (Parent::current_it_ == Parent::range_it_->end()) { return false; } - graphlab::flex_vec gl_data = (*Parent::current_it_)[0]; + graphlab::flex_vec gl_data = (*Parent::current_it_)[0]; graphlab::flex_vec gl_label = (*Parent::current_it_)[1]; CHECK_EQ(gl_data.size(), Parent::param_.data_shape.Size()) << "Data shape does not match"; CHECK_EQ(gl_label.size(), Parent::param_.label_shape.Size()) << "Label shape does not match"; if (!Parent::tmp_.Size()) { - Parent::tmp_.Push(Parent::idx_++, - Parent::param_.data_shape.get<3>(), - Parent::param_.label_shape.get<1>()); + Parent::tmp_.Push( + Parent::idx_++, Parent::param_.data_shape.get<3>(), Parent::param_.label_shape.get<1>()); } mshadow::Tensor data = Parent::tmp_.data().Back(); Parent::Copy_<3>(data, gl_data); @@ -214,31 +213,22 @@ class SFrameDataIter : public SFrameIterBase { DMLC_REGISTER_PARAMETER(SFrameParam); MXNET_REGISTER_IO_ITER(SFrameImageIter) -.describe("Naive SFrame image iterator prototype") -.add_arguments(SFrameParam::__FIELDS__()) -.add_arguments(BatchParam::__FIELDS__()) -.add_arguments(PrefetcherParam::__FIELDS__()) -.add_arguments(ImageAugmentParam::__FIELDS__()) -.add_arguments(ImageNormalizeParam::__FIELDS__()) -.set_body([]() { - return new PrefetcherIter( - new BatchLoader( - new ImageNormalizeIter( - new SFrameImageIter()))); + .describe("Naive SFrame image iterator prototype") + .add_arguments(SFrameParam::__FIELDS__()) + .add_arguments(BatchParam::__FIELDS__()) + .add_arguments(PrefetcherParam::__FIELDS__()) + .add_arguments(ImageAugmentParam::__FIELDS__()) + .add_arguments(ImageNormalizeParam::__FIELDS__()) + .set_body([]() { + return new PrefetcherIter(new BatchLoader(new ImageNormalizeIter(new SFrameImageIter()))); }); MXNET_REGISTER_IO_ITER(SFrameDataIter) -.describe("Naive SFrame data iterator prototype") -.add_arguments(SFrameParam::__FIELDS__()) -.add_arguments(BatchParam::__FIELDS__()) -.add_arguments(PrefetcherParam::__FIELDS__()) -.set_body([]() { - return new PrefetcherIter( - new BatchLoader( - new SFrameDataIter())); - }); - + .describe("Naive SFrame data iterator prototype") + .add_arguments(SFrameParam::__FIELDS__()) + .add_arguments(BatchParam::__FIELDS__()) + .add_arguments(PrefetcherParam::__FIELDS__()) + .set_body([]() { return new PrefetcherIter(new BatchLoader(new SFrameDataIter())); }); } // namespace io } // namespace mxnet - diff --git a/plugin/torch/torch_base.cc b/plugin/torch/torch_base.cc index 8a9d85b06465..0dcb73d29a07 100644 --- a/plugin/torch/torch_base.cc +++ b/plugin/torch/torch_base.cc @@ -22,7 +22,7 @@ * \file torch_base.cc * \brief torch_state * \author Junyuan Xie -*/ + */ #include "./torch_base.h" namespace mxnet { @@ -40,7 +40,7 @@ TorchState::TorchState() { "require 'cudnn'\n" #endif // MXNET_USE_CUDNN #endif // MXNET_USE_CUDA - ); // NOLINT(*) + ); // NOLINT(*) int err = lua_pcall(L, 0, 0, 0); CHECK_EQ(err, 0) << lua_tostring(L, -1); } @@ -53,13 +53,13 @@ TorchState* TorchState::ThreadSharedLuaState() { return state; } -template<> +template <> void TorchState::SetStream(mshadow::Stream* s) { return; } #if MXNET_USE_CUDA -template<> +template <> void TorchState::SetStream(mshadow::Stream* s) { CudaState()->currentStream = mshadow::Stream::GetStream(s); } diff --git a/plugin/torch/torch_base.h b/plugin/torch/torch_base.h index 04bee24974bf..9c573daa70e7 100644 --- a/plugin/torch/torch_base.h +++ b/plugin/torch/torch_base.h @@ -65,26 +65,26 @@ class TorchState 
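// The set_body() lambdas above compose iterators like decorators: the
// innermost source yields one DataInst at a time and every wrapper adds a
// stage, so the registered image pipeline
//
//   new PrefetcherIter(              // runs the chain on a background thread
//       new BatchLoader(             // packs instances into batches
//           new ImageNormalizeIter(  // per-image mean/scale adjustment
//               new SFrameImageIter())))   // raw SFrame source
//
// reads: decode, normalize, batch, prefetch.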
{ } #endif // MXNET_USE_CUDA - template + template void SetStream(mshadow::Stream* s); void PrintState() { int i; int top = lua_gettop(L); LOG(INFO) << "Stack height: " << top; - for (i = 1; i <= top; i++) { /* repeat for each level */ + for (i = 1; i <= top; i++) { /* repeat for each level */ int t = lua_type(L, i); switch (t) { - case LUA_TSTRING: /* strings */ + case LUA_TSTRING: /* strings */ LOG(INFO) << i << ": '" << lua_tostring(L, i) << "'"; break; - case LUA_TBOOLEAN: /* booleans */ + case LUA_TBOOLEAN: /* booleans */ LOG(INFO) << i << ": " << (lua_toboolean(L, i) ? "true" : "false"); break; - case LUA_TNUMBER: /* numbers */ + case LUA_TNUMBER: /* numbers */ LOG(INFO) << i << ": " << lua_tonumber(L, i); break; - default: /* other values */ + default: /* other values */ LOG(INFO) << i << ": " << lua_typename(L, t); break; } @@ -152,7 +152,7 @@ class TorchTensor { } static THGeneralTensor TBlobToTHTensor(TorchState* torchState, TBlob data) { - size_t size = data.Size(); + size_t size = data.Size(); THGeneralTensor tensor = NULL; THLongStorage* thshape = THLongStorage_newWithSize(data.ndim()); for (int i = 0; i < data.ndim(); ++i) { @@ -161,8 +161,8 @@ class TorchTensor { CHECK_EQ(data.type_flag_, mshadow::kFloat32) << "Torch Interface only support float32"; switch (data.dev_mask()) { case cpu::kDevMask: { - THFloatStorage* storage = THFloatStorage_newWithData(static_cast(data.dptr_), - size); + THFloatStorage* storage = + THFloatStorage_newWithData(static_cast(data.dptr_), size); THFloatStorage_clearFlag(storage, TH_STORAGE_FREEMEM); tensor = (THGeneralTensor)THFloatTensor_newWithStorage(storage, 0, thshape, NULL); THFloatStorage_free(storage); @@ -171,8 +171,8 @@ class TorchTensor { #if MXNET_USE_CUDA case gpu::kDevMask: { THCState* state = torchState->CudaState(); - THCudaStorage* storage = THCudaStorage_newWithData(state, static_cast(data.dptr_), - size); + THCudaStorage* storage = + THCudaStorage_newWithData(state, static_cast(data.dptr_), size); // a bug in cutorch THFloatStorage_clearFlag(reinterpret_cast(storage), TH_STORAGE_FREEMEM); tensor = (THGeneralTensor)THCudaTensor_newWithStorage(state, storage, 0, thshape, NULL); @@ -197,7 +197,7 @@ class TorchTensor { } #if MXNET_USE_CUDA case gpu::kDevMask: { - THCState* state = torchState->CudaState(); + THCState* state = torchState->CudaState(); THCudaStorage* original = static_cast(tensor)->storage; THCudaStorage_free(state, original); break; @@ -212,10 +212,10 @@ class TorchTensor { size_t size = blob.Size(); switch (blob.dev_mask()) { case cpu::kDevMask: { - THFloatStorage* storage = THFloatStorage_newWithData(static_cast(blob.dptr_), - size); + THFloatStorage* storage = + THFloatStorage_newWithData(static_cast(blob.dptr_), size); THFloatStorage_clearFlag(storage, TH_STORAGE_FREEMEM); - THFloatStorage* original = static_cast(tensor)->storage; + THFloatStorage* original = static_cast(tensor)->storage; static_cast(tensor)->storage = storage; THFloatStorage_free(original); break; @@ -223,12 +223,11 @@ class TorchTensor { #if MXNET_USE_CUDA case gpu::kDevMask: { THCState* state = torchState->CudaState(); - THCudaStorage* storage = THCudaStorage_newWithData(state, - static_cast(blob.dptr_), - size); + THCudaStorage* storage = + THCudaStorage_newWithData(state, static_cast(blob.dptr_), size); // TODO(min): torch bug Cuda version not implemented THFloatStorage_clearFlag(reinterpret_cast(storage), TH_STORAGE_FREEMEM); - THCudaStorage* original = static_cast(tensor)->storage; + THCudaStorage* original = static_cast(tensor)->storage; 
static_cast(tensor)->storage = storage; THCudaStorage_free(state, original); break; @@ -240,9 +239,9 @@ class TorchTensor { } static std::vector TBlobVectorAsTable( - TorchState* torchState, - const std::vector::const_iterator begin, - const std::vector::const_iterator end) { + TorchState* torchState, + const std::vector::const_iterator begin, + const std::vector::const_iterator end) { lua_State* L = torchState->L; std::vector res; int num = end - begin; @@ -269,16 +268,16 @@ class TorchTensor { lua_State* L = torchState->L; if (luaT_isudata(L, -1, TorchTensor::TensorType(cpu::kDevMask))) { CHECK_EQ(dst.dev_mask(), cpu::kDevMask) << "Device type mismatch."; - THFloatTensor* src = static_cast( - luaT_toudata(L, -1, TorchTensor::TensorType(cpu::kDevMask))); + THFloatTensor* src = + static_cast(luaT_toudata(L, -1, TorchTensor::TensorType(cpu::kDevMask))); if (src->storage != static_cast(th_dst)->storage) { THFloatTensor_copy(static_cast(th_dst), src); } #if MXNET_USE_CUDA } else if (luaT_isudata(L, -1, TorchTensor::TensorType(gpu::kDevMask))) { CHECK_EQ(dst.dev_mask(), gpu::kDevMask) << "Device type mismatch."; - THCudaTensor* src = static_cast( - luaT_toudata(L, -1, TorchTensor::TensorType(gpu::kDevMask))); + THCudaTensor* src = + static_cast(luaT_toudata(L, -1, TorchTensor::TensorType(gpu::kDevMask))); if (src->storage != static_cast(th_dst)->storage) { THCudaTensor_copy(torchState->CudaState(), static_cast(th_dst), src); } @@ -294,7 +293,7 @@ class TorchTensor { std::vector::const_iterator th_begin, std::vector::const_iterator th_end) { lua_State* L = torchState->L; - int num = end - begin; + int num = end - begin; CHECK_EQ(th_end - th_begin, num); if (num == 0) { } else if (num == 1) { diff --git a/plugin/torch/torch_criterion-inl.h b/plugin/torch/torch_criterion-inl.h index 2138bd8f1335..c77fc5ab6e47 100644 --- a/plugin/torch/torch_criterion-inl.h +++ b/plugin/torch/torch_criterion-inl.h @@ -22,7 +22,7 @@ * \file torch_module-inl.h * \brief torch module operator * \author Min Lin -*/ + */ #ifndef PLUGIN_TORCH_TORCH_CRITERION_INL_H_ #define PLUGIN_TORCH_TORCH_CRITERION_INL_H_ @@ -46,14 +46,14 @@ struct TorchCriterionParam : public dmlc::Parameter { float grad_scale; DMLC_DECLARE_PARAMETER(TorchCriterionParam) { DMLC_DECLARE_FIELD(lua_string) - .describe("lua string that is called to generate the torch criterion object"); + .describe("lua string that is called to generate the torch criterion object"); DMLC_DECLARE_FIELD(label_shape) - .set_default(mxnet::TShape()) - .enforce_nonzero() - .describe("Shape of label (without batch size)."); + .set_default(mxnet::TShape()) + .enforce_nonzero() + .describe("Shape of label (without batch size)."); DMLC_DECLARE_FIELD(grad_scale) - .set_default(1.0f) - .describe("Scale the gradient by a float factor (a.k.a weight of this loss)."); + .set_default(1.0f) + .describe("Scale the gradient by a float factor (a.k.a weight of this loss)."); } }; @@ -61,7 +61,7 @@ struct TorchCriterionParam : public dmlc::Parameter { * \brief This is the implementation of activation operator. * \tparam xpu The device that the op will be executed on. 
*/ -template +template class TorchCriterionOp : public Operator { private: TorchCriterionParam param_; @@ -70,12 +70,12 @@ class TorchCriterionOp : public Operator { public: explicit TorchCriterionOp(TorchCriterionParam p) { - this->param_ = p; + this->param_ = p; this->torchState_ = new TorchState(); - lua_State *L = torchState_->L; + lua_State* L = torchState_->L; CHECK_EQ(lua_gettop(L), 0); - std::string exec = std::string("return ") + p.lua_string - + TorchTensor::ModuleType(xpu::kDevMask); + std::string exec = + std::string("return ") + p.lua_string + TorchTensor::ModuleType(xpu::kDevMask); CHECK_EQ(luaL_loadstring(L, exec.c_str()), 0); int err = lua_pcall(L, 0, 1, 0); CHECK_EQ(err, 0) << lua_tostring(L, -1); @@ -87,17 +87,17 @@ class TorchCriterionOp : public Operator { delete this->torchState_; } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + virtual void Forward(const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data, + const std::vector& aux_args) { using namespace mshadow; - lua_State *L = torchState_->L; + lua_State* L = torchState_->L; CHECK_EQ(lua_gettop(L), 0); CHECK_EQ(in_data.size(), 2); CHECK_EQ(out_data.size(), 1); - Stream *s = ctx.get_stream(); + Stream* s = ctx.get_stream(); torchState_->SetStream(s); lua_rawgeti(L, LUA_REGISTRYINDEX, lua_reference_); // call forward @@ -117,26 +117,26 @@ class TorchCriterionOp : public Operator { real_t loss = static_cast(lua_tonumber(L, -1)); lua_pop(L, 1); Tensor out = out_data[0].FlatTo2D(s); - Assign(out, req[0], loss*param_.grad_scale); + Assign(out, req[0], loss * param_.grad_scale); lua_pop(L, 1); CHECK_EQ(lua_gettop(L), 0); } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + virtual void Backward(const OpContext& ctx, + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data, + const std::vector& req, + const std::vector& in_grad, + const std::vector& aux_args) { using namespace mshadow; - lua_State *L = torchState_->L; + lua_State* L = torchState_->L; CHECK_EQ(lua_gettop(L), 0); CHECK_EQ(in_data.size(), 2); CHECK_EQ(out_data.size(), 1); CHECK_EQ(req[0], kWriteTo) << "Torch Criterion only supports write to in_grad"; CHECK_EQ(req[1], kNullOp) << "Torch Criterion cannot back prop to label"; - Stream *s = ctx.get_stream(); + Stream* s = ctx.get_stream(); torchState_->SetStream(s); lua_rawgeti(L, LUA_REGISTRYINDEX, lua_reference_); THGeneralTensor th = TorchTensor::TBlobToTHTensor(torchState_, in_grad[0]); @@ -161,7 +161,7 @@ class TorchCriterionOp : public Operator { }; // class TorchCriterionOp // Decalre Factory function, used for dispatch specialization -template +template Operator* CreateOp(TorchCriterionParam type); #if DMLC_USE_CXX11 @@ -183,17 +183,19 @@ class TorchCriterionProp : public OperatorProperty { return param_.__DICT__(); } - bool InferShape(mxnet::ShapeVector *in_shape, - mxnet::ShapeVector *out_shape, - mxnet::ShapeVector *aux_shape) const override { + bool InferShape(mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape, + mxnet::ShapeVector* aux_shape) const override { using namespace mshadow; CHECK_EQ(in_shape->size(), 2); - const mxnet::TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; + const 
mxnet::TShape& dshape = in_shape->at(0); + if (dshape.ndim() == 0) + return false; std::vector lshape; lshape.push_back(dshape[0]); - lshape.insert(lshape.end(), param_.label_shape.data(), - param_.label_shape.data() + param_.label_shape.ndim()); + lshape.insert(lshape.end(), + param_.label_shape.data(), + param_.label_shape.data() + param_.label_shape.ndim()); mxnet::TShape shape(lshape.begin(), lshape.end()); SHAPE_ASSIGN_CHECK(*in_shape, 1, shape); out_shape->clear(); @@ -202,7 +204,7 @@ class TorchCriterionProp : public OperatorProperty { } OperatorProperty* Copy() const override { - auto ptr = new TorchCriterionProp(); + auto ptr = new TorchCriterionProp(); ptr->param_ = param_; return ptr; } @@ -212,10 +214,9 @@ class TorchCriterionProp : public OperatorProperty { } // decalre dependency and inplace optimization options - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { + std::vector DeclareBackwardDependency(const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data) const override { std::vector dep; dep.insert(dep.end(), in_data.begin(), in_data.end()); // Ensure that the backward and forward cannot be called at the same time diff --git a/plugin/torch/torch_criterion.cc b/plugin/torch/torch_criterion.cc index 110a58156a26..7b46a725b618 100644 --- a/plugin/torch/torch_criterion.cc +++ b/plugin/torch/torch_criterion.cc @@ -22,27 +22,27 @@ * \file activation.cc * \brief activation op * \author Junyuan Xie -*/ + */ #include "./torch_criterion-inl.h" #include "../../src/operator/mshadow_op.h" namespace mxnet { namespace op { -template<> -Operator *CreateOp(TorchCriterionParam param) { +template <> +Operator* CreateOp(TorchCriterionParam param) { return new TorchCriterionOp(param); } // DO_BIND_DISPATCH comes from operator_common.h -Operator *TorchCriterionProp::CreateOperator(Context ctx) const { +Operator* TorchCriterionProp::CreateOperator(Context ctx) const { DO_BIND_DISPATCH(CreateOp, param_); } DMLC_REGISTER_PARAMETER(TorchCriterionParam); MXNET_REGISTER_OP_PROPERTY(TorchCriterion, TorchCriterionProp) -.describe("Criterions from torch.") -.add_arguments(TorchCriterionParam::__FIELDS__()); + .describe("Criterions from torch.") + .add_arguments(TorchCriterionParam::__FIELDS__()); } // namespace op } // namespace mxnet diff --git a/plugin/torch/torch_criterion.cu b/plugin/torch/torch_criterion.cu index ccb7145f36af..0b22722d756a 100644 --- a/plugin/torch/torch_criterion.cu +++ b/plugin/torch/torch_criterion.cu @@ -22,14 +22,14 @@ * \file activation.cc * \brief activation op * \author Bing Xu -*/ + */ #include "./torch_criterion-inl.h" #include "../../src/operator/mshadow_op.h" namespace mxnet { namespace op { -template<> -Operator *CreateOp(TorchCriterionParam param) { +template <> +Operator* CreateOp(TorchCriterionParam param) { return new TorchCriterionOp(param); } diff --git a/plugin/torch/torch_function.cc b/plugin/torch/torch_function.cc index 3ec9a000acfd..bb802ce67e7a 100644 --- a/plugin/torch/torch_function.cc +++ b/plugin/torch/torch_function.cc @@ -22,7 +22,7 @@ * \file torch_base.cc * \brief torch_state * \author Junyuan Xie -*/ + */ #include "./torch_function.h" namespace mxnet { @@ -51,8 +51,10 @@ MXNET_REGISTER_TORCH_UNARY_FUN(_th_floor, floor); MXNET_REGISTER_TORCH_UNARY_FUN(_th_log, log); MXNET_REGISTER_TORCH_UNARY_FUN(_th_log1p, log1p); MXNET_REGISTER_TORCH_UNARY_FUN(_th_pow, pow) -.add_argument("n", "float", "pow(x, n) returns x^n, 
element-wise. " - "pow(n, x) returns n^x, element-wise."); + .add_argument("n", + "float", + "pow(x, n) returns x^n, element-wise. " + "pow(n, x) returns n^x, element-wise."); MXNET_REGISTER_TORCH_UNARY_FUN(_th_round, round); MXNET_REGISTER_TORCH_UNARY_FUN(_th_sin, sin); MXNET_REGISTER_TORCH_UNARY_FUN(_th_sinh, sinh); @@ -62,7 +64,7 @@ MXNET_REGISTER_TORCH_UNARY_FUN(_th_tanh, tanh); // Basic operations MXNET_REGISTER_TORCH_UNARY_FUN(_th_add_scalar, add) -.add_argument("value", "float", "Add value to all elements in x"); + .add_argument("value", "float", "Add value to all elements in x"); MXNET_REGISTER_TORCH_BINARY_FUN_WITH_ARG(_th_add, add); MXNET_REGISTER_TORCH_BINARY_FUN(_th_add_axpy, add); @@ -70,7 +72,7 @@ MXNET_REGISTER_TORCH_BINARY_FUN(_th_add_axpy, add); // MXNET_REGISTER_TORCH_BINARY_FUN_WITH_ARG(_th_csub, csub); MXNET_REGISTER_TORCH_UNARY_FUN(_th_mul_scalar, mul) -.add_argument("value", "float", "Multiply value to all elements in x"); + .add_argument("value", "float", "Multiply value to all elements in x"); MXNET_REGISTER_TORCH_BINARY_FUN_WITH_ARG(_th_cmul, cmul); MXNET_REGISTER_TORCH_UNARY_FUN(_th_clamp, clamp); @@ -78,7 +80,7 @@ MXNET_REGISTER_TORCH_BINARY_FUN_WITH_ARG(_th_cpow, cpow); MXNET_REGISTER_TORCH_TENARY_FUN(_th_addcmul, addcmul); MXNET_REGISTER_TORCH_UNARY_FUN(_th_div_scalar, div) -.add_argument("value", "float", "Divide all elements in x by value"); + .add_argument("value", "float", "Divide all elements in x by value"); MXNET_REGISTER_TORCH_BINARY_FUN_WITH_ARG(_th_cdiv, cdiv); MXNET_REGISTER_TORCH_TENARY_FUN(_th_addcdiv, addcdiv); @@ -89,67 +91,66 @@ MXNET_REGISTER_TORCH_TENARY_FUN(_th_addbmm, addbmm); MXNET_REGISTER_TORCH_TENARY_FUN(_th_baddbmm, baddbmm); struct TorchMMShape { - static std::vector GetShape(NDArray **u, - const std::map& param) { + static std::vector GetShape(NDArray** u, + const std::map& param) { CHECK_EQ(u[0]->shape().ndim(), 2); CHECK_EQ(u[1]->shape().ndim(), 2); CHECK_EQ(u[0]->shape()[1], u[1]->shape()[0]); index_t shape[] = {u[0]->shape()[0], u[1]->shape()[1]}; - mshadow::TShape tshape(shape, shape+2); + mshadow::TShape tshape(shape, shape + 2); return {tshape}; } static constexpr const char* fname = "mm"; - static const int num_inputs = 2; - static const int num_outputs = 1; + static const int num_inputs = 2; + static const int num_outputs = 1; }; MXNET_REGISTER_TORCH_FUN(_th_mm, TorchMMShape); struct TorchMVShape { - static std::vector GetShape(NDArray **u, - const std::map& param) { + static std::vector GetShape(NDArray** u, + const std::map& param) { CHECK_EQ(u[0]->shape().ndim(), 2); CHECK_EQ(u[1]->shape().ndim(), 1); CHECK_EQ(u[0]->shape()[1], u[1]->shape()[0]); index_t shape[] = {u[0]->shape()[0]}; - mshadow::TShape tshape(shape, shape+1); + mshadow::TShape tshape(shape, shape + 1); return {tshape}; } static constexpr const char* fname = "mv"; - static const int num_inputs = 2; - static const int num_outputs = 1; + static const int num_inputs = 2; + static const int num_outputs = 1; }; MXNET_REGISTER_TORCH_FUN(_th_mv, TorchMVShape); - struct TorchBMMShape { - static std::vector GetShape(NDArray **u, - const std::map& param) { + static std::vector GetShape(NDArray** u, + const std::map& param) { CHECK_EQ(u[0]->shape().ndim(), 3); CHECK_EQ(u[1]->shape().ndim(), 3); CHECK_EQ(u[0]->shape()[0], u[1]->shape()[0]); CHECK_EQ(u[0]->shape()[2], u[1]->shape()[1]); index_t shape[] = {u[0]->shape()[1], u[1]->shape()[2]}; - mshadow::TShape tshape(shape, shape+2); + mshadow::TShape tshape(shape, shape + 2); return {tshape}; } static constexpr const 
char* fname = "bmm"; - static const int num_inputs = 2; - static const int num_outputs = 1; + static const int num_inputs = 2; + static const int num_outputs = 1; }; MXNET_REGISTER_TORCH_FUN(_th_bmm, TorchBMMShape); struct TorchGERShape { - static std::vector GetShape(NDArray **u, - const std::map& param) { + static std::vector GetShape(NDArray** u, + const std::map& param) { CHECK_EQ(u[0]->shape().ndim(), 1); CHECK_EQ(u[1]->shape().ndim(), 1); index_t shape[] = {u[0]->shape()[0], u[1]->shape()[0]}; - mshadow::TShape tshape(shape, shape+2); + mshadow::TShape tshape(shape, shape + 2); return {tshape}; } static constexpr const char* fname = "ger"; - static const int num_inputs = 2; - static const int num_outputs = 1; + static const int num_inputs = 2; + static const int num_outputs = 1; }; MXNET_REGISTER_TORCH_FUN(_th_ger, TorchGERShape); diff --git a/plugin/torch/torch_function.h b/plugin/torch/torch_function.h index f6f760231bdf..32917cf7f39c 100644 --- a/plugin/torch/torch_function.h +++ b/plugin/torch/torch_function.h @@ -37,7 +37,7 @@ namespace mxnet { -template +template void TorchRunOp(std::vector arr_in, std::vector arr_out, const std::map& param, @@ -84,16 +84,17 @@ void TorchRunOp(std::vector arr_in, CHECK_EQ(lua_pcall(L, format.size(), 0, 0), 0) << "Lua Error: " << lua_tostring(L, -1); } -template -void TorchOp(NDArray **u, real_t *s, NDArray **out, +template +void TorchOp(NDArray** u, + real_t* s, + NDArray** out, const std::map& param) { std::vector shapes = OP::GetShape(u, param); - CHECK_EQ(shapes.size(), OP::num_outputs) - << "Too many output shapes for TorchOp " << OP::fname; + CHECK_EQ(shapes.size(), OP::num_outputs) << "Too many output shapes for TorchOp " << OP::fname; Context ctx; int type_flag; if (OP::num_inputs) { - ctx = u[0]->ctx(); + ctx = u[0]->ctx(); type_flag = u[0]->dtype(); for (int i = 0; i < OP::num_inputs; ++i) { CHECK_EQ(ctx, u[i]->ctx()) << "Context of all oprands must be the same."; @@ -138,37 +139,49 @@ void TorchOp(NDArray **u, real_t *s, NDArray **out, var_in.resize(std::unique(var_in.begin(), var_in.end()) - var_in.begin()); std::sort(var_out.begin(), var_out.end()); var_out.resize(std::unique(var_out.begin(), var_out.end()) - var_out.begin()); - std::set_difference(var_in.begin(), var_in.end(), var_out.begin(), var_out.end(), + std::set_difference(var_in.begin(), + var_in.end(), + var_out.begin(), + var_out.end(), std::inserter(var_const, var_const.begin())); switch (ctx.dev_mask()) { case mshadow::cpu::kDevMask: { - Engine::Get()->PushSync([arr_in, arr_out, param](RunContext rctx) { - TorchRunOp(arr_in, arr_out, param, rctx); - }, ctx, var_const, var_out); + Engine::Get()->PushSync( + [arr_in, arr_out, param](RunContext rctx) { + TorchRunOp(arr_in, arr_out, param, rctx); + }, + ctx, + var_const, + var_out); break; } #if MXNET_USE_CUDA case gpu::kDevMask: { - Engine::Get()->PushSync([arr_in, arr_out, param](RunContext rctx) { - TorchRunOp(arr_in, arr_out, param, rctx); - }, ctx, var_const, var_out); + Engine::Get()->PushSync( + [arr_in, arr_out, param](RunContext rctx) { + TorchRunOp(arr_in, arr_out, param, rctx); + }, + ctx, + var_const, + var_out); break; } #endif - default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; + default: + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; } } struct TorchFirstShape { - static std::vector GetShape(NDArray **u, - const std::map& param) { + static std::vector GetShape(NDArray** u, + const std::map& param) { return {u[0]->shape()}; } }; struct TorchConstructorShape { - static std::vector GetShape(NDArray **u, - const 
std::map& param) { + static std::vector GetShape(NDArray** u, + const std::map& param) { std::vector shape; std::string format = param.at("format"); std::istringstream args(param.at("args")); @@ -183,53 +196,52 @@ struct TorchConstructorShape { mshadow::TShape tshape(shape.begin(), shape.end()); return {tshape}; } - static const int num_inputs = 0; + static const int num_inputs = 0; static const int num_outputs = 1; }; -#define MXNET_REGISTER_TORCH_FUN(name, OP) \ - MXNET_REGISTER_NDARRAY_FUN(name) \ - .set_function(TorchOp) \ - .set_num_use_vars(OP::num_inputs) \ - .set_num_mutate_vars(OP::num_outputs) \ - .set_type_mask(kAcceptEmptyMutateTarget) - -#define MXNET_REGISTER_TORCH_UNARY_FUN(name, func) \ - struct TorchUnaryOpDesc_ ## name ## _ ## func : public TorchFirstShape { \ - static constexpr const char* fname = #func; \ - static const int num_inputs = 1; \ - static const int num_outputs = 1; \ - }; \ - MXNET_REGISTER_TORCH_FUN(name, TorchUnaryOpDesc_ ## name ## _ ## func) \ - .add_argument("x", "NDArray", "Input NDArray") - -#define MXNET_REGISTER_TORCH_BINARY_FUN(name, func) \ - struct TorchBinaryOpDesc_ ## name ## _ ## func : public TorchFirstShape { \ - static constexpr const char* fname = #func; \ - static const int num_inputs = 2; \ - static const int num_outputs = 1; \ - }; \ - MXNET_REGISTER_TORCH_FUN(name, TorchBinaryOpDesc_ ## name ## _ ## func) - -#define MXNET_REGISTER_TORCH_BINARY_FUN_WITH_ARG(name, func) \ - MXNET_REGISTER_TORCH_BINARY_FUN(name, func) \ - .add_argument("x1", "NDArray", "First Input NDArray") \ - .add_argument("x2", "NDArray", "Second Input NDArray") - -#define MXNET_REGISTER_TORCH_TENARY_FUN(name, func) \ - struct TorchTenaryOpDesc_ ## name ## _ ## func : public TorchFirstShape { \ - static constexpr const char* fname = #func; \ - static const int num_inputs = 3; \ - static const int num_outputs = 1; \ - }; \ - MXNET_REGISTER_TORCH_FUN(name, TorchTenaryOpDesc_ ## name ## _ ## func) - -#define MXNET_REGISTER_TORCH_CONSTRUCTOR_FUN(name, func) \ - struct TorchConstructorOpDesc_ ## name ## _ ## func : public TorchConstructorShape { \ - static constexpr const char* fname = #func; \ - }; \ - MXNET_REGISTER_TORCH_FUN(name, TorchConstructorOpDesc_ ## name ## _ ## func) - +#define MXNET_REGISTER_TORCH_FUN(name, OP) \ + MXNET_REGISTER_NDARRAY_FUN(name) \ + .set_function(TorchOp) \ + .set_num_use_vars(OP::num_inputs) \ + .set_num_mutate_vars(OP::num_outputs) \ + .set_type_mask(kAcceptEmptyMutateTarget) + +#define MXNET_REGISTER_TORCH_UNARY_FUN(name, func) \ + struct TorchUnaryOpDesc_##name##_##func : public TorchFirstShape { \ + static constexpr const char* fname = #func; \ + static const int num_inputs = 1; \ + static const int num_outputs = 1; \ + }; \ + MXNET_REGISTER_TORCH_FUN(name, TorchUnaryOpDesc_##name##_##func) \ + .add_argument("x", "NDArray", "Input NDArray") + +#define MXNET_REGISTER_TORCH_BINARY_FUN(name, func) \ + struct TorchBinaryOpDesc_##name##_##func : public TorchFirstShape { \ + static constexpr const char* fname = #func; \ + static const int num_inputs = 2; \ + static const int num_outputs = 1; \ + }; \ + MXNET_REGISTER_TORCH_FUN(name, TorchBinaryOpDesc_##name##_##func) + +#define MXNET_REGISTER_TORCH_BINARY_FUN_WITH_ARG(name, func) \ + MXNET_REGISTER_TORCH_BINARY_FUN(name, func) \ + .add_argument("x1", "NDArray", "First Input NDArray") \ + .add_argument("x2", "NDArray", "Second Input NDArray") + +#define MXNET_REGISTER_TORCH_TENARY_FUN(name, func) \ + struct TorchTenaryOpDesc_##name##_##func : public TorchFirstShape { \ + static constexpr const 
char* fname = #func; \ + static const int num_inputs = 3; \ + static const int num_outputs = 1; \ + }; \ + MXNET_REGISTER_TORCH_FUN(name, TorchTenaryOpDesc_##name##_##func) + +#define MXNET_REGISTER_TORCH_CONSTRUCTOR_FUN(name, func) \ + struct TorchConstructorOpDesc_##name##_##func : public TorchConstructorShape { \ + static constexpr const char* fname = #func; \ + }; \ + MXNET_REGISTER_TORCH_FUN(name, TorchConstructorOpDesc_##name##_##func) } // namespace mxnet #endif // PLUGIN_TORCH_TORCH_FUNCTION_H_ diff --git a/plugin/torch/torch_module-inl.h b/plugin/torch/torch_module-inl.h index 386f0e31fb43..ef13493ba56b 100644 --- a/plugin/torch/torch_module-inl.h +++ b/plugin/torch/torch_module-inl.h @@ -22,7 +22,7 @@ * \file torch_module-inl.h * \brief torch module operator * \author Min Lin -*/ + */ #ifndef PLUGIN_TORCH_TORCH_MODULE_INL_H_ #define PLUGIN_TORCH_TORCH_MODULE_INL_H_ @@ -47,13 +47,10 @@ struct TorchModuleParam : public dmlc::Parameter { uint32_t num_outputs; DMLC_DECLARE_PARAMETER(TorchModuleParam) { DMLC_DECLARE_FIELD(lua_string) - .describe("lua string that is called to generate the torch module object"); - DMLC_DECLARE_FIELD(num_data) - .describe("the number of input data"); - DMLC_DECLARE_FIELD(num_params) - .describe("the number of parameters"); - DMLC_DECLARE_FIELD(num_outputs) - .describe("the number of outputs"); + .describe("lua string that is called to generate the torch module object"); + DMLC_DECLARE_FIELD(num_data).describe("the number of input data"); + DMLC_DECLARE_FIELD(num_params).describe("the number of parameters"); + DMLC_DECLARE_FIELD(num_outputs).describe("the number of outputs"); } }; @@ -61,7 +58,7 @@ struct TorchModuleParam : public dmlc::Parameter { * \brief This is the implementation of activation operator. * \tparam xpu The device that the op will be executed on. 
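 * \par Example
 * A hypothetical parameter set; the field names come from TorchModuleParam
 * above. nn.Linear(10, 5) exposes a weight and a bias, hence num_params = 2.
 * \code
 * TorchModuleProp prop;
 * prop.Init({{"lua_string", "nn.Linear(10, 5)"},
 *            {"num_data", "1"},
 *            {"num_params", "2"},
 *            {"num_outputs", "1"}});
 * \endcode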
*/ -template +template class TorchModuleOp : public Operator { private: TorchModuleParam param_; @@ -73,8 +70,8 @@ class TorchModuleOp : public Operator { this->param_ = p; lua_State* L = torchState_->L; CHECK_EQ(lua_gettop(L), 0); - std::string exec = std::string("return ") + p.lua_string - + TorchTensor::ModuleType(xpu::kDevMask); + std::string exec = + std::string("return ") + p.lua_string + TorchTensor::ModuleType(xpu::kDevMask); CHECK_EQ(luaL_loadstring(L, exec.c_str()), 0); int err = lua_pcall(L, 0, 1, 0); CHECK_EQ(err, 0) << lua_tostring(L, -1); @@ -111,25 +108,24 @@ class TorchModuleOp : public Operator { this->lua_reference_ = luaL_ref(L, LUA_REGISTRYINDEX); } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + virtual void Forward(const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data, + const std::vector& aux_args) { lua_State* L = torchState_->L; CHECK_EQ(lua_gettop(L), 0); CHECK_EQ(in_data.size(), param_.num_params + param_.num_data); CHECK_EQ(out_data.size(), param_.num_outputs); - mshadow::Stream *s = ctx.get_stream(); + mshadow::Stream* s = ctx.get_stream(); torchState_->SetStream(s); // Deserialize self table lua_rawgeti(L, LUA_REGISTRYINDEX, lua_reference_); - std::vector th_output = - TorchTensor::TBlobVectorAsTable(torchState_, out_data.begin(), - out_data.begin() + param_.num_outputs); + std::vector th_output = TorchTensor::TBlobVectorAsTable( + torchState_, out_data.begin(), out_data.begin() + param_.num_outputs); // set the output field lua_setfield(L, -2, "output"); // set the parameters @@ -157,38 +153,40 @@ class TorchModuleOp : public Operator { // | self | updateOutput lua_pushvalue(L, -2); // | self | updateOutput | self - TorchTensor::TBlobVectorAsTable(torchState_, in_data.begin(), - in_data.begin() + param_.num_data); + TorchTensor::TBlobVectorAsTable( + torchState_, in_data.begin(), in_data.begin() + param_.num_data); // | self | updateOutput | self | inputs int err = lua_pcall(L, 2, 1, 0); // doesn't need the output CHECK_EQ(err, 0) << lua_tostring(L, -1); - TorchTensor::CheckOutput(torchState_, out_data.begin(), out_data.begin() + param_.num_outputs, - th_output.begin(), th_output.end()); + TorchTensor::CheckOutput(torchState_, + out_data.begin(), + out_data.begin() + param_.num_outputs, + th_output.begin(), + th_output.end()); lua_pop(L, 2); CHECK_EQ(lua_gettop(L), 0); } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + virtual void Backward(const OpContext& ctx, + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data, + const std::vector& req, + const std::vector& in_grad, + const std::vector& aux_args) { lua_State* L = torchState_->L; CHECK_EQ(lua_gettop(L), 0); CHECK_EQ(in_data.size(), param_.num_params + param_.num_data); CHECK_EQ(out_data.size(), param_.num_outputs); CHECK_EQ(out_grad.size(), param_.num_outputs); CHECK_EQ(in_grad.size(), param_.num_params + param_.num_data); - mshadow::Stream *s = ctx.get_stream(); + mshadow::Stream* s = ctx.get_stream(); torchState_->SetStream(s); lua_rawgeti(L, LUA_REGISTRYINDEX, lua_reference_); TorchTensor::TBlobVectorAsTable(torchState_, out_data.begin(), out_data.end()); lua_setfield(L, -2, "output"); - std::vector th_grad = - 
TorchTensor::TBlobVectorAsTable(torchState_, in_grad.begin(), - in_grad.begin() + param_.num_data); + std::vector th_grad = TorchTensor::TBlobVectorAsTable( + torchState_, in_grad.begin(), in_grad.begin() + param_.num_data); lua_setfield(L, -2, "gradInput"); if (param_.num_params != 0) { // get the parameters into the stack @@ -201,20 +199,21 @@ class TorchModuleOp : public Operator { std::vector::const_iterator it = in_data.begin() + param_.num_data; while (lua_next(L, -3)) { TorchTensor::SetInternal( - torchState_, - static_cast(luaT_toudata(L, -1, TorchTensor::TensorType(*it))), - *it); + torchState_, + static_cast(luaT_toudata(L, -1, TorchTensor::TensorType(*it))), + *it); it++; lua_pop(L, 1); } // iterate the grad of params lua_pushnil(L); - it = in_grad.begin() + param_.num_data;; + it = in_grad.begin() + param_.num_data; + ; while (lua_next(L, -2)) { TorchTensor::SetInternal( - torchState_, - static_cast(luaT_toudata(L, -1, TorchTensor::TensorType(*it))), - *it); + torchState_, + static_cast(luaT_toudata(L, -1, TorchTensor::TensorType(*it))), + *it); it++; lua_pop(L, 1); } @@ -223,8 +222,8 @@ class TorchModuleOp : public Operator { lua_getfield(L, -1, "zeroGradParameters"); lua_pushvalue(L, -2); CHECK_EQ(lua_pcall(L, 1, 0, 0), 0); - TorchTensor::TBlobVectorAsTable(torchState_, in_data.begin(), - in_data.begin() + param_.num_data); + TorchTensor::TBlobVectorAsTable( + torchState_, in_data.begin(), in_data.begin() + param_.num_data); TorchTensor::TBlobVectorAsTable(torchState_, out_grad.begin(), out_grad.end()); // call lua_getfield(L, -3, "accGradParameters"); @@ -240,15 +239,18 @@ class TorchModuleOp : public Operator { lua_pushvalue(L, -4); err = lua_pcall(L, 3, 1, 0); // doesn't need the output CHECK_EQ(err, 0) << lua_tostring(L, -1); - TorchTensor::CheckOutput(torchState_, in_grad.begin(), in_grad.begin() + param_.num_data, - th_grad.begin(), th_grad.end()); + TorchTensor::CheckOutput(torchState_, + in_grad.begin(), + in_grad.begin() + param_.num_data, + th_grad.begin(), + th_grad.end()); lua_pop(L, 4); CHECK_EQ(lua_gettop(L), 0); } }; // class TorchModuleOp // Declare Factory function, used for dispatch specialization -template +template Operator* CreateOp(TorchModuleParam type, TorchState* torchState); #if DMLC_USE_CXX11 @@ -260,8 +262,8 @@ class TorchModuleProp : public OperatorProperty { void InitTorchState() const { this->torchState_ = new TorchState(); - lua_State* L = torchState_->L; - std::string exec = std::string("return ") + param_.lua_string; + lua_State* L = torchState_->L; + std::string exec = std::string("return ") + param_.lua_string; CHECK_EQ(luaL_loadstring(L, exec.c_str()), 0); int err = lua_pcall(L, 0, LUA_MULTRET, 0); CHECK_EQ(lua_gettop(L), 1); @@ -277,8 +279,7 @@ class TorchModuleProp : public OperatorProperty { } public: - TorchModuleProp() : OperatorProperty(), torchState_(NULL), lua_reference_(-1) { - } + TorchModuleProp() : OperatorProperty(), torchState_(NULL), lua_reference_(-1) {} std::vector ListArguments() const override { if (!torchState_) { @@ -347,9 +348,9 @@ class TorchModuleProp : public OperatorProperty { return param_.__DICT__(); } - bool InferShape(mxnet::ShapeVector *in_shape, - mxnet::ShapeVector *out_shape, - mxnet::ShapeVector *aux_shape) const override { + bool InferShape(mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape, + mxnet::ShapeVector* aux_shape) const override { if (torchState_ == nullptr) { this->InitTorchState(); } @@ -394,9 +395,9 @@ class TorchModuleProp : public OperatorProperty { lua_pushnil(L); int index = 
param_.num_data; while (lua_next(L, -3)) { - THFloatTensor* param = reinterpret_cast(luaT_toudata(L, -1, - TorchTensor::TensorType(mshadow::cpu::kDevMask))); - long int* size = param->size; // NOLINT(*) + THFloatTensor* param = reinterpret_cast( + luaT_toudata(L, -1, TorchTensor::TensorType(mshadow::cpu::kDevMask))); + long int* size = param->size; // NOLINT(*) (*in_shape)[index++] = mxnet::TShape(size, size + THFloatTensor_nDimension(param)); lua_pop(L, 1); } @@ -405,18 +406,18 @@ class TorchModuleProp : public OperatorProperty { lua_getfield(L, -1, "output"); if (param_.num_outputs == 0) { } else if (param_.num_outputs == 1) { - THFloatTensor* output = reinterpret_cast(luaT_toudata(L, -1, - TorchTensor::TensorType(mshadow::cpu::kDevMask))); - long int* size = output->size; // NOLINT(*) + THFloatTensor* output = reinterpret_cast( + luaT_toudata(L, -1, TorchTensor::TensorType(mshadow::cpu::kDevMask))); + long int* size = output->size; // NOLINT(*) (*out_shape)[0] = mxnet::TShape(size, size + THFloatTensor_nDimension(output)); } else { for (uint32_t data_index = 0; data_index < param_.num_outputs; ++data_index) { lua_pushnil(L); int index = 0; while (lua_next(L, -2)) { - THFloatTensor* out = reinterpret_cast(luaT_toudata(L, -1, - TorchTensor::TensorType(mshadow::cpu::kDevMask))); - long int* size = out->size; // NOLINT(*) + THFloatTensor* out = reinterpret_cast( + luaT_toudata(L, -1, TorchTensor::TensorType(mshadow::cpu::kDevMask))); + long int* size = out->size; // NOLINT(*) (*out_shape)[index++] = mxnet::TShape(size, size + THFloatTensor_nDimension(out)); } } @@ -427,7 +428,7 @@ class TorchModuleProp : public OperatorProperty { } OperatorProperty* Copy() const override { - auto ptr = new TorchModuleProp(); + auto ptr = new TorchModuleProp(); ptr->param_ = param_; return ptr; } @@ -437,10 +438,9 @@ class TorchModuleProp : public OperatorProperty { } // decalre dependency and inplace optimization options - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { + std::vector DeclareBackwardDependency(const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data) const override { std::vector dep; dep.insert(dep.end(), out_grad.begin(), out_grad.end()); dep.insert(dep.end(), out_data.begin(), out_data.end()); diff --git a/plugin/torch/torch_module.cc b/plugin/torch/torch_module.cc index 4ab792c4dd58..ad6fa8a49d50 100644 --- a/plugin/torch/torch_module.cc +++ b/plugin/torch/torch_module.cc @@ -22,27 +22,27 @@ * \file activation.cc * \brief activation op * \author Bing Xu -*/ + */ #include "./torch_module-inl.h" #include "../../src/operator/mshadow_op.h" namespace mxnet { namespace op { -template<> -Operator *CreateOp(TorchModuleParam param, TorchState* torchState) { +template <> +Operator* CreateOp(TorchModuleParam param, TorchState* torchState) { return new TorchModuleOp(param, torchState); } // DO_BIND_DISPATCH comes from operator_common.h -Operator *TorchModuleProp::CreateOperator(Context ctx) const { +Operator* TorchModuleProp::CreateOperator(Context ctx) const { DO_BIND_DISPATCH(CreateOp, param_, torchState_); } DMLC_REGISTER_PARAMETER(TorchModuleParam); MXNET_REGISTER_OP_PROPERTY(TorchModule, TorchModuleProp) -.describe("Modules from torch.") -.add_arguments(TorchModuleParam::__FIELDS__()); + .describe("Modules from torch.") + .add_arguments(TorchModuleParam::__FIELDS__()); } // namespace op } // namespace mxnet diff --git a/plugin/torch/torch_module.cu 
b/plugin/torch/torch_module.cu index d743da5fd922..b6ac7f8cbd19 100644 --- a/plugin/torch/torch_module.cu +++ b/plugin/torch/torch_module.cu @@ -22,14 +22,14 @@ * \file activation.cc * \brief activation op * \author Bing Xu -*/ + */ #include "./torch_module-inl.h" #include "../../src/operator/mshadow_op.h" namespace mxnet { namespace op { -template<> -Operator *CreateOp(TorchModuleParam param, TorchState* torchState) { +template <> +Operator* CreateOp(TorchModuleParam param, TorchState* torchState) { return new TorchModuleOp(param, torchState); } diff --git a/plugin/warpctc/warpctc-inl.h b/plugin/warpctc/warpctc-inl.h index 9fcbedce74f1..dcc581765f62 100644 --- a/plugin/warpctc/warpctc-inl.h +++ b/plugin/warpctc/warpctc-inl.h @@ -22,7 +22,7 @@ * \file warpctc-inl.h * \brief warpctc operator * \author Liang Xiang -*/ + */ #ifndef PLUGIN_WARPCTC_WARPCTC_INL_H_ #define PLUGIN_WARPCTC_WARPCTC_INL_H_ @@ -43,25 +43,21 @@ namespace mxnet { namespace op { namespace warpctc_enum { - enum CTCOpInputs {kData, kLabel}; - enum CTCOpOutputs {kOut}; - enum CTCTemp {kTmp}; +enum CTCOpInputs { kData, kLabel }; +enum CTCOpOutputs { kOut }; +enum CTCTemp { kTmp }; } // namespace warpctc_enum struct WarpCTCParam : public dmlc::Parameter { int label_length; int input_length; DMLC_DECLARE_PARAMETER(WarpCTCParam) { - DMLC_DECLARE_FIELD(label_length) - .set_default(0) - .describe("Real label length"); - DMLC_DECLARE_FIELD(input_length) - .set_default(0) - .describe("Input length"); + DMLC_DECLARE_FIELD(label_length).set_default(0).describe("Real label length"); + DMLC_DECLARE_FIELD(input_length).set_default(0).describe("Input length"); } }; -template +template class WarpCTCOp : public Operator { private: WarpCTCParam param_; @@ -71,37 +67,37 @@ class WarpCTCOp : public Operator { this->param_ = p; } - ~WarpCTCOp() { - } + ~WarpCTCOp() {} inline void throw_on_error(ctcStatus_t status, const char* message) { if (status != CTC_STATUS_SUCCESS) { - throw std::runtime_error(message - + (", stat = " - + std::string(ctcGetStatusString(status)))); + throw std::runtime_error(message + (", stat = " + std::string(ctcGetStatusString(status)))); } } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + virtual void Forward(const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data, + const std::vector& aux_args) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(in_data.size(), 2) << "CTCOutput Input: [data, label]"; CHECK_EQ(out_data.size(), 1) << "CTCOutput Output: [output]"; - Stream *s = ctx.get_stream(); - TBlob data = in_data[warpctc_enum::kData]; - TBlob out = out_data[warpctc_enum::kOut]; + Stream* s = ctx.get_stream(); + TBlob data = in_data[warpctc_enum::kData]; + TBlob out = out_data[warpctc_enum::kOut]; Tensor data_tensor = data.FlatTo2D(s); - Tensor out_tensor = out.FlatTo2D(s); + Tensor out_tensor = out.FlatTo2D(s); Softmax(out_tensor, data_tensor); } - std::vector labelLengths(const int * flat_labels, int minibatch, - int size, int blank, int * total_length) { + std::vector labelLengths(const int* flat_labels, + int minibatch, + int size, + int blank, + int* total_length) { CHECK_EQ(param_.label_length * minibatch, size) << "label size should = label_length * minibatch"; std::vector ret(minibatch, 0); @@ -116,8 +112,7 @@ class WarpCTCOp : public Operator { return ret; } - void removeBlank(const int * flat_labels, int * cpu_labels, - int 
size, int blank) { + void removeBlank(const int* flat_labels, int* cpu_labels, int size, int blank) { int k = 0; for (int i = 0; i < size; i++) { if (flat_labels[i] != blank) { @@ -127,25 +122,25 @@ class WarpCTCOp : public Operator { } } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + virtual void Backward(const OpContext& ctx, + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data, + const std::vector& req, + const std::vector& in_grad, + const std::vector& aux_args) { using namespace mshadow; - Stream *s = ctx.get_stream(); - TBlob data = in_data[warpctc_enum::kData]; - TBlob label = in_data[warpctc_enum::kLabel]; + Stream* s = ctx.get_stream(); + TBlob data = in_data[warpctc_enum::kData]; + TBlob label = in_data[warpctc_enum::kLabel]; CHECK_EQ(data.shape_.ndim(), 2) << "input data shape should be 2 (t*n, p)"; - ctcOptions info; //please updated to latest baidu/warp-ctc NOLINT(*) + ctcOptions info; // please updated to latest baidu/warp-ctc NOLINT(*) if (data.dev_mask() == cpu::kDevMask) { - info.loc = CTC_CPU; + info.loc = CTC_CPU; info.num_threads = 1; } else if (data.dev_mask() == gpu::kDevMask) { #if MXNET_USE_CUDA - info.loc = CTC_GPU; + info.loc = CTC_GPU; info.stream = ctx.get_stream()->stream_; } else { #endif @@ -153,8 +148,8 @@ class WarpCTCOp : public Operator { } info.blank_label = 0; - int T = param_.input_length; - int minibatch = data.shape_[0] / T; + int T = param_.input_length; + int minibatch = data.shape_[0] / T; int alphabet_size = data.shape_[1]; std::vector input_lengths; for (int i = 0; i < minibatch; i++) { @@ -164,15 +159,16 @@ class WarpCTCOp : public Operator { #if MXNET_USE_CUDA cudaError_t cuda_status; #endif - float* activations = static_cast(data.dptr_); - int* flat_labels = static_cast(label.dptr_); + float* activations = static_cast(data.dptr_); + int* flat_labels = static_cast(label.dptr_); int* cpu_raw_labels = flat_labels; - float* grads = static_cast(in_grad[warpctc_enum::kData].dptr_); + float* grads = static_cast(in_grad[warpctc_enum::kData].dptr_); if (data.dev_mask() == gpu::kDevMask) { #if MXNET_USE_CUDA cpu_raw_labels = reinterpret_cast(malloc(sizeof(int) * label.Size())); - cuda_status = cudaMemcpyAsync(cpu_raw_labels, flat_labels, - label.Size()*sizeof(int), + cuda_status = cudaMemcpyAsync(cpu_raw_labels, + flat_labels, + label.Size() * sizeof(int), cudaMemcpyDeviceToHost, ctx.get_stream()->stream_); CHECK_EQ(cuda_status, cudaSuccess) << "cuda memcpy label error"; @@ -180,24 +176,22 @@ class WarpCTCOp : public Operator { } int total_label_length = 0; - std::vector label_lengths = labelLengths(cpu_raw_labels, - minibatch, - label.Size(), - 0, &total_label_length); - int* cpu_labels = reinterpret_cast( - malloc(sizeof(int) * total_label_length)); + std::vector label_lengths = + labelLengths(cpu_raw_labels, minibatch, label.Size(), 0, &total_label_length); + int* cpu_labels = reinterpret_cast(malloc(sizeof(int) * total_label_length)); removeBlank(cpu_raw_labels, cpu_labels, label.Size(), 0); size_t alloc_bytes; throw_on_error(get_workspace_size(label_lengths.data(), input_lengths.data(), alphabet_size, - input_lengths.size(), info, + input_lengths.size(), + info, &alloc_bytes), "Error: get_workspace_size in inf_test"); - Tensor ctc_workspace = ctx.requested[warpctc_enum::kTmp].get_space( - mshadow::Shape1(alloc_bytes), s); + Tensor 
ctc_workspace = + ctx.requested[warpctc_enum::kTmp].get_space(mshadow::Shape1(alloc_bytes), s); std::vector costs(minibatch); throw_on_error(compute_ctc_loss(activations, @@ -223,10 +217,9 @@ class WarpCTCOp : public Operator { } }; -template +template Operator* CreateOp(WarpCTCParam type); - #if DMLC_USE_CXX11 class WarpCTCProp : public OperatorProperty { public: @@ -238,8 +231,7 @@ class WarpCTCProp : public OperatorProperty { return {"output"}; } - void Init(const std::vector >& kwargs) - override { + void Init(const std::vector >& kwargs) override { param_.Init(kwargs); } @@ -247,13 +239,14 @@ class WarpCTCProp : public OperatorProperty { return param_.__DICT__(); } - bool InferShape(mxnet::ShapeVector *in_shape, - mxnet::ShapeVector *out_shape, - mxnet::ShapeVector *aux_shape) const override { + bool InferShape(mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape, + mxnet::ShapeVector* aux_shape) const override { using namespace mshadow; CHECK_EQ(in_shape->size(), 2) << "Input:[data, label]"; - const mxnet::TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; + const mxnet::TShape& dshape = in_shape->at(0); + if (dshape.ndim() == 0) + return false; mxnet::TShape label_shape(dshape.ndim() - 1, 1); label_shape[0] = param_.label_length * (dshape[0] / param_.input_length); SHAPE_ASSIGN_CHECK(*in_shape, warpctc_enum::kLabel, label_shape); @@ -263,9 +256,9 @@ class WarpCTCProp : public OperatorProperty { return true; } - virtual bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const { + virtual bool InferType(std::vector* in_type, + std::vector* out_type, + std::vector* aux_type) const { CHECK_LE(in_type->size(), this->ListArguments().size()); in_type->clear(); in_type->push_back(mshadow::kFloat32); @@ -275,13 +268,12 @@ class WarpCTCProp : public OperatorProperty { return true; } - std::vector BackwardResource( - const mxnet::ShapeVector &in_shape) const override { + std::vector BackwardResource(const mxnet::ShapeVector& in_shape) const override { return {ResourceRequest::kTempSpace}; } OperatorProperty* Copy() const override { - auto ptr = new WarpCTCProp(); + auto ptr = new WarpCTCProp(); ptr->param_ = param_; return ptr; } @@ -290,14 +282,11 @@ class WarpCTCProp : public OperatorProperty { return "WarpCTC"; } - - std::vector DeclareBackwardDependency(const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) - const override { - return {in_data[warpctc_enum::kData], - in_data[warpctc_enum::kLabel], - out_data[warpctc_enum::kOut]}; + std::vector DeclareBackwardDependency(const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data) const override { + return { + in_data[warpctc_enum::kData], in_data[warpctc_enum::kLabel], out_data[warpctc_enum::kOut]}; } Operator* CreateOperator(Context ctx) const override; diff --git a/plugin/warpctc/warpctc.cc b/plugin/warpctc/warpctc.cc index aac36a375a9e..754883ab226b 100644 --- a/plugin/warpctc/warpctc.cc +++ b/plugin/warpctc/warpctc.cc @@ -22,29 +22,29 @@ * \file warpctc.cc * \brief warpctc op * \author Liang Xiang -*/ + */ #include "./warpctc-inl.h" #include "../../src/operator/mshadow_op.h" namespace mxnet { namespace op { -template<> -Operator *CreateOp(WarpCTCParam param) { +template <> +Operator* CreateOp(WarpCTCParam param) { return new WarpCTCOp(param); } -Operator *WarpCTCProp::CreateOperator(Context ctx) const { +Operator* WarpCTCProp::CreateOperator(Context ctx) const { DO_BIND_DISPATCH(CreateOp, param_); } 
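// The Backward implementation above follows warp-ctc's two-call protocol:
// size the scratch space with get_workspace_size, then run compute_ctc_loss
// over the same length arrays. A minimal CPU-only sketch under that
// assumption (signatures as used in this file; the wrapper name is
// hypothetical):
#include <ctc.h>
#include <vector>
static float CtcLossCpu(const float* activations, float* grads, const int* labels,
                        const std::vector<int>& label_lengths,
                        const std::vector<int>& input_lengths, int alphabet_size) {
  ctcOptions info{};
  info.loc         = CTC_CPU;
  info.num_threads = 1;
  info.blank_label = 0;
  size_t bytes = 0;
  // First call only reports how much workspace the loss computation needs.
  get_workspace_size(label_lengths.data(), input_lengths.data(), alphabet_size,
                     static_cast<int>(input_lengths.size()), info, &bytes);
  std::vector<char> workspace(bytes);
  std::vector<float> costs(input_lengths.size());
  compute_ctc_loss(activations, grads, labels, label_lengths.data(),
                   input_lengths.data(), alphabet_size,
                   static_cast<int>(input_lengths.size()), costs.data(),
                   workspace.data(), info);
  float total = 0.f;
  for (float c : costs)
    total += c;  // warp-ctc reports one cost per minibatch element
  return total;
}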
DMLC_REGISTER_PARAMETER(WarpCTCParam); MXNET_REGISTER_OP_PROPERTY(WarpCTC, WarpCTCProp) -.add_argument("data", "NDArray-or-Symbol", "Input data.") -.add_argument("label", "NDArray-or-Symbol", "Input label.") -.describe("warp ctc.") -.add_arguments(WarpCTCParam::__FIELDS__()); + .add_argument("data", "NDArray-or-Symbol", "Input data.") + .add_argument("label", "NDArray-or-Symbol", "Input label.") + .describe("warp ctc.") + .add_arguments(WarpCTCParam::__FIELDS__()); } // namespace op } // namespace mxnet diff --git a/plugin/warpctc/warpctc.cu b/plugin/warpctc/warpctc.cu index 3ee20fc9d3fe..093a1498823a 100644 --- a/plugin/warpctc/warpctc.cu +++ b/plugin/warpctc/warpctc.cu @@ -22,15 +22,15 @@ * \file warpctc.cc * \brief warpctc op * \author Liang Xiang -*/ + */ #include "./warpctc-inl.h" #include #include "../../src/operator/mshadow_op.h" namespace mxnet { namespace op { -template<> -Operator *CreateOp(WarpCTCParam param) { +template <> +Operator* CreateOp(WarpCTCParam param) { return new WarpCTCOp(param); } From 2ae66a0954a506db90d99f5fe26ffdb8197ce1fd Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Thu, 4 Nov 2021 09:02:09 +0100 Subject: [PATCH 07/10] [TOOLS] Re-format .cc .h files --- tools/im2rec.cc | 275 +++++++++++++++++++++++++++--------------------- 1 file changed, 156 insertions(+), 119 deletions(-) diff --git a/tools/im2rec.cc b/tools/im2rec.cc index 1c4071a23eee..db8df4481582 100644 --- a/tools/im2rec.cc +++ b/tools/im2rec.cc @@ -42,51 +42,68 @@ #include "../src/io/image_recordio.h" #include /*! - *\brief get interpolation method with given inter_method, 0-CV_INTER_NN 1-CV_INTER_LINEAR 2-CV_INTER_CUBIC - *\ 3-CV_INTER_AREA 4-CV_INTER_LANCZOS4 9-AUTO(cubic for enlarge, area for shrink, bilinear for others) 10-RAND(0-4) + *\brief get interpolation method with given inter_method, 0-CV_INTER_NN 1-CV_INTER_LINEAR + *2-CV_INTER_CUBIC \ 3-CV_INTER_AREA 4-CV_INTER_LANCZOS4 9-AUTO(cubic for enlarge, area for shrink, + *bilinear for others) 10-RAND(0-4) */ -int GetInterMethod(int inter_method, int old_width, int old_height, int new_width, int new_height, std::mt19937& prnd) { - if (inter_method == 9) { - if (new_width > old_width && new_height > old_height) { - return 2; // CV_INTER_CUBIC for enlarge - } else if (new_width rand_uniform_int(0, 4); - return rand_uniform_int(prnd); +int GetInterMethod(int inter_method, + int old_width, + int old_height, + int new_width, + int new_height, + std::mt19937& prnd) { + if (inter_method == 9) { + if (new_width > old_width && new_height > old_height) { + return 2; // CV_INTER_CUBIC for enlarge + } else if (new_width < old_width && new_height < old_height) { + return 3; // CV_INTER_AREA for shrink } else { - return inter_method; + return 1; // CV_INTER_LINEAR for others } + } else if (inter_method == 10) { + std::uniform_int_distribution rand_uniform_int(0, 4); + return rand_uniform_int(prnd); + } else { + return inter_method; + } } -int main(int argc, char *argv[]) { +int main(int argc, char* argv[]) { if (argc < 4) { - printf("Usage: [additional parameters in form key=value]\n"\ - "Possible additional parameters:\n"\ - "\tcolor=USE_COLOR[default=1] Force color (1), gray image (0) or keep source unchanged (-1).\n"\ - "\tresize=newsize resize the shorter edge of image to the newsize, original images will be packed by default\n"\ - "\tlabel_width=WIDTH[default=1] specify the label_width in the list, by default set to 1\n"\ - "\tpack_label=PACK_LABEL[default=0] whether to also pack multi dimenional label in the record file\n"\ - 
"\tnsplit=NSPLIT[default=1] used for part generation, logically split the image.list to NSPLIT parts by position\n"\ - "\tpart=PART[default=0] used for part generation, pack the images from the specific part in image.list\n"\ - "\tcenter_crop=CENTER_CROP[default=0] specify whether to crop the center image to make it square.\n"\ - "\tquality=QUALITY[default=95] JPEG quality for encoding (1-100, default: 95) or PNG compression for encoding (1-9, default: 3).\n"\ - "\tencoding=ENCODING[default='.jpg'] Encoding type. Can be '.jpg' or '.png'\n"\ - "\tinter_method=INTER_METHOD[default=1] NN(0) BILINEAR(1) CUBIC(2) AREA(3) LANCZOS4(4) AUTO(9) RAND(10).\n"\ - "\tunchanged=UNCHANGED[default=0] Keep the original image encoding, size and color. If set to 1, it will ignore the others parameters.\n"); + printf( + "Usage: [additional parameters in form " + "key=value]\n" + "Possible additional parameters:\n" + "\tcolor=USE_COLOR[default=1] Force color (1), gray image (0) or keep source unchanged " + "(-1).\n" + "\tresize=newsize resize the shorter edge of image to the newsize, original images will be " + "packed by default\n" + "\tlabel_width=WIDTH[default=1] specify the label_width in the list, by default set to 1\n" + "\tpack_label=PACK_LABEL[default=0] whether to also pack multi dimenional label in the " + "record file\n" + "\tnsplit=NSPLIT[default=1] used for part generation, logically split the image.list to " + "NSPLIT parts by position\n" + "\tpart=PART[default=0] used for part generation, pack the images from the specific part " + "in image.list\n" + "\tcenter_crop=CENTER_CROP[default=0] specify whether to crop the center image to make it " + "square.\n" + "\tquality=QUALITY[default=95] JPEG quality for encoding (1-100, default: 95) or PNG " + "compression for encoding (1-9, default: 3).\n" + "\tencoding=ENCODING[default='.jpg'] Encoding type. Can be '.jpg' or '.png'\n" + "\tinter_method=INTER_METHOD[default=1] NN(0) BILINEAR(1) CUBIC(2) AREA(3) LANCZOS4(4) " + "AUTO(9) RAND(10).\n" + "\tunchanged=UNCHANGED[default=0] Keep the original image encoding, size and color. 
If set " + "to 1, it will ignore the others parameters.\n"); return 0; } - int label_width = 1; - int pack_label = 0; - int new_size = -1; - int nsplit = 1; - int partid = 0; - int center_crop = 0; - int quality = 95; - int color_mode = CV_LOAD_IMAGE_COLOR; - int unchanged = 0; + int label_width = 1; + int pack_label = 0; + int new_size = -1; + int nsplit = 1; + int partid = 0; + int center_crop = 0; + int quality = 95; + int color_mode = CV_LOAD_IMAGE_COLOR; + int unchanged = 0; int inter_method = CV_INTER_LINEAR; std::string encoding(".jpg"); for (int i = 4; i < argc; ++i) { @@ -100,17 +117,28 @@ int main(int argc, char *argv[]) { #endif if (effct_len == 2) { - if (!strcmp(key, "resize")) new_size = atoi(val); - if (!strcmp(key, "label_width")) label_width = atoi(val); - if (!strcmp(key, "pack_label")) pack_label = atoi(val); - if (!strcmp(key, "nsplit")) nsplit = atoi(val); - if (!strcmp(key, "part")) partid = atoi(val); - if (!strcmp(key, "center_crop")) center_crop = atoi(val); - if (!strcmp(key, "quality")) quality = atoi(val); - if (!strcmp(key, "color")) color_mode = atoi(val); - if (!strcmp(key, "encoding")) encoding = std::string(val); - if (!strcmp(key, "unchanged")) unchanged = atoi(val); - if (!strcmp(key, "inter_method")) inter_method = atoi(val); + if (!strcmp(key, "resize")) + new_size = atoi(val); + if (!strcmp(key, "label_width")) + label_width = atoi(val); + if (!strcmp(key, "pack_label")) + pack_label = atoi(val); + if (!strcmp(key, "nsplit")) + nsplit = atoi(val); + if (!strcmp(key, "part")) + partid = atoi(val); + if (!strcmp(key, "center_crop")) + center_crop = atoi(val); + if (!strcmp(key, "quality")) + quality = atoi(val); + if (!strcmp(key, "color")) + color_mode = atoi(val); + if (!strcmp(key, "encoding")) + encoding = std::string(val); + if (!strcmp(key, "unchanged")) + unchanged = atoi(val); + if (!strcmp(key, "inter_method")) + inter_method = atoi(val); } } // Check parameters ranges @@ -140,43 +168,42 @@ int main(int argc, char *argv[]) { LOG(INFO) << "Encoding is " << encoding; if (encoding == std::string(".png") && quality > 9) { - quality = 3; + quality = 3; } if (inter_method != 1) { - switch (inter_method) { - case 0: - LOG(INFO) << "Use inter_method CV_INTER_NN"; - break; - case 2: - LOG(INFO) << "Use inter_method CV_INTER_CUBIC"; - break; - case 3: - LOG(INFO) << "Use inter_method CV_INTER_AREA"; - break; - case 4: - LOG(INFO) << "Use inter_method CV_INTER_LANCZOS4"; - break; - case 9: - LOG(INFO) << "Use inter_method mod auto(cubic for enlarge, area for shrink)"; - break; - case 10: - LOG(INFO) << "Use inter_method mod rand(nn/bilinear/cubic/area/lanczos4)"; - break; - default: - LOG(INFO) << "Unkown inter_method"; - return 0; - } + switch (inter_method) { + case 0: + LOG(INFO) << "Use inter_method CV_INTER_NN"; + break; + case 2: + LOG(INFO) << "Use inter_method CV_INTER_CUBIC"; + break; + case 3: + LOG(INFO) << "Use inter_method CV_INTER_AREA"; + break; + case 4: + LOG(INFO) << "Use inter_method CV_INTER_LANCZOS4"; + break; + case 9: + LOG(INFO) << "Use inter_method mod auto(cubic for enlarge, area for shrink)"; + break; + case 10: + LOG(INFO) << "Use inter_method mod rand(nn/bilinear/cubic/area/lanczos4)"; + break; + default: + LOG(INFO) << "Unkown inter_method"; + return 0; + } } std::random_device rd; std::mt19937 prnd(rd()); using namespace dmlc; const static size_t kBufferSize = 1 << 20UL; - std::string root = argv[2]; + std::string root = argv[2]; mxnet::io::ImageRecordIO rec; - size_t imcnt = 0; - double tstart = dmlc::GetTime(); - 
dmlc::InputSplit *flist = dmlc::InputSplit:: - Create(argv[1], partid, nsplit, "text"); + size_t imcnt = 0; + double tstart = dmlc::GetTime(); + dmlc::InputSplit* flist = dmlc::InputSplit::Create(argv[1], partid, nsplit, "text"); std::ostringstream os; if (nsplit == 1) { os << argv[3]; @@ -184,7 +211,7 @@ int main(int argc, char *argv[]) { os << argv[3] << ".part" << std::setw(3) << std::setfill('0') << partid; } LOG(INFO) << "Write to output: " << os.str(); - dmlc::Stream *fo = dmlc::Stream::Create(os.str().c_str(), "w"); + dmlc::Stream* fo = dmlc::Stream::Create(os.str().c_str(), "w"); LOG(INFO) << "Output: " << os.str(); dmlc::RecordIOWriter writer(fo); std::string fname, path, blob; @@ -192,13 +219,13 @@ int main(int argc, char *argv[]) { std::vector encode_buf; std::vector encode_params; if (encoding == std::string(".png")) { - encode_params.push_back(CV_IMWRITE_PNG_COMPRESSION); - encode_params.push_back(quality); - LOG(INFO) << "PNG encoding compression: " << quality; + encode_params.push_back(CV_IMWRITE_PNG_COMPRESSION); + encode_params.push_back(quality); + LOG(INFO) << "PNG encoding compression: " << quality; } else { - encode_params.push_back(CV_IMWRITE_JPEG_QUALITY); - encode_params.push_back(quality); - LOG(INFO) << "JPEG encoding quality: " << quality; + encode_params.push_back(CV_IMWRITE_JPEG_QUALITY); + encode_params.push_back(quality); + LOG(INFO) << "JPEG encoding quality: " << quality; } dmlc::InputSplit::Blob line; std::vector label_buf(label_width, 0.f); @@ -206,32 +233,32 @@ int main(int argc, char *argv[]) { while (flist->NextRecord(&line)) { std::string sline(static_cast(line.dptr), line.size); std::istringstream is(sline); - if (!(is >> rec.header.image_id[0] >> rec.header.label)) continue; + if (!(is >> rec.header.image_id[0] >> rec.header.label)) + continue; label_buf[0] = rec.header.label; for (int k = 1; k < label_width; ++k) { - CHECK(is >> label_buf[k]) - << "Invalid ImageList, did you provide the correct label_width?"; + CHECK(is >> label_buf[k]) << "Invalid ImageList, did you provide the correct label_width?"; } - if (pack_label) rec.header.flag = label_width; + if (pack_label) + rec.header.flag = label_width; rec.SaveHeader(&blob); if (pack_label) { size_t bsize = blob.size(); - blob.resize(bsize + label_buf.size()*sizeof(float)); - memcpy(BeginPtr(blob) + bsize, - BeginPtr(label_buf), label_buf.size()*sizeof(float)); + blob.resize(bsize + label_buf.size() * sizeof(float)); + memcpy(BeginPtr(blob) + bsize, BeginPtr(label_buf), label_buf.size() * sizeof(float)); } CHECK(std::getline(is, fname)); // eliminate invalid chars in the end - while (fname.length() != 0 && - (isspace(*fname.rbegin()) || !isprint(*fname.rbegin()))) { + while (fname.length() != 0 && (isspace(*fname.rbegin()) || !isprint(*fname.rbegin()))) { fname.resize(fname.length() - 1); } // eliminate invalid chars in beginning. 
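// Record layout assembled into `blob` by this loop before writer.WriteRecord:
//   [ImageRecordIO header | label_width floats when pack_label is set | image bytes]
// where the image bytes are the re-encoded cv::Mat when unchanged != 1, or the
// raw file contents from decode_buf when unchanged == 1.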
- const char *p = fname.c_str(); - while (isspace(*p)) ++p; + const char* p = fname.c_str(); + while (isspace(*p)) + ++p; path = root + p; // use "r" is equal to rb in dmlc::Stream - dmlc::Stream *fi = dmlc::Stream::Create(path.c_str(), "r"); + dmlc::Stream* fi = dmlc::Stream::Create(path.c_str(), "r"); decode_buf.clear(); size_t imsize = 0; while (true) { @@ -239,11 +266,11 @@ int main(int argc, char *argv[]) { size_t nread = fi->Read(BeginPtr(decode_buf) + imsize, kBufferSize); imsize += nread; decode_buf.resize(imsize); - if (nread != kBufferSize) break; + if (nread != kBufferSize) + break; } delete fi; - if (unchanged != 1) { cv::Mat img = cv::imdecode(decode_buf, color_mode); CHECK(img.data != nullptr) << "OpenCV decode fail:" << path; @@ -251,28 +278,40 @@ int main(int argc, char *argv[]) { if (new_size > 0) { if (center_crop) { if (img.rows > img.cols) { - int margin = (img.rows - img.cols)/2; - img = img(cv::Range(margin, margin+img.cols), cv::Range(0, img.cols)); + int margin = (img.rows - img.cols) / 2; + img = img(cv::Range(margin, margin + img.cols), cv::Range(0, img.cols)); } else { - int margin = (img.cols - img.rows)/2; - img = img(cv::Range(0, img.rows), cv::Range(margin, margin + img.rows)); + int margin = (img.cols - img.rows) / 2; + img = img(cv::Range(0, img.rows), cv::Range(margin, margin + img.rows)); } } int interpolation_method = 1; if (img.rows > img.cols) { - if (img.cols != new_size) { - interpolation_method = GetInterMethod(inter_method, img.cols, img.rows, new_size, img.rows * new_size / img.cols, prnd); - cv::resize(img, res, cv::Size(new_size, img.rows * new_size / img.cols), 0, 0, interpolation_method); - } else { - res = img.clone(); - } + if (img.cols != new_size) { + interpolation_method = GetInterMethod( + inter_method, img.cols, img.rows, new_size, img.rows * new_size / img.cols, prnd); + cv::resize(img, + res, + cv::Size(new_size, img.rows * new_size / img.cols), + 0, + 0, + interpolation_method); + } else { + res = img.clone(); + } } else { - if (img.rows != new_size) { - interpolation_method = GetInterMethod(inter_method, img.cols, img.rows, new_size * img.cols / img.rows, new_size, prnd); - cv::resize(img, res, cv::Size(new_size * img.cols / img.rows, new_size), 0, 0, interpolation_method); - } else { - res = img.clone(); - } + if (img.rows != new_size) { + interpolation_method = GetInterMethod( + inter_method, img.cols, img.rows, new_size * img.cols / img.rows, new_size, prnd); + cv::resize(img, + res, + cv::Size(new_size * img.cols / img.rows, new_size), + 0, + 0, + interpolation_method); + } else { + res = img.clone(); + } } } encode_buf.clear(); @@ -281,13 +320,11 @@ int main(int argc, char *argv[]) { // write buffer size_t bsize = blob.size(); blob.resize(bsize + encode_buf.size()); - memcpy(BeginPtr(blob) + bsize, - BeginPtr(encode_buf), encode_buf.size()); + memcpy(BeginPtr(blob) + bsize, BeginPtr(encode_buf), encode_buf.size()); } else { size_t bsize = blob.size(); blob.resize(bsize + decode_buf.size()); - memcpy(BeginPtr(blob) + bsize, - BeginPtr(decode_buf), decode_buf.size()); + memcpy(BeginPtr(blob) + bsize, BeginPtr(decode_buf), decode_buf.size()); } writer.WriteRecord(BeginPtr(blob), blob.size()); // write header From a47156d66767088adf522234bef95572e5f98de9 Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Fri, 5 Nov 2021 07:22:27 +0100 Subject: [PATCH 08/10] Clang-format fix --- src/engine/threaded_engine_pooled.cc | 2 +- src/kvstore/kvstore_dist.h | 8 +++--- src/operator/contrib/bilinear_resize-inl.h | 6 ++-- 
src/operator/contrib/bounding_box.cu | 4 +-- src/operator/contrib/multi_lamb.cc | 8 +++--- src/operator/contrib/multi_lans.cc | 8 +++--- src/operator/nn/batch_norm.cu | 8 +++--- src/operator/nn/dnnl/dnnl_base.cc | 12 ++++---- src/operator/nn/dnnl/dnnl_rnn.cc | 28 +++++++++---------- src/operator/nn/softmax-inl.h | 4 +-- src/operator/optimizer_op.cc | 2 +- src/operator/optimizer_op.cu | 2 +- src/operator/subgraph/dnnl/dnnl_conv.cc | 2 +- .../subgraph/tensorrt/onnx_to_tensorrt.h | 2 ++ 14 files changed, 49 insertions(+), 47 deletions(-) diff --git a/src/engine/threaded_engine_pooled.cc b/src/engine/threaded_engine_pooled.cc index fd29f6daacc3..21dc470b708a 100644 --- a/src/engine/threaded_engine_pooled.cc +++ b/src/engine/threaded_engine_pooled.cc @@ -155,7 +155,7 @@ class ThreadedEnginePooled : public ThreadedEngine { bool is_copy = (opr_block->opr->prop == FnProperty::kCopyFromGPU || opr_block->opr->prop == FnProperty::kCopyToGPU); auto&& rctx = is_copy ? streams_->GetIORunContext(opr_block->ctx) : - streams_->GetRunContext(opr_block->ctx); + streams_->GetRunContext(opr_block->ctx); #if MXNET_USE_CUDA CallbackOnStart on_start; CallbackOnComplete callback; diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index a80176494e1b..27ddb82547a2 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -508,16 +508,16 @@ class KVStoreDist : public KVStoreLocal { const int dtype = recv_buf.dtype(); const int num_bytes = mshadow::mshadow_sizeof(dtype); PSKV& pskv = (gradient_compression_->get_type() == CompressionType::kNone) ? - EncodeDefaultKey(key, size, num_bytes) : - EncodeCompressedKey(key, size, false, num_bytes); - char* data = static_cast(recv_buf.data().dptr_); + EncodeDefaultKey(key, size, num_bytes) : + EncodeCompressedKey(key, size, false, num_bytes); + char* data = static_cast(recv_buf.data().dptr_); // false means not to delete data when SArray is deleted auto vals = new ps::SArray(data, size * num_bytes, false); // issue pull RequestType mode = (gradient_compression_->get_type() != CompressionType::kNone) ? RequestType::kCompressedPushPull : RequestType::kDefaultPushPull; - const int cmd = GetCommandType(mode, dtype); + const int cmd = GetCommandType(mode, dtype); CHECK_NOTNULL(ps_worker_)->ZPull(pskv.keys, vals, &pskv.lens, cmd, [vals, cb]() { delete vals; cb(); diff --git a/src/operator/contrib/bilinear_resize-inl.h b/src/operator/contrib/bilinear_resize-inl.h index be57acc36ce1..8afb63eff30b 100644 --- a/src/operator/contrib/bilinear_resize-inl.h +++ b/src/operator/contrib/bilinear_resize-inl.h @@ -273,9 +273,9 @@ static bool BilinearSampleOpInferShape(const nnvm::NodeAttrs& attrs, new_height = ((dshape[2] % 2) == 0) ? (int16_t)(dshape[2] * param.scale_height.value()) : (int16_t)((dshape[2] - 1) * param.scale_height.value()) + 1; - new_width = ((dshape[3] % 2) == 0) ? - (int16_t)(dshape[3] * param.scale_width.value()) : - (int16_t)((dshape[3] - 1) * param.scale_width.value()) + 1; + new_width = ((dshape[3] % 2) == 0) ? 
+ (int16_t)(dshape[3] * param.scale_width.value()) : + (int16_t)((dshape[3] - 1) * param.scale_width.value()) + 1; break; } case bilinear_resize::like: { diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index ef2b7be50a37..e39e69c6fbbc 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -490,8 +490,8 @@ __launch_bounds__(NMS::THRESHOLD) __global__ for (int i = 0; i < n_threads / warp_size; ++i) { uint32_t my_mask = my_next_mask; my_next_mask = (((i + 1) < n_threads / warp_size) && (my_element_in_batch < topk)) ? - nms_results[(i + 1) * topk * num_batches + my_element] : - full_mask; + nms_results[(i + 1) * topk * num_batches + my_element] : + full_mask; if (my_warp == i && !__all_sync(full_mask, my_mask == full_mask)) { my_mask = my_mask | earlier_threads_mask; // Loop over warp_size - 1 because the last diff --git a/src/operator/contrib/multi_lamb.cc b/src/operator/contrib/multi_lamb.cc index 866567d6aa21..91920079a77f 100644 --- a/src/operator/contrib/multi_lamb.cc +++ b/src/operator/contrib/multi_lamb.cc @@ -44,8 +44,8 @@ struct MultiLAMBKernelStep1 { using namespace mshadow_op; for (size_t index = 0; index < kernel_params.ntensors; ++index) { if ((size_t)i < kernel_params.sizes[index]) { - MPDType w = has_mixed_precision ? kernel_params.weights32[index][i] : - MPDType(kernel_params.weights[index][i]); + MPDType w = has_mixed_precision ? kernel_params.weights32[index][i] : + MPDType(kernel_params.weights[index][i]); MPDType scaled_grad = static_cast<MPDType>(kernel_params.grads[index][i]) * rescale_grad; if (clip_gradient >= 0.0f) scaled_grad = mshadow_op::clip::Map(scaled_grad, static_cast<MPDType>(clip_gradient)); @@ -93,8 +93,8 @@ struct MultiLAMBKernelStep2 { if ((size_t)i < kernel_params.sizes[index]) { MPDType w = has_mixed_precision ? kernel_params.weights32[index][i] : MPDType(kernel_params.weights[index][i]); - float r1 = sqrt(sum_sq_weigths[index]); - float r2 = sqrt(sum_sq_temp_g[index]); + float r1 = sqrt(sum_sq_weigths[index]); + float r2 = sqrt(sum_sq_temp_g[index]); if (lower_bound >= 0) r1 = std::max(r1, lower_bound); if (upper_bound >= 0) diff --git a/src/operator/contrib/multi_lans.cc b/src/operator/contrib/multi_lans.cc index a7bb3ab69a77..4cc88928ff93 100644 --- a/src/operator/contrib/multi_lans.cc +++ b/src/operator/contrib/multi_lans.cc @@ -45,8 +45,8 @@ struct MultiLANSKernelStep1 { using namespace mshadow_op; for (size_t index = 0; index < kernel_params.ntensors; ++index) { if ((size_t)i < kernel_params.sizes[index]) { - MPDType w = has_mixed_precision ? kernel_params.weights32[index][i] : - MPDType(kernel_params.weights[index][i]); + MPDType w = has_mixed_precision ? kernel_params.weights32[index][i] : + MPDType(kernel_params.weights[index][i]); float g_norm = sqrt(g_sq_norm[index]); MPDType scaled_grad = static_cast<MPDType>(kernel_params.grads[index][i]) * rescale_grad; scaled_grad /= g_norm; @@ -95,8 +95,8 @@ struct MultiLANSKernelStep2 { const OpReqType req) { for (size_t index = 0; index < kernel_params.ntensors; ++index) { if ((size_t)i < kernel_params.sizes[index]) { - MPDType w = has_mixed_precision ? kernel_params.weights32[index][i] : - MPDType(kernel_params.weights[index][i]); + MPDType w = has_mixed_precision ? 
kernel_params.weights32[index][i] : + MPDType(kernel_params.weights[index][i]); float r1 = sqrt(sum_sq_weigths[index]); float r2_m = sqrt(sum_sq_temp_m[index]); float r2_g = sqrt(sum_sq_temp_g[index]); diff --git a/src/operator/nn/batch_norm.cu b/src/operator/nn/batch_norm.cu index 29f3f61b6808..6ff71aae18bd 100644 --- a/src/operator/nn/batch_norm.cu +++ b/src/operator/nn/batch_norm.cu @@ -282,7 +282,7 @@ __launch_bounds__(inference_forward_threads) __global__ AType invstd = small_num_channels ? saved_invstd[my_channel] : variance_to_invstd(runningVar[my_channel], epsilon); - AType mean = small_num_channels ? saved_mean[my_channel] : runningMean[my_channel]; + AType mean = small_num_channels ? saved_mean[my_channel] : runningMean[my_channel]; AType gamma = small_num_channels ? saved_weight[my_channel] : @@ -349,8 +349,8 @@ __global__ void BatchNormalizationUpdateOutputKernel(DeviceTensor input, const AccReal gamma = ((flags & FIX_GAMMA_FLAG) == 0 && weight.numElements() > 0) ? ScalarConvert<DType, AccReal>::to(weight[plane]) : ScalarConvert<int, AccReal>::to(1); - const AccReal beta = bias.numElements() > 0 ? ScalarConvert<DType, AccReal>::to(bias[plane]) : - ScalarConvert<int, AccReal>::to(0); + const AccReal beta = bias.numElements() > 0 ? ScalarConvert<DType, AccReal>::to(bias[plane]) : + ScalarConvert<int, AccReal>::to(0); for (int batch = 0, nbatch = input.OuterSize(); batch < nbatch; ++batch) { for (int x = threadIdx.x, nx = input.InnerSize(); x < nx; x += blockDim.x) { const DType inp = input.get_ref(batch, plane, x); @@ -651,7 +651,7 @@ static __global__ void BatchNormalizationBackwardKernel(const DeviceTensor input const AccReal weightVal = ((flags & FIX_GAMMA_FLAG) == 0 && tensors.weight.numElements() > 0) ? ScalarConvert<DType, AccReal>::to(tensors.weight[plane]) : AccReal(1); - const AccReal norm = AccReal(1) / N; + const AccReal norm = AccReal(1) / N; // Compute two values across (batch, x/y/z) in one pass: // 1. Sum(gradOutput) diff --git a/src/operator/nn/dnnl/dnnl_base.cc b/src/operator/nn/dnnl/dnnl_base.cc index 54af44c80fe4..adcd8f2751d9 100644 --- a/src/operator/nn/dnnl/dnnl_base.cc +++ b/src/operator/nn/dnnl/dnnl_base.cc @@ -242,19 +242,19 @@ const dnnl::memory* GetWeights(const NDArray& arr, int num_groups) { tz = dnnl::memory::dims{arr.shape()[O], arr.shape()[I]}; format_tag = dnnl::memory::format_tag::oi; } else if (ndim == 3) { - tz = num_groups > 1 ? - dnnl::memory::dims{ num_groups, arr.shape()[O] / num_groups, arr.shape()[I], arr.shape()[H]} : - dnnl::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H]}; + tz = num_groups > 1 ? + dnnl::memory::dims{ num_groups, arr.shape()[O] / num_groups, arr.shape()[I], arr.shape()[H]} : + dnnl::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H]}; format_tag = num_groups > 1 ? dnnl::memory::format_tag::goiw : dnnl::memory::format_tag::oiw; } else if (ndim == 4) { - tz = num_groups > 1 ? - dnnl::memory::dims{num_groups, arr.shape()[O] / num_groups, arr.shape()[I], arr.shape()[H], arr.shape()[W]} : - dnnl::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H], arr.shape()[W]}; + tz = num_groups > 1 ? + dnnl::memory::dims{num_groups, arr.shape()[O] / num_groups, arr.shape()[I], arr.shape()[H], arr.shape()[W]} : + dnnl::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H], arr.shape()[W]}; format_tag = num_groups > 1 ? dnnl::memory::format_tag::goihw : dnnl::memory::format_tag::oihw; } else if (ndim == 5) { tz = num_groups > 1 ? 
diff --git a/src/operator/nn/dnnl/dnnl_rnn.cc b/src/operator/nn/dnnl/dnnl_rnn.cc index 5ebad89089c3..051de78c7d5d 100644 --- a/src/operator/nn/dnnl/dnnl_rnn.cc +++ b/src/operator/nn/dnnl/dnnl_rnn.cc @@ -197,14 +197,14 @@ RnnPrimitive GetRnnFwdPrim(const DNNLRnnLayerParam& layer_param, auto src_cell_desc = memory::desc(layer_param.cell_dims, data_type, tag::ldnc); auto weight_peep_desc = memory::desc(); auto weight_proj_desc = layer_param.proj_size > 0 ? - memory::desc(layer_param.weight_proj_dims, weight_type, tag::any) : - memory::desc(); - auto dst_state_desc = layer_param.state_outputs ? - memory::desc(layer_param.state_dims, data_type, tag::ldnc) : - memory::desc(); - auto dst_cell_desc = layer_param.state_outputs ? - memory::desc(layer_param.cell_dims, data_type, tag::ldnc) : - memory::desc(); + memory::desc(layer_param.weight_proj_dims, weight_type, tag::any) : + memory::desc(); + auto dst_state_desc = layer_param.state_outputs ? + memory::desc(layer_param.state_dims, data_type, tag::ldnc) : + memory::desc(); + auto dst_cell_desc = layer_param.state_outputs ? + memory::desc(layer_param.cell_dims, data_type, tag::ldnc) : + memory::desc(); auto fwd = RnnPrimitive(); switch (mode) { @@ -266,8 +266,8 @@ RnnBwdPrimitive GetRnnBwdPrim(const DNNLRnnForwardTraining& fwd, memory::data_type weight_type = get_dnnl_type(params.dtype()); const prop_kind prop = prop_kind::backward; rnn_direction dnnl_rnn_direction = layer_param.bidirectional ? - rnn_direction::bidirectional_concat : - rnn_direction::unidirectional; + rnn_direction::bidirectional_concat : + rnn_direction::unidirectional; auto src_layer_desc = memory::desc(layer_param.src_dims, data_type, tag::tnc); auto weight_layer_desc = memory::desc(layer_param.weight_layer_dims, weight_type, tag::any); @@ -276,8 +276,8 @@ RnnBwdPrimitive GetRnnBwdPrim(const DNNLRnnForwardTraining& fwd, auto dst_layer_desc = memory::desc(layer_param.dst_dims, data_type, tag::tnc); auto src_state_desc = memory::desc(layer_param.state_dims, data_type, tag::ldnc); auto dst_state_desc = layer_param.state_outputs ? - memory::desc(layer_param.state_dims, data_type, tag::ldnc) : - memory::desc(); + memory::desc(layer_param.state_dims, data_type, tag::ldnc) : + memory::desc(); const void* fwd_pd = fwd.GetPrimDesc(); auto bwd = RnnBwdPrimitive(); @@ -1127,8 +1127,8 @@ void DNNLRnnOp::Forward(const OpContext& ctx, const int batch_size = default_param.batch_size_; const int state_size = default_param.state_size; const int iter_size = default_param.projection_size.has_value() ? - default_param.projection_size.value() : - default_param.state_size; + default_param.projection_size.value() : + default_param.state_size; const int directions = default_param.bidirectional ? 2 : 1; dnnl::memory::desc dst_desc({seq_length, batch_size, directions * iter_size}, get_dnnl_type(data_dtype), diff --git a/src/operator/nn/softmax-inl.h b/src/operator/nn/softmax-inl.h index 9ee41cb8f9a6..71c205539efd 100644 --- a/src/operator/nn/softmax-inl.h +++ b/src/operator/nn/softmax-inl.h @@ -853,8 +853,8 @@ __global__ void masked_softmax_grad_kernel(OType* out, for (index_t i = x; i < M; i += x_size) { bool mask_value = bcst_mask_axis ? in_mask[base_mask] : in_mask[base_mask + i * sa_mask]; final_result = negate ? -OP2::Map(ograd[base + i * sa], out[base + i * sa], ssum) : - OP2::Map(ograd[base + i * sa], out[base + i * sa], ssum); - final_result = mask_value ? 
final_result / static_cast<DType>(temperature) : DType(0.0f); + OP2::Map(ograd[base + i * sa], out[base + i * sa], ssum); + final_result = mask_value ? final_result / static_cast<DType>(temperature) : DType(0.0f); KERNEL_ASSIGN(igrad[base + i * sa], Req, final_result); } } diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc index c3fd47dadd17..ff5f4dd9f355 100644 --- a/src/operator/optimizer_op.cc +++ b/src/operator/optimizer_op.cc @@ -229,7 +229,7 @@ struct AdamStdDnsRspDnsKernel { for (index_t j = 0; j < row_length; j++) { const index_t data_i = row_i + j; DType grad_rescaled = non_zero ? static_cast<DType>(grad_data[grad_i + j] * rescale_grad) : - static_cast<DType>(0); + static_cast<DType>(0); if (clip_gradient >= 0.0f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index 4c75eb0c72fc..01bd6f8ff1a0 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -164,7 +164,7 @@ struct AdamStdDnsRspDnsKernel { (row_id == 0) ? prefix_sum[0] > 0 : prefix_sum[row_id] > prefix_sum[row_id - 1]; const RType grad_offset = (prefix_sum[row_id] - 1) * row_length + col_id; DType grad_rescaled = non_zero ? static_cast<DType>(grad_data[grad_offset] * rescale_grad) : - static_cast<DType>(0); + static_cast<DType>(0); if (clip_gradient >= 0.0f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } diff --git a/src/operator/subgraph/dnnl/dnnl_conv.cc b/src/operator/subgraph/dnnl/dnnl_conv.cc index e9fab47e6f44..bc1f6fdc5aa5 100644 --- a/src/operator/subgraph/dnnl/dnnl_conv.cc +++ b/src/operator/subgraph/dnnl/dnnl_conv.cc @@ -472,7 +472,7 @@ static void SgDNNLConvParamParser(nnvm::NodeAttrs* attrs) { auto& post_act_param = (param_.full_conv_param.dnnl_param.with_act && !with_act) ? param_.full_conv_param.act_param : param_.full_conv_param.postsum_act_param; - with_act = true; + with_act = true; if (node_name == "Activation") { const auto act_param = nnvm::get<ActivationParam>(node->attrs.parsed); post_act_param.alg = GetDNNLActAlgo(act_param); diff --git a/src/operator/subgraph/tensorrt/onnx_to_tensorrt.h b/src/operator/subgraph/tensorrt/onnx_to_tensorrt.h index c145273076b2..834b20a44165 100644 --- a/src/operator/subgraph/tensorrt/onnx_to_tensorrt.h +++ b/src/operator/subgraph/tensorrt/onnx_to_tensorrt.h @@ -73,11 +73,13 @@ class TRT_Logger : public nvinfer1::ILogger { time_t rawtime = std::time(0); char buf[256]; strftime(&buf[0], 256, "%Y-%m-%d %H:%M:%S", std::gmtime(&rawtime)); + // clang-format off const char* sevstr = (severity == Severity::kINTERNAL_ERROR ? " BUG" : severity == Severity::kERROR ? " ERROR" : severity == Severity::kWARNING ? "WARNING" : severity == Severity::kINFO ? 
" INFO" : "UNKNOWN"); + // clang-format on (*_ostream) << "[" << buf << " " << sevstr << "] " << msg << std::endl; } } From a9fcaf318820e2705dd2267561c4f1687d2aaf49 Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Fri, 5 Nov 2021 07:42:24 +0100 Subject: [PATCH 09/10] Sanity-cpp fix --- cpp-package/include/mxnet-cpp/contrib.h | 4 ++-- cpp-package/include/mxnet-cpp/initializer.h | 4 +++- include/mxnet/operator.h | 4 ++-- plugin/torch/torch_module-inl.h | 2 +- src/operator/contrib/adamw.cu | 4 +++- src/operator/tensor/square_sum.cc | 4 +++- src/operator/tensor/square_sum.cu | 4 +++- tests/cpp/include/test_perf.h | 1 - tests/cpp/include/test_util.h | 1 - tests/cpp/operator/batchnorm_test.cc | 2 -- 10 files changed, 17 insertions(+), 13 deletions(-) diff --git a/cpp-package/include/mxnet-cpp/contrib.h b/cpp-package/include/mxnet-cpp/contrib.h index c6ca3b834b14..b754ab5e5725 100644 --- a/cpp-package/include/mxnet-cpp/contrib.h +++ b/cpp-package/include/mxnet-cpp/contrib.h @@ -59,10 +59,10 @@ namespace contrib { // needs to be same with // https://github.com/apache/incubator-mxnet/blob/1c874cfc807cee755c38f6486e8e0f4d94416cd8/src/operator/subgraph/tensorrt/tensorrt-inl.h#L190 -static const std::string TENSORRT_SUBGRAPH_PARAM_IDENTIFIER = "subgraph_params_names"; +static const std::string TENSORRT_SUBGRAPH_PARAM_IDENTIFIER = "subgraph_params_names"; // NOLINT // needs to be same with // https://github.com/apache/incubator-mxnet/blob/master/src/operator/subgraph/tensorrt/tensorrt.cc#L244 -static const std::string TENSORRT_SUBGRAPH_PARAM_PREFIX = "subgraph_param_"; +static const std::string TENSORRT_SUBGRAPH_PARAM_PREFIX = "subgraph_param_"; // NOLINT /*! * this is a mimic to * https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/contrib/tensorrt.py#L37 diff --git a/cpp-package/include/mxnet-cpp/initializer.h b/cpp-package/include/mxnet-cpp/initializer.h index 5f509c2aa5a0..78ed2dfdecc8 100644 --- a/cpp-package/include/mxnet-cpp/initializer.h +++ b/cpp-package/include/mxnet-cpp/initializer.h @@ -197,7 +197,9 @@ class Xavier : public Initializer { enum RandType { gaussian, uniform } rand_type; enum FactorType { avg, in, out } factor_type; float magnitude; - Xavier(RandType rand_type = gaussian, FactorType factor_type = avg, float magnitude = 3) + Xavier(RandType rand_type = gaussian, // NOLINT + FactorType factor_type = avg, // NOLINT + float magnitude = 3) // NOLINT : rand_type(rand_type), factor_type(factor_type), magnitude(magnitude) {} void operator()(const std::string& name, NDArray* arr) override { diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index 268460fd7c25..a5ab13945899 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -109,8 +109,8 @@ class Operator { LOG(FATAL) << "Backward is not implemented"; } /*! 
\return [Deprecated] execution type of the operator */ - virtual ExecType exec_type() - const final { // NOLINT(*) exec_type has been moved to OperatorProperty + virtual ExecType exec_type() // NOLINT(*) exec_type has been moved to OperatorProperty + const final { // NOLINT(*) exec_type has been moved to OperatorProperty return ExecType::kSync; } }; diff --git a/plugin/torch/torch_module-inl.h b/plugin/torch/torch_module-inl.h index ef13493ba56b..57406811b484 100644 --- a/plugin/torch/torch_module-inl.h +++ b/plugin/torch/torch_module-inl.h @@ -208,7 +208,7 @@ class TorchModuleOp : public Operator { // iterate the grad of params lua_pushnil(L); it = in_grad.begin() + param_.num_data; - ; + while (lua_next(L, -2)) { TorchTensor::SetInternal( torchState_, diff --git a/src/operator/contrib/adamw.cu b/src/operator/contrib/adamw.cu index b67ea10e26a3..802378839bc2 100644 --- a/src/operator/contrib/adamw.cu +++ b/src/operator/contrib/adamw.cu @@ -28,8 +28,10 @@ namespace mxnet { namespace op { namespace adamw { +// clang-format off template <> -void GetScaleFloat<gpu>(mshadow::Stream<gpu>* s, const TBlob& scale_blob, float* pScalef){ +void GetScaleFloat<gpu>(mshadow::Stream<gpu>* s, const TBlob& scale_blob, float* pScalef) { + // clang-format on MSHADOW_REAL_TYPE_SWITCH( scale_blob.type_flag_, DType, diff --git a/src/operator/tensor/square_sum.cc b/src/operator/tensor/square_sum.cc index 05917b6c1382..6efef0af1266 100644 --- a/src/operator/tensor/square_sum.cc +++ b/src/operator/tensor/square_sum.cc @@ -26,8 +26,10 @@ namespace mxnet { namespace op { +// clang-format off template <> -void CheckSameIdx<cpu>(const OpContext& ctx, const TBlob& ograd_row_idx, const TBlob& in_row_idx){ +void CheckSameIdx<cpu>(const OpContext& ctx, const TBlob& ograd_row_idx, const TBlob& in_row_idx) { + // clang-format on MSHADOW_IDX_TYPE_SWITCH(ograd_row_idx.type_flag_, IType, { diff --git a/src/operator/tensor/square_sum.cu b/src/operator/tensor/square_sum.cu index d41f0aa02918..1cb27c1b9f0c 100644 --- a/src/operator/tensor/square_sum.cu +++ b/src/operator/tensor/square_sum.cu @@ -26,8 +26,10 @@ namespace mxnet { namespace op { +// clang-format off template <> -void CheckSameIdx<gpu>(const OpContext& ctx, const TBlob& ograd_row_idx, const TBlob& in_row_idx){ +void CheckSameIdx<gpu>(const OpContext& ctx, const TBlob& ograd_row_idx, const TBlob& in_row_idx) { +// clang-format on MSHADOW_IDX_TYPE_SWITCH(ograd_row_idx.type_flag_, IType, { diff --git a/tests/cpp/include/test_perf.h b/tests/cpp/include/test_perf.h index 2f215b5f68ee..94902f71a5f6 100644 --- a/tests/cpp/include/test_perf.h +++ b/tests/cpp/include/test_perf.h @@ -125,7 +125,6 @@ class TimedScope { inline void stop() { stopTime_ = getMicroTickCount(); - ; } inline float elapsedMilliseconds() const { diff --git a/tests/cpp/include/test_util.h b/tests/cpp/include/test_util.h index 9b495388955c..48e3971a88be 100644 --- a/tests/cpp/include/test_util.h +++ b/tests/cpp/include/test_util.h @@ -444,7 +444,6 @@ inline StreamType& print_blob_(const RunContext& ctx, break; } else { os << " |" << std::flush; - ; } } if (r < height - 1) { diff --git a/tests/cpp/operator/batchnorm_test.cc b/tests/cpp/operator/batchnorm_test.cc index 39d039c0b55c..55b7f421ce79 100644 --- a/tests/cpp/operator/batchnorm_test.cc +++ b/tests/cpp/operator/batchnorm_test.cc @@ -950,8 +950,6 @@ static void timingTest(const std::string& label, timing += info.executor_->GetTiming(); } } - while (false) - ; timing.print(&std::cout, label); std::cout << std::endl << std::flush; From ff48249586aaabda429a4ccf21e08fd9667af4f2 Mon Sep 17 
00:00:00 2001 From: mozga-intel Date: Fri, 5 Nov 2021 08:25:18 +0100 Subject: [PATCH 10/10] Sanity-cpp fix part2 --- src/operator/tensor/square_sum.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/tensor/square_sum.cu b/src/operator/tensor/square_sum.cu index 1cb27c1b9f0c..e27e62d03e2a 100644 --- a/src/operator/tensor/square_sum.cu +++ b/src/operator/tensor/square_sum.cu @@ -29,7 +29,7 @@ namespace op { // clang-format off template <> void CheckSameIdx<gpu>(const OpContext& ctx, const TBlob& ograd_row_idx, const TBlob& in_row_idx) { -// clang-format on + // clang-format on MSHADOW_IDX_TYPE_SWITCH(ograd_row_idx.type_flag_, IType, {