From c1868633c711b0b495cf9e956e94a5dc47d8a823 Mon Sep 17 00:00:00 2001 From: zixuanweeei Date: Fri, 19 Jul 2019 18:01:05 +0800 Subject: [PATCH 01/24] MKLDNN LBR-GRU Integration --- src/operator/nn/mkldnn/mkldnn_rnn_impl.h | 381 +++++++++++++---------- src/operator/rnn-inl.h | 28 +- src/operator/rnn.cc | 141 +++++---- 3 files changed, 293 insertions(+), 257 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h index ea8e07ea617c..2db46ea84fc7 100644 --- a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h +++ b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h @@ -39,6 +39,26 @@ namespace mxnet { namespace op { +struct MKLDNNRNNMemory { + std::vector concat_weight_memory; + std::vector concat_iter_memory; + std::vector x_memory; + std::vector hcx_memory; + std::vector wx_memory; + std::vector wh_memory; + std::vector bias_memory; + std::vector y_memory; + std::vector hcy_memory; + std::vector uni_states_memory; + std::vector concat_states_memory; + std::vector weight_layer_mems; + std::vector weight_iter_mems; + mkldnn::memory user_src_layer_memory_l; + + MKLDNNRNNMemory() : user_src_layer_memory_l( + null_memory(CpuEngine::Get()->get_engine())) {} +}; + static algorithm GetMKLDNNRNNAlgo(int mode, int* ngates, int* nstates) { @@ -52,7 +72,7 @@ static algorithm GetMKLDNNRNNAlgo(int mode, case rnn_enum::kGru: *ngates = 3; *nstates = 1; - algo = algorithm::vanilla_gru; + algo = algorithm::gru_linear_before_reset; break; case rnn_enum::kRnnRelu: case rnn_enum::kRnnTanh: @@ -73,35 +93,48 @@ static void ConcatData(mkldnn::memory::format src_format, mkldnn::memory::dims dst_cds, mkldnn::memory::data_type mkldnn_dtype, int concat_dimension, - std::vector srcs_data, - const mkldnn::memory &dst) { + const std::vector &srcs_data, + const mkldnn::memory &dst, + std::vector *tmp_src_mems) { auto cpu_engine = CpuEngine::Get()->get_engine(); std::vector srcs_pd; - std::vector srcs; + bool initialized = tmp_src_mems->size() > 0; for (size_t i = 0; i < srcs_cds.size(); i++) { auto desc = mkldnn::memory::desc(srcs_cds[i], mkldnn_dtype, src_format); auto mpd = mkldnn::memory::primitive_desc(desc, cpu_engine); - auto src_memory = mkldnn::memory(mpd, srcs_data[i]); srcs_pd.push_back(mpd); - srcs.push_back(src_memory); - } - std::vector inputs; - for (size_t i = 0; i < srcs_cds.size(); i++) { - inputs.push_back(srcs[i]); + if (initialized) { + tmp_src_mems->at(i).set_data_handle(srcs_data[i]); + } else { + auto src_memory = mkldnn::memory(mpd, srcs_data[i]); + tmp_src_mems->push_back(src_memory); + } } + std::vector inputs(tmp_src_mems->begin(), tmp_src_mems->end()); auto dst_desc = mkldnn::memory::desc(dst_cds, mkldnn_dtype, dst_format); auto concat_pd = concat::primitive_desc(dst_desc, concat_dimension, srcs_pd); MKLDNNStream::Get()->RegisterPrim(concat(concat_pd, inputs, dst)); - MKLDNNStream::Get()->Submit(); } -// cached mkldnn memory -// first layer wx, wh with next L - 1 layers wx and wh -// with L layers hx and cx, src and dst data/iter etc. -// it will prepare memory on before and after reorder and concat. -// for unidirectional, it will fused as dim like 1 + (L - 1) when I != H. -// for bidirectional, it will fused as data + back_data (weight, bias, iter etc), -// also need to identify first layer and next layers +/** + * Size of cached memory + * + * Cache memory of wx, wh from the first layer and next L - 1 layers + * seperately, as well as the layer and iter memory for src and dst. + * Output states memory hx, hc and bias memory are also cached. 
It + * will prepare memory on before and after reorder and concat. For + * unidirectional, it will fused as dim like 1 + (L - 1) when I != H. + * For bidirectional, it will fused as data + back_data (weight, bias, + * iter etc) + * + * @param L Number of Layers + * @param D Direction of the RNN implement. It should be 1 or 2. + * @param T The maximum sequence length. + * @param N Batch size. + * @param I Input channel. Also the dimension of the input feature. + * @param H Hidden state size. + * @return The required cache size. + */ static size_t GetMKLDNNRNNCacheMemorySize(int L, int D, int T, @@ -118,7 +151,7 @@ static size_t GetMKLDNNRNNCacheMemorySize(int L, break; case rnn_enum::kGru: size = 2 * (D * (I + H) * 3 * H + (L - 1) * D * (D * H + H) * 3 * H + - L * D * 2 * N * H) + T * N * D * H + L * 2 * D * 3 * H + (L + 2) * D * 2 * N * H + + L * D * 2 * N * H) + T * N * D * H + L * 2 * D * 4 * H + (L + 2) * D * 2 * N * H + 6 * D * (I + H + 2) * 3 * H + T * N * I * 2; break; case rnn_enum::kRnnRelu: @@ -177,7 +210,6 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, const int I, const int H, DType* x_ptr, - mkldnn::memory *user_src_layer_memory, DType* hx_ptr, DType* cx_ptr, DType* w_ptr, @@ -185,15 +217,7 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, DType* y_ptr, DType* hy_ptr, DType* cy_ptr, - std::vector *concat_weight_memory, - std::vector *concat_iter_memory, - std::vector *x_memory, - std::vector *hcx_memory, - std::vector *wx_memory, - std::vector *wh_memory, - std::vector *bias_memory, - std::vector *y_memory, - std::vector *hcy_memory, + MKLDNNRNNMemory *mkldnn_mems, std::vector *rnn_forward_prim, int layer_index, bool *has_cache, @@ -203,16 +227,17 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, int mode) { int ngates = 0, nstates = 0; algorithm nalgorithm = GetMKLDNNRNNAlgo(mode, &ngates, &nstates); + const int nbias = mode == rnn_enum::kGru ? 
ngates + 1 : ngates; mkldnn::memory::data_type mkldnn_dtype = get_mkldnn_type(dtype); const int single_cell_size = N * H; - const int single_b_size = ngates * H; + const int mx_single_b_sz = ngates * H; DType* wx = w_ptr; // ngates * H, I DType* wh = w_ptr + I * H * ngates; // ngates * H, H DType* back_wx = w_ptr + ngates * H * (I + H); DType* back_wh = back_wx + I * H * ngates; DType* bx = b_ptr; DType* bh = b_ptr + H * ngates; - DType* back_bx = b_ptr + single_b_size * 2; + DType* back_bx = b_ptr + mx_single_b_sz * 2; DType* back_bh = back_bx + H * ngates; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); auto cpu_engine = CpuEngine::Get()->get_engine(); @@ -225,54 +250,76 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, mkldnn::memory::dims weights_layer_r_tz = {1, 1, I, ngates, H}; // ldigo for reorder mkldnn::memory::dims weights_iter_tz = {1, 2, H, ngates, H}; // ldigo mkldnn::memory::dims weights_iter_r_tz = {1, 1, H, ngates, H}; // ldigo for reorder - mkldnn::memory::dims bias_tz = {1, 2, ngates, H}; + mkldnn::memory::dims bias_tz = {1, 2, nbias, H}; // ldgo mkldnn::memory::dims src_iter_tz = {1, 2, nstates, N, H}; // ldsnc mkldnn::memory::dims dst_iter_tz = {1, 2, nstates, N, H}; // ldsnc - if (!initialized) { + bool has_adjusted = false; + if (!initialized || is_train) { if (mode == rnn_enum::kGru) { AdjustGruWeightGateOrder(wx, I, H); AdjustGruWeightGateOrder(back_wx, I, H); AdjustGruWeightGateOrder(wh, H, H); AdjustGruWeightGateOrder(back_wh, H, H); - AdjustGruBiasGateOrder(bx, H); - AdjustGruBiasGateOrder(back_bx, H); - AdjustGruBiasGateOrder(bh, H); - AdjustGruBiasGateOrder(back_bh, H); + has_adjusted = true; } - auto src_wx = (*concat_weight_memory)[2 * layer_index]; - auto src_wh = (*concat_weight_memory)[2 * layer_index + 1]; + auto src_wx = mkldnn_mems->concat_weight_memory[2 * layer_index]; + auto src_wh = mkldnn_mems->concat_weight_memory[2 * layer_index + 1]; std::vector srcs_data1; srcs_data1.push_back(wx); srcs_data1.push_back(back_wx); ConcatData(mkldnn::memory::format::ldgoi, mkldnn::memory::format::ldgoi, {weights_layer_r_tz, weights_layer_r_tz}, weights_layer_tz, - mkldnn_dtype, 1, srcs_data1, src_wx); + mkldnn_dtype, 1, srcs_data1, src_wx, &(mkldnn_mems->weight_layer_mems)); srcs_data1.clear(); srcs_data1.push_back(wh); srcs_data1.push_back(back_wh); ConcatData(mkldnn::memory::format::ldgoi, mkldnn::memory::format::ldgoi, {weights_iter_r_tz, weights_iter_r_tz}, weights_iter_tz, - mkldnn_dtype, 1, srcs_data1, src_wh); + mkldnn_dtype, 1, srcs_data1, src_wh, &(mkldnn_mems->weight_iter_mems)); int tmpvalue = 0; if (lvalue > 0) { tmpvalue = lvalue + 1; } - MKLDNNStream::Get()->RegisterPrim(reorder(src_wx, (*wx_memory)[tmpvalue])); - MKLDNNStream::Get()->RegisterPrim(reorder(src_wh, (*wh_memory)[tmpvalue])); + MKLDNNStream::Get()->RegisterPrim(reorder(src_wx, mkldnn_mems->wx_memory[tmpvalue])); + MKLDNNStream::Get()->RegisterPrim(reorder(src_wh, mkldnn_mems->wh_memory[tmpvalue])); DType* user_bias = reinterpret_cast - ((*bias_memory)[tmpvalue].get_data_handle()); - #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < single_b_size; j++) { - user_bias[j] = bx[j] + bh[j]; - user_bias[single_b_size + j] = back_bx[j] + back_bh[j]; + (mkldnn_mems->bias_memory[tmpvalue].get_data_handle()); + if (mode == rnn_enum::kGru) { + // While mxnet gru gate order is reset, update and new gates, + // mkldnn gru gate order is update, reset and new gates. 
So + // we need to swap the order of reset and update from mxnet. + const index_t single_b_sz = nbias * H; + #pragma omp parallel for num_threads(omp_threads) + for (int j = 0; j < H; j++) { + user_bias[j + H] = bx[j] + bh[j]; + user_bias[single_b_sz + j + H] = back_bx[j] + back_bh[j]; + user_bias[j] = bx[j + H] + bh[j + H]; + user_bias[single_b_sz + j] = back_bx[j + H] + back_bh[j + H]; + } + #pragma omp parallel for num_threads(omp_threads) + for (int j = 2 * H; j < 3 * H; j++) { + user_bias[j] = bx[j]; + user_bias[j + H] = bh[j]; + user_bias[single_b_sz + j] = back_bx[j]; + user_bias[single_b_sz + j + H] = back_bh[j]; + } + } else { + #pragma omp parallel for num_threads(omp_threads) + for (int j = 0; j < mx_single_b_sz; j++) { + user_bias[j] = bx[j] + bh[j]; + user_bias[mx_single_b_sz + j] = back_bx[j] + back_bh[j]; + } } } if (lvalue > 0) { - (*wx_memory)[layer_index].set_data_handle((*wx_memory)[lvalue + 1].get_data_handle()); - (*wh_memory)[layer_index].set_data_handle((*wh_memory)[lvalue + 1].get_data_handle()); - (*bias_memory)[layer_index].set_data_handle((*bias_memory)[lvalue + 1].get_data_handle()); + mkldnn_mems->wx_memory[layer_index].set_data_handle( + mkldnn_mems->wx_memory[lvalue + 1].get_data_handle()); + mkldnn_mems->wh_memory[layer_index].set_data_handle( + mkldnn_mems->wh_memory[lvalue + 1].get_data_handle()); + mkldnn_mems->bias_memory[layer_index].set_data_handle( + mkldnn_mems->bias_memory[lvalue + 1].get_data_handle()); } auto src_layer_md = mkldnn::memory::desc( @@ -290,32 +337,32 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, auto bias_md = mkldnn::memory::desc({bias_tz}, mkldnn_dtype, mkldnn::memory::format::ldgo); - auto user_src_iter_memory = (*concat_iter_memory)[2]; + auto user_src_iter_memory = mkldnn_mems->concat_iter_memory[2]; if (mode == rnn_enum::kLstm) { std::vector srcs_data1; srcs_data1.push_back(hx_ptr); srcs_data1.push_back(cx_ptr); - auto tmp1_src_iter_memory = (*concat_iter_memory)[0]; + auto tmp1_src_iter_memory = mkldnn_mems->concat_iter_memory[0]; ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, {{1, 1, 1, N, H}, {1, 1, 1, N, H}}, {1, 1, nstates, N, H}, mkldnn_dtype, 2, - srcs_data1, tmp1_src_iter_memory); + srcs_data1, tmp1_src_iter_memory, &(mkldnn_mems->uni_states_memory)); std::vector srcs_data2; srcs_data2.push_back(hx_ptr + single_cell_size); srcs_data2.push_back(cx_ptr + single_cell_size); - auto tmp2_src_iter_memory = (*concat_iter_memory)[1]; + auto tmp2_src_iter_memory = mkldnn_mems->concat_iter_memory[1]; ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, {{1, 1, 1, N, H}, {1, 1, 1, N, H}}, {1, 1, nstates, N, H}, mkldnn_dtype, 2, - srcs_data2, tmp2_src_iter_memory); + srcs_data2, tmp2_src_iter_memory, &(mkldnn_mems->uni_states_memory)); std::vector srcs_data3; srcs_data3.push_back(reinterpret_cast(tmp1_src_iter_memory.get_data_handle())); srcs_data3.push_back(reinterpret_cast(tmp2_src_iter_memory.get_data_handle())); ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, {{1, 1, nstates, N, H}, {1, 1, nstates, N, H}}, {1, 2, nstates, N, H}, - mkldnn_dtype, 1, srcs_data3, user_src_iter_memory); + mkldnn_dtype, 1, srcs_data3, user_src_iter_memory, &(mkldnn_mems->concat_states_memory)); } else { user_src_iter_memory.set_data_handle(hx_ptr); } - (*hcx_memory)[layer_index].set_data_handle(user_src_iter_memory.get_data_handle()); + mkldnn_mems->hcx_memory[layer_index].set_data_handle(user_src_iter_memory.get_data_handle()); rnn_cell::desc 
rnn_cell(nalgorithm, mode == rnn_enum::kRnnRelu ? algorithm::eltwise_relu : algorithm::eltwise_tanh); @@ -329,25 +376,25 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, = rnn_forward::primitive_desc(layer_desc, cpu_engine); if (x_ptr && layer_index == 0) { - (*x_memory)[layer_index].set_data_handle(x_ptr); + mkldnn_mems->x_memory[layer_index].set_data_handle(x_ptr); } else { - (*x_memory)[layer_index].set_data_handle((*user_src_layer_memory).get_data_handle()); + mkldnn_mems->x_memory[layer_index].set_data_handle( + mkldnn_mems->user_src_layer_memory_l.get_data_handle()); } - (*y_memory)[layer_index].set_data_handle(y_ptr); - + mkldnn_mems->y_memory[layer_index].set_data_handle(y_ptr); if (rnn_forward_prim->size() <= (size_t)layer_index) { - primitive rnn_prim = rnn_forward(prim_desc, (*x_memory)[layer_index], - (*hcx_memory)[layer_index], (*wx_memory)[layer_index], - (*wh_memory)[layer_index], (*bias_memory)[layer_index], - (*y_memory)[layer_index], - (*hcy_memory)[layer_index], null_memory_); + primitive rnn_prim = rnn_forward(prim_desc, mkldnn_mems->x_memory[layer_index], + mkldnn_mems->hcx_memory[layer_index], mkldnn_mems->wx_memory[layer_index], + mkldnn_mems->wh_memory[layer_index], mkldnn_mems->bias_memory[layer_index], + mkldnn_mems->y_memory[layer_index], + mkldnn_mems->hcy_memory[layer_index], null_memory_); rnn_forward_prim->push_back(rnn_prim); } MKLDNNStream::Get()->RegisterPrim((*rnn_forward_prim)[layer_index]); MKLDNNStream::Get()->Submit(); if (state_outputs) { - DType* dst_hcy = reinterpret_cast ((*hcy_memory)[layer_index].get_data_handle()); + DType* dst_hcy = reinterpret_cast(mkldnn_mems->hcy_memory[layer_index].get_data_handle()); if (mode == rnn_enum::kLstm) { offset1 = nstates * single_cell_size; offset2 = (nstates + 1) * single_cell_size; @@ -365,6 +412,12 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, } } } + if (has_adjusted) { + AdjustGruWeightGateOrder(wx, I, H); + AdjustGruWeightGateOrder(back_wx, I, H); + AdjustGruWeightGateOrder(wh, H, H); + AdjustGruWeightGateOrder(back_wh, H, H); + } } @@ -376,7 +429,6 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, const int I, const int H, DType* x_ptr, - mkldnn::memory *user_src_layer_memory, DType* hx_ptr, DType* cx_ptr, DType* w_ptr, @@ -384,15 +436,7 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, DType* y_ptr, DType* hy_ptr, DType* cy_ptr, - std::vector *concat_weight_memory, - std::vector *concat_iter_memory, - std::vector *x_memory, - std::vector *hcx_memory, - std::vector *wx_memory, - std::vector *wh_memory, - std::vector *bias_memory, - std::vector *y_memory, - std::vector *hcy_memory, + MKLDNNRNNMemory *mkldnn_mems, std::vector *rnn_forward_prim, int layer_index, bool *has_cache, @@ -401,10 +445,11 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, int mode) { int ngates = 0, nstates = 0; algorithm nalgorithm = GetMKLDNNRNNAlgo(mode, &ngates, &nstates); + const int nbias = (mode == rnn_enum::kGru ? 
ngates + 1 : ngates); mkldnn::memory::data_type mkldnn_dtype = get_mkldnn_type(dtype); const int cell_size = N * H; const int single_cell_size = N * H; - const int single_b_size = ngates * H; + const int single_b_size = nbias * H; int w_size = (I + H) * H * ngates; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); auto cpu_engine = CpuEngine::Get()->get_engine(); @@ -416,7 +461,7 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, mkldnn::memory::dims dst_layer_tz = {T, N, H}; mkldnn::memory::dims weights_layer_tz = {L, 1, I, ngates, H}; // ldigo mkldnn::memory::dims weights_iter_tz = {L, 1, H, ngates, H}; // ldigo - mkldnn::memory::dims bias_tz = {L, 1, ngates, H}; + mkldnn::memory::dims bias_tz = {L, 1, nbias, H}; // ldgo mkldnn::memory::dims src_iter_tz = {L, 1, nstates, N, H}; // ldsnc mkldnn::memory::dims dst_iter_tz = {L, 1, nstates, N, H}; // ldsnc mkldnn::memory::dims weights_layer_r_tz = {1, 1, I, ngates, H}; // ldigo for reorder @@ -442,12 +487,12 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, std::vector srcs_data; srcs_data.push_back(hx_ptr); srcs_data.push_back(cx_ptr); - auto tmp_src_iter_memory = (*concat_iter_memory)[l + layer_index]; + auto tmp_src_iter_memory = mkldnn_mems->concat_iter_memory[l + layer_index]; ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, {{1, 1, 1, N, H}, {1, 1, 1, N, H}}, {1, 1, nstates, N, H}, mkldnn_dtype, - 2, srcs_data, tmp_src_iter_memory); + 2, srcs_data, tmp_src_iter_memory, &(mkldnn_mems->uni_states_memory)); } else { - (*concat_iter_memory)[l + layer_index].set_data_handle(hx_ptr); + mkldnn_mems->concat_iter_memory[l + layer_index].set_data_handle(hx_ptr); } hx_ptr += cell_size; if (mode == rnn_enum::kLstm) { @@ -457,71 +502,95 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, auto user_src_iter_memory = null_memory_; if (L == 1) { - user_src_iter_memory = (*concat_iter_memory)[layer_index]; + user_src_iter_memory = mkldnn_mems->concat_iter_memory[layer_index]; } else { - user_src_iter_memory = (*concat_iter_memory)[L + layer_index]; + user_src_iter_memory = mkldnn_mems->concat_iter_memory[L + layer_index]; std::vector src_l_data; std::vector src_l_dim; for (int l = 0; l < L; l++) { src_l_data.push_back(reinterpret_cast - ((*concat_iter_memory)[l + layer_index].get_data_handle())); + (mkldnn_mems->concat_iter_memory[l + layer_index].get_data_handle())); src_l_dim.push_back({1, 1, nstates, N, H}); } ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, src_l_dim, - {L, 1, nstates, N, H}, mkldnn_dtype, 0, src_l_data, user_src_iter_memory); + {L, 1, nstates, N, H}, mkldnn_dtype, 0, src_l_data, user_src_iter_memory, + &(mkldnn_mems->concat_states_memory)); } - (*hcx_memory)[layer_index].set_data_handle(user_src_iter_memory.get_data_handle()); + mkldnn_mems->hcx_memory[layer_index].set_data_handle(user_src_iter_memory.get_data_handle()); - auto src_wx_f = (*concat_weight_memory)[2 * layer_index]; - auto src_wh_f = (*concat_weight_memory)[2 * layer_index + 1]; + auto src_wx_f = mkldnn_mems->concat_weight_memory[2 * layer_index]; + auto src_wh_f = mkldnn_mems->concat_weight_memory[2 * layer_index + 1]; std::vector srcs_data_x; std::vector srcs_data_h; std::vector src_l_dim_x; std::vector src_l_dim_h; + + bool has_adjusted = false; if (!initialized) { if (L == 1) { DType* wx = w_ptr; - DType* wh = w_ptr + I * H * ngates; + DType* wh = wx + I * H * ngates; if (mode == rnn_enum::kGru) { AdjustGruWeightGateOrder(wx, I, H); AdjustGruWeightGateOrder(wh, 
H, H); - AdjustGruBiasGateOrder(b_ptr, H); - AdjustGruBiasGateOrder(b_ptr + H * ngates, H); + has_adjusted = true; } src_wx_f.set_data_handle(wx); src_wh_f.set_data_handle(wh); } else { for (int l = 0; l < L; l++) { - DType* wx = w_ptr; - DType* wh = w_ptr + I * H * ngates; - DType* bx = b_ptr + l * ngates * H * 2; - DType* bh = b_ptr + l * ngates * H * 2 + H * ngates; + DType* wx = w_ptr + l * w_size; + DType* wh = wx + I * H * ngates; if (mode == rnn_enum::kGru) { AdjustGruWeightGateOrder(wx, I, H); AdjustGruWeightGateOrder(wh, H, H); - AdjustGruBiasGateOrder(bx, H); - AdjustGruBiasGateOrder(bh, H); + has_adjusted = true; } srcs_data_x.push_back(wx); srcs_data_h.push_back(wh); src_l_dim_x.push_back(weights_layer_r_tz); src_l_dim_h.push_back(weights_iter_r_tz); - w_ptr = w_ptr + w_size; } ConcatData(mkldnn::memory::format::ldgoi, mkldnn::memory::format::ldgoi, - src_l_dim_x, weights_layer_tz, mkldnn_dtype, 0, srcs_data_x, src_wx_f); + src_l_dim_x, weights_layer_tz, mkldnn_dtype, 0, srcs_data_x, src_wx_f, + &(mkldnn_mems->weight_layer_mems)); ConcatData(mkldnn::memory::format::ldgoi, mkldnn::memory::format::ldgoi, - src_l_dim_h, weights_iter_tz, mkldnn_dtype, 0, srcs_data_h, src_wh_f); + src_l_dim_h, weights_iter_tz, mkldnn_dtype, 0, srcs_data_h, src_wh_f, + &(mkldnn_mems->weight_iter_mems)); } - MKLDNNStream::Get()->RegisterPrim(reorder(src_wx_f, (*wx_memory)[layer_index])); - MKLDNNStream::Get()->RegisterPrim(reorder(src_wh_f, (*wh_memory)[layer_index])); - - DType* user_bias_f = reinterpret_cast ((*bias_memory)[layer_index].get_data_handle()); - #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < L * single_b_size; j++) { - int k = j / single_b_size; - user_bias_f[j] = b_ptr[j + k * single_b_size] + b_ptr[j + k * single_b_size + single_b_size]; + MKLDNNStream::Get()->RegisterPrim(reorder(src_wx_f, mkldnn_mems->wx_memory[layer_index])); + MKLDNNStream::Get()->RegisterPrim(reorder(src_wh_f, mkldnn_mems->wh_memory[layer_index])); + + DType* user_bias_f = reinterpret_cast(mkldnn_mems->bias_memory[layer_index].get_data_handle()); + if (mode == rnn_enum::kGru) { + const int mx_single_b_sz = ngates * H; + for (int l = 0; l < L; l++) { + #pragma omp parallel for num_threads(omp_threads) + for (int g = 0; g < H; g++) { + // While mxnet gru gate order is reset, update and new gates, + // mkldnn gru gate order is update, reset and new gates. So + // we need to swap the order of reset and update from mxnet. 
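+          // The reset and update biases can simply be summed (bx + bh).
+          // The two new-gate biases are handled separately in the next loop:
+          // with the linear-before-reset formulation the hidden-side new-gate
+          // bias is applied inside the reset product, so mkldnn keeps it in an
+          // extra bias slot (hence nbias = ngates + 1).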
+ user_bias_f[g + H + l * single_b_size] = + b_ptr[g + l * mx_single_b_sz * 2] + + b_ptr[g + l * mx_single_b_sz * 2 + mx_single_b_sz]; + user_bias_f[g + l * single_b_size] = + b_ptr[g + H + l * mx_single_b_sz * 2] + + b_ptr[g + H + l * mx_single_b_sz * 2 + mx_single_b_sz]; + } + #pragma omp parallel for num_threads(omp_threads) + for (int g = 2 * H; g < 3 * H; g++) { + user_bias_f[g + l * single_b_size] = b_ptr[g + l * mx_single_b_sz * 2]; + user_bias_f[g + l * single_b_size + H] = + b_ptr[g + l * mx_single_b_sz * 2 + mx_single_b_sz]; + } + } + } else { + #pragma omp parallel for num_threads(omp_threads) + for (int j = 0; j < L * single_b_size; j++) { + int k = j / single_b_size; + user_bias_f[j] = b_ptr[j + k * single_b_size] + b_ptr[j + k * single_b_size + single_b_size]; + } } } @@ -537,25 +606,25 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, = rnn_forward::primitive_desc(layer_desc, cpu_engine); if (x_ptr && layer_index == 0) { - (*x_memory)[layer_index].set_data_handle(x_ptr); + mkldnn_mems->x_memory[layer_index].set_data_handle(x_ptr); } else { - (*x_memory)[layer_index].set_data_handle((*user_src_layer_memory).get_data_handle()); + mkldnn_mems->x_memory[layer_index].set_data_handle( + mkldnn_mems->user_src_layer_memory_l.get_data_handle()); } - (*y_memory)[layer_index].set_data_handle(y_ptr); - + mkldnn_mems->y_memory[layer_index].set_data_handle(y_ptr); if (rnn_forward_prim->size() <= (size_t)layer_index) { - primitive rnn_prim = rnn_forward(prim_desc, (*x_memory)[layer_index], - (*hcx_memory)[layer_index], (*wx_memory)[layer_index], - (*wh_memory)[layer_index], (*bias_memory)[layer_index], - (*y_memory)[layer_index], - (*hcy_memory)[layer_index], null_memory_); + primitive rnn_prim = rnn_forward(prim_desc, mkldnn_mems->x_memory[layer_index], + mkldnn_mems->hcx_memory[layer_index], mkldnn_mems->wx_memory[layer_index], + mkldnn_mems->wh_memory[layer_index], mkldnn_mems->bias_memory[layer_index], + mkldnn_mems->y_memory[layer_index], + mkldnn_mems->hcy_memory[layer_index], null_memory_); rnn_forward_prim->push_back(rnn_prim); } MKLDNNStream::Get()->RegisterPrim((*rnn_forward_prim)[layer_index]); MKLDNNStream::Get()->Submit(); if (state_outputs) { - DType* dst_hcy = reinterpret_cast ((*hcy_memory)[layer_index].get_data_handle()); + DType* dst_hcy = reinterpret_cast(mkldnn_mems->hcy_memory[layer_index].get_data_handle()); if (mode == rnn_enum::kLstm) { for (int l = 0; l < L; l++) { offset1 = l * single_cell_size; @@ -573,6 +642,14 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, } } } + if (has_adjusted) { + for (int l = 0; l < L; l++) { + DType* wx = w_ptr + l * w_size; + DType* wh = wx + I * H * ngates; + AdjustGruWeightGateOrder(wx, I, H); + AdjustGruWeightGateOrder(wh, H, H); + } + } } template @@ -591,15 +668,7 @@ static void MKLDNNRNNForward(bool state_outputs, DType* y_ptr, DType* hy_ptr, DType* cy_ptr, - std::vector *concat_weight_memory, - std::vector *concat_iter_memory, - std::vector *x_memory, - std::vector *hcx_memory, - std::vector *wx_memory, - std::vector *wh_memory, - std::vector *bias_memory, - std::vector *y_memory, - std::vector *hcy_memory, + MKLDNNRNNMemory *mkldnn_mems, std::vector *rnn_forward_prim, bool *has_cache, int dtype, @@ -611,33 +680,27 @@ static void MKLDNNRNNForward(bool state_outputs, const int cell_size = N * H * D; // First layer int w_size = (I + H) * H * ngates * D; - auto cpu_engine = CpuEngine::Get()->get_engine(); - auto null_memory_ = null_memory(cpu_engine); DType* tmpNull = NULL; // when D = 1 and I == H, L layers can be 
fused together if (D == 1 && I == H) { - MKLDNNRNNForwardUnidi(state_outputs, L, T, N, I, H, x_ptr, &null_memory_, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, concat_weight_memory, - concat_iter_memory, x_memory, hcx_memory, wx_memory, wh_memory, - bias_memory, y_memory, hcy_memory, rnn_forward_prim, + MKLDNNRNNForwardUnidi(state_outputs, L, T, N, I, H, x_ptr, + hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, + mkldnn_mems, rnn_forward_prim, 0, has_cache, dtype, is_train, mode); } else { - auto user_src_layer_memory_l = null_memory_; if (D == 2) { - MKLDNNRNNForwardSingleLayerBi(state_outputs, T, N, I, H, x_ptr, &user_src_layer_memory_l, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, concat_weight_memory, - concat_iter_memory, x_memory, hcx_memory, wx_memory, wh_memory, - bias_memory, y_memory, hcy_memory, rnn_forward_prim, + MKLDNNRNNForwardSingleLayerBi(state_outputs, T, N, I, H, x_ptr, + hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, + mkldnn_mems, rnn_forward_prim, 0, has_cache, 0, dtype, is_train, mode); } else { - MKLDNNRNNForwardUnidi(state_outputs, 1, T, N, I, H, x_ptr, &user_src_layer_memory_l, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, concat_weight_memory, - concat_iter_memory, x_memory, hcx_memory, wx_memory, wh_memory, - bias_memory, y_memory, hcy_memory, rnn_forward_prim, + MKLDNNRNNForwardUnidi(state_outputs, 1, T, N, I, H, x_ptr, + hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, + mkldnn_mems, rnn_forward_prim, 0, has_cache, dtype, is_train, mode); } if (L > 1) { - user_src_layer_memory_l = (*y_memory)[0]; + mkldnn_mems->user_src_layer_memory_l = mkldnn_mems->y_memory[0]; // go to next L - 1 layers. // If D = 2, do it layer by layer. If D = 1, fused L - 1 layers w_ptr += w_size; @@ -656,12 +719,10 @@ static void MKLDNNRNNForward(bool state_outputs, cx_ptr += cell_size; } MKLDNNRNNForwardSingleLayerBi(state_outputs, T, N, D * H, H, tmpNull, - &user_src_layer_memory_l, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, - cy_ptr, concat_weight_memory, concat_iter_memory, x_memory, - hcx_memory, wx_memory, wh_memory, bias_memory, - y_memory, hcy_memory, rnn_forward_prim, + hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, + cy_ptr, mkldnn_mems, rnn_forward_prim, 1, has_cache, l + 1, dtype, is_train, mode); - user_src_layer_memory_l = (*y_memory)[1]; + mkldnn_mems->user_src_layer_memory_l = mkldnn_mems->y_memory[1]; w_ptr += w_size; b_ptr += b_size; } @@ -674,10 +735,8 @@ static void MKLDNNRNNForward(bool state_outputs, } } w_size = (H + H) * H * ngates; - MKLDNNRNNForwardUnidi(state_outputs, L - 1, T, N, H, H, tmpNull, &user_src_layer_memory_l, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, concat_weight_memory, - concat_iter_memory, x_memory, hcx_memory, wx_memory, - wh_memory, bias_memory, y_memory, hcy_memory, + MKLDNNRNNForwardUnidi(state_outputs, L - 1, T, N, H, H, tmpNull, + hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, mkldnn_mems, rnn_forward_prim, 1, has_cache, dtype, is_train, mode); } } @@ -701,15 +760,7 @@ static void MKLDNNRNNForwardInference(bool state_outputs, DType* y_ptr, DType* hy_ptr, DType* cy_ptr, - std::vector* concat_weight_memory, - std::vector* concat_iter_memory, - std::vector *x_memory, - std::vector *hcx_memory, - std::vector *wx_memory, - std::vector *wh_memory, - std::vector *bias_memory, - std::vector *y_memory, - std::vector *hcy_memory, + MKLDNNRNNMemory *mkldnn_mems, std::vector *rnn_forward_prim, bool *has_cache, int dtype, @@ -723,9 +774,7 @@ static void MKLDNNRNNForwardInference(bool 
state_outputs, MKLDNNRNNForward(state_outputs, num_layers, direction, seq_length, batch_size, input_size, state_size, x_ptr, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, - concat_weight_memory, concat_iter_memory, x_memory, - hcx_memory, wx_memory, wh_memory, - bias_memory, y_memory, hcy_memory, rnn_forward_prim, + mkldnn_mems, rnn_forward_prim, has_cache, dtype, is_train, mode); break; default: diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 328e28de8537..e3a2bfb6a322 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -397,20 +397,12 @@ class RNNOp { RNNParam param_; Context ctx_; #if MXNET_USE_MKLDNN == 1 - std::vector concat_weight_memory; - std::vector concat_iter_memory; - std::vector rnn_forward_prim; - std::vector x_memory; - std::vector hcx_memory; - std::vector wx_memory; - std::vector wh_memory; - std::vector bias_memory; - std::vector y_memory; - std::vector hcy_memory; bool has_cache; bool init_mem_; size_t reserve_mem_size_; Storage::Handle mem_space_; + MKLDNNRNNMemory mkldnn_mems; + std::vector rnn_forward_prim; #endif explicit RNNOp(RNNParam param, Context ctx) { this->param_ = param; @@ -908,9 +900,7 @@ class RNNOp { param_.mode); } else { #if MXNET_USE_MKLDNN == 1 - if (dmlc::GetEnv("MXNET_USE_MKLDNN_RNN", 1) && param_.mode != rnn_enum::kGru) { - // TODO(zixuanweeei): MKLDNN GRU has precision issue. A stable one - // will be added to MXNet when we figure out the issue. + if (dmlc::GetEnv("MXNET_USE_MKLDNN_RNN", 1)) { int dtype = in_data[rnn_enum::kData].type_flag_; MKLDNNRNNForwardInference(param_.state_outputs, param_.num_layers, @@ -927,15 +917,7 @@ class RNNOp { y.dptr_, hy_ptr, cy_ptr, - &concat_weight_memory, - &concat_iter_memory, - &x_memory, - &hcx_memory, - &wx_memory, - &wh_memory, - &bias_memory, - &y_memory, - &hcy_memory, + &mkldnn_mems, &rnn_forward_prim, &has_cache, dtype, @@ -943,8 +925,6 @@ class RNNOp { param_.mode); } else { #endif - // Before integrating MKLDNN GRU fp32 inference - // using below code for keep func being OK const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, direction, param_.mode); diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 6a0dbd7a4e23..7edcbe5c61a9 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -260,13 +260,14 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, const RNNParam& param = op.param_; int ngates = 0, nstates = 0; GetMKLDNNRNNAlgo(param.mode, &ngates, &nstates); - int D = param.bidirectional ? 2 : 1; + const int D = param.bidirectional ? 2 : 1; Tensor x = in_blobs[rnn_enum::kData].get(s); - int T = x.shape_[0]; - int N = x.shape_[1]; - int I = x.shape_[2]; - int H = param.state_size; - int L = param.num_layers; + const int T = x.shape_[0]; + const int N = x.shape_[1]; + const int I = x.shape_[2]; + const int H = param.state_size; + const int L = param.num_layers; + const int nbias = param.mode == rnn_enum::kGru ? 
ngates + 1 : ngates; const size_t r_size = GetMKLDNNRNNCacheMemorySize(L, D, T, N, I, H, param.mode); if (op.init_mem_ && op.reserve_mem_size_ < r_size) { @@ -281,7 +282,7 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, op.init_mem_ = true; op.has_cache = false; } - if (op.has_cache && op.x_memory.size() == 0) { + if (op.has_cache && op.mkldnn_mems.x_memory.size() == 0) { op.has_cache = false; } @@ -291,16 +292,16 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, mkldnn::memory::dims dst_layer_tz = {T, N, D * H}; auto dst_layer_md = mkldnn::memory::desc( { dst_layer_tz }, mkldnn_dtype, mkldnn::memory::format::tnc); - if (op.x_memory.size() == 0) { + if (op.mkldnn_mems.x_memory.size() == 0) { if (D == 1 && I == H) { auto user_src_layer_md = mkldnn::memory::desc( { src_layer_tz }, mkldnn_dtype, mkldnn::memory::format::tnc); auto user_src_layer_memory_n = mkldnn::memory({ user_src_layer_md, cpu_engine }); - op.x_memory.push_back(user_src_layer_memory_n); + op.mkldnn_mems.x_memory.push_back(user_src_layer_memory_n); mkldnn::memory::dims weights_layer_tz = {L, 1, I, ngates, H}; // ldigo mkldnn::memory::dims weights_iter_tz = {L, 1, H, ngates, H}; // ldigo - mkldnn::memory::dims bias_tz = {L, 1, ngates, H}; + mkldnn::memory::dims bias_tz = {L, 1, nbias, H}; auto user_weight_layer_md = mkldnn::memory::desc( { weights_layer_tz }, mkldnn_dtype, mkldnn::memory::format::ldigo); auto user_weight_iter_md = mkldnn::memory::desc( @@ -310,21 +311,22 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* weight_layer_n = workptr; // L * I * ngates * H auto user_weight_layer_memory_n = mkldnn::memory({ user_weight_layer_md, cpu_engine }, weight_layer_n); - op.wx_memory.push_back(user_weight_layer_memory_n); + op.mkldnn_mems.wx_memory.push_back(user_weight_layer_memory_n); DType* weight_iter_n = weight_layer_n + L * I * ngates * H; // L * H * ngates * H auto user_weight_iter_memory_n = mkldnn::memory({ user_weight_iter_md, cpu_engine }, weight_iter_n); - op.wh_memory.push_back(user_weight_iter_memory_n); + op.mkldnn_mems.wh_memory.push_back(user_weight_iter_memory_n); - DType* bias_n = weight_iter_n + L * H * ngates * H; // L * ngates * H + DType* bias_n = weight_iter_n + L * H * ngates * H; // Generally, L * ngates * H + // LBR-Gru, L * (ngates + 1) * H auto user_bias_memory_n = mkldnn::memory({ user_bias_md, cpu_engine }, bias_n); - op.bias_memory.push_back(user_bias_memory_n); + op.mkldnn_mems.bias_memory.push_back(user_bias_memory_n); auto wx_md_n = mkldnn::memory::desc( { weights_layer_tz }, mkldnn_dtype, mkldnn::memory::format::ldgoi); - DType* wx_n = bias_n + L * ngates * H; // L * ngates * I * H + DType* wx_n = bias_n + L * nbias * H; // L * ngates * I * H auto wx_memory_n = mkldnn::memory({ wx_md_n, cpu_engine }, wx_n); DType* wh_n = wx_n + L * ngates * I * H; // L * ngates * H * H @@ -333,8 +335,8 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, auto wh_memory_n = mkldnn::memory({ wh_md_n, cpu_engine }, wh_n); - op.concat_weight_memory.push_back(wx_memory_n); - op.concat_weight_memory.push_back(wh_memory_n); + op.mkldnn_mems.concat_weight_memory.push_back(wx_memory_n); + op.mkldnn_mems.concat_weight_memory.push_back(wh_memory_n); workptr = wh_n + L * ngates * H * H; mkldnn::memory::dims src_iter_tz_n1 = {1, 1, nstates, N, H}; // ldsnc @@ -344,7 +346,7 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* src_iter_n1 = workptr; // nstates * N * H auto src_iter_memory_n1 = mkldnn::memory({ src_iter_md_n1, 
cpu_engine }, src_iter_n1); - op.concat_iter_memory.push_back(src_iter_memory_n1); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_memory_n1); workptr = src_iter_n1 + nstates * N * H; } mkldnn::memory::dims src_iter_tz_n = {L, 1, nstates, N, H}; // ldsnc @@ -353,12 +355,12 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* src_iter_n = workptr; // L * nstates * N * H auto src_iter_memory_n = mkldnn::memory({ src_iter_md_n, cpu_engine }, src_iter_n); - op.concat_iter_memory.push_back(src_iter_memory_n); - op.hcx_memory.push_back(src_iter_memory_n); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_memory_n); + op.mkldnn_mems.hcx_memory.push_back(src_iter_memory_n); DType* dst_layer_n = src_iter_n + L * nstates * N * H; // T * N * D * H auto dst_layer_memory_n = mkldnn::memory({ dst_layer_md, cpu_engine }, dst_layer_n); - op.y_memory.push_back(dst_layer_memory_n); + op.mkldnn_mems.y_memory.push_back(dst_layer_memory_n); mkldnn::memory::dims dst_iter_tz_n = {L, 1, nstates, N, H}; // ldsnc auto dst_iter_md_n = mkldnn::memory::desc( @@ -366,18 +368,18 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* dst_iter_n = dst_layer_n + T * N * D * H; // L * nstates * N * H auto dst_iter_memory_n = mkldnn::memory({ dst_iter_md_n, cpu_engine }, dst_iter_n); - op.hcy_memory.push_back(dst_iter_memory_n); + op.mkldnn_mems.hcy_memory.push_back(dst_iter_memory_n); workptr = dst_iter_n + L * nstates * N * H; } else { auto user_src_layer_md_0 = mkldnn::memory::desc( { src_layer_tz_0 }, mkldnn_dtype, mkldnn::memory::format::tnc); auto user_src_layer_memory_0 = mkldnn::memory({ user_src_layer_md_0, cpu_engine }); - op.x_memory.push_back(user_src_layer_memory_0); + op.mkldnn_mems.x_memory.push_back(user_src_layer_memory_0); mkldnn::memory::dims weights_layer_tz_0 = {1, D, I, ngates, H}; // ldigo mkldnn::memory::dims weights_iter_tz_0 = {1, D, H, ngates, H}; // ldigo - mkldnn::memory::dims bias_tz_0 = {1, D, ngates, H}; + mkldnn::memory::dims bias_tz_0 = {1, D, nbias, H}; auto user_weight_layer_md_0 = mkldnn::memory::desc( { weights_layer_tz_0 }, mkldnn_dtype, mkldnn::memory::format::ldigo); auto user_weight_iter_md_0 = mkldnn::memory::desc( @@ -388,18 +390,19 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* weight_layer_0 = workptr; // D * I * ngates * H auto user_weight_layer_memory_0 = mkldnn::memory({ user_weight_layer_md_0, cpu_engine }, weight_layer_0); - op.wx_memory.push_back(user_weight_layer_memory_0); + op.mkldnn_mems.wx_memory.push_back(user_weight_layer_memory_0); DType* weight_iter_0 = weight_layer_0 + D * I * ngates * H; // D * H * ngates * H auto user_weight_iter_memory_0 = mkldnn::memory({ user_weight_iter_md_0, cpu_engine }, weight_iter_0); - op.wh_memory.push_back(user_weight_iter_memory_0); + op.mkldnn_mems.wh_memory.push_back(user_weight_iter_memory_0); - DType* bias_0 = weight_iter_0 + D * H * ngates * H; // D * ngates * H + DType* bias_0 = weight_iter_0 + D * H * ngates * H; // Generally, D * ngates * H + // LBR-Gru, D * (ngates + 1) * H auto user_bias_memory_0 = mkldnn::memory({ user_bias_md_0, cpu_engine }, bias_0); - op.bias_memory.push_back(user_bias_memory_0); - workptr = bias_0 + D * ngates * H; + op.mkldnn_mems.bias_memory.push_back(user_bias_memory_0); + workptr = bias_0 + D * nbias * H; auto wx_md_0 = mkldnn::memory::desc( { weights_layer_tz_0 }, mkldnn_dtype, mkldnn::memory::format::ldgoi); @@ -416,8 +419,8 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, 
wh_memory_0.set_data_handle(wh_0); workptr = wh_0 + D * ngates * H * H; } - op.concat_weight_memory.push_back(wx_memory_0); - op.concat_weight_memory.push_back(wh_memory_0); + op.mkldnn_mems.concat_weight_memory.push_back(wx_memory_0); + op.mkldnn_mems.concat_weight_memory.push_back(wh_memory_0); mkldnn::memory::dims src_iter_undi_tz_0 = {1, 1, nstates, N, H}; // ldsnc auto src_iter_undi_md_0 = mkldnn::memory::desc( @@ -425,15 +428,15 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* src_iter_undi_0 = workptr; // nstates * N * H auto src_iter_undi_memory_0 = mkldnn::memory({ src_iter_undi_md_0, cpu_engine }, src_iter_undi_0); - op.concat_iter_memory.push_back(src_iter_undi_memory_0); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_undi_memory_0); workptr = src_iter_undi_0 + nstates * N * H; if (D == 1) { - op.hcx_memory.push_back(src_iter_undi_memory_0); + op.mkldnn_mems.hcx_memory.push_back(src_iter_undi_memory_0); } else { DType* src_iter_undi2_0 = workptr; // nstates * N * H auto src_iter_undi2_memory_0 = mkldnn::memory({ src_iter_undi_md_0, cpu_engine }, src_iter_undi2_0); - op.concat_iter_memory.push_back(src_iter_undi2_memory_0); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_undi2_memory_0); mkldnn::memory::dims src_iter_tz_0 = {1, D, nstates, N, H}; // ldsnc auto src_iter_md_0 = mkldnn::memory::desc( @@ -441,15 +444,15 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* src_iter_0 = src_iter_undi2_0 + nstates * N * H; // D * nstates * N * H auto src_iter_memory_0 = mkldnn::memory({ src_iter_md_0, cpu_engine }, src_iter_0); - op.concat_iter_memory.push_back(src_iter_memory_0); - op.hcx_memory.push_back(src_iter_memory_0); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_memory_0); + op.mkldnn_mems.hcx_memory.push_back(src_iter_memory_0); workptr = src_iter_0 + D * nstates * N * H; } DType* dst_layer_0 = workptr; // T * N * D * H auto dst_layer_memory_0 = mkldnn::memory({ dst_layer_md, cpu_engine }, dst_layer_0); - op.y_memory.push_back(dst_layer_memory_0); + op.mkldnn_mems.y_memory.push_back(dst_layer_memory_0); mkldnn::memory::dims dst_iter_tz_0 = {1, D, nstates, N, H}; // ldsnc auto dst_iter_md_0 = mkldnn::memory::desc( @@ -457,7 +460,7 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* dst_iter_0 = dst_layer_0 + T * N * D * H; // D * nstates * N * H auto dst_iter_memory_0 = mkldnn::memory({ dst_iter_md_0, cpu_engine }, dst_iter_0); - op.hcy_memory.push_back(dst_iter_memory_0); + op.mkldnn_mems.hcy_memory.push_back(dst_iter_memory_0); workptr = dst_iter_0 + D * nstates * N * H; // next L - 1 layers @@ -465,11 +468,11 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, auto user_src_layer_md = mkldnn::memory::desc( { src_layer_tz }, mkldnn_dtype, mkldnn::memory::format::tnc); auto user_src_layer_memory = mkldnn::memory({ user_src_layer_md, cpu_engine }); - op.x_memory.push_back(user_src_layer_memory); + op.mkldnn_mems.x_memory.push_back(user_src_layer_memory); mkldnn::memory::dims weights_layer_tz = {L - 1, 1, H, ngates, H}; // ldigo mkldnn::memory::dims weights_iter_tz = {L - 1, 1, H, ngates, H}; // ldigo - mkldnn::memory::dims bias_tz = {L - 1, 1, ngates, H}; + mkldnn::memory::dims bias_tz = {L - 1, 1, nbias, H}; auto user_weight_layer_md = mkldnn::memory::desc( { weights_layer_tz }, mkldnn_dtype, mkldnn::memory::format::ldigo); auto user_weight_iter_md = mkldnn::memory::desc( @@ -480,22 +483,23 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* 
weight_layer_n = workptr; // (L - 1) * H * ngates * H auto user_weight_layer_memory_n = mkldnn::memory({ user_weight_layer_md, cpu_engine }, weight_layer_n); - op.wx_memory.push_back(user_weight_layer_memory_n); + op.mkldnn_mems.wx_memory.push_back(user_weight_layer_memory_n); DType* weight_iter_n = weight_layer_n + (L - 1) * H * ngates * H; // (L - 1) * H * ngates * H auto user_weight_iter_memory_n = mkldnn::memory({ user_weight_iter_md, cpu_engine }, weight_iter_n); - op.wh_memory.push_back(user_weight_iter_memory_n); + op.mkldnn_mems.wh_memory.push_back(user_weight_iter_memory_n); - DType* bias_n = weight_iter_n + (L - 1) * H * ngates * H; // (L - 1) * ngates * H + DType* bias_n = weight_iter_n + (L - 1) * H * ngates * H; // Generally, (L - 1) * ngates * H + // LBR-Gru, (L -1) * (ngates + 1) * H auto user_bias_memory_n = mkldnn::memory({ user_bias_md, cpu_engine }, bias_n); - op.bias_memory.push_back(user_bias_memory_n); + op.mkldnn_mems.bias_memory.push_back(user_bias_memory_n); auto wx_md_n = mkldnn::memory::desc( { weights_layer_tz }, mkldnn_dtype, mkldnn::memory::format::ldgoi); - DType* wx_n = bias_n + (L - 1) * ngates * H; // (L - 1) * ngates * H * H + DType* wx_n = bias_n + (L - 1) * nbias * H; // (L - 1) * ngates * H * H auto wx_memory_n = mkldnn::memory({ wx_md_n, cpu_engine }, wx_n); DType* wh_n = wx_n + (L - 1) * ngates * H * H; // (L - 1) * ngates * H * H @@ -504,8 +508,8 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, auto wh_memory_n = mkldnn::memory({ wh_md_n, cpu_engine }, wh_n); - op.concat_weight_memory.push_back(wx_memory_n); - op.concat_weight_memory.push_back(wh_memory_n); + op.mkldnn_mems.concat_weight_memory.push_back(wx_memory_n); + op.mkldnn_mems.concat_weight_memory.push_back(wh_memory_n); workptr = wh_n + (L - 1) * ngates * H * H; mkldnn::memory::dims src_iter_tz_n1 = {1, 1, nstates, N, H}; // ldsnc @@ -515,7 +519,7 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* src_iter_n1 = workptr; // nstates * N * H auto src_iter_memory_n1 = mkldnn::memory({ src_iter_md_n1, cpu_engine }, src_iter_n1); - op.concat_iter_memory.push_back(src_iter_memory_n1); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_memory_n1); workptr = src_iter_n1 + nstates * N * H; } mkldnn::memory::dims src_iter_tz_n = {L - 1, 1, nstates, N, H}; // ldsnc @@ -524,13 +528,13 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* src_iter_n = workptr; // (L - 1) * nstates * N * H auto src_iter_memory_n = mkldnn::memory({ src_iter_md_n, cpu_engine }, src_iter_n); - op.concat_iter_memory.push_back(src_iter_memory_n); - op.hcx_memory.push_back(src_iter_memory_n); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_memory_n); + op.mkldnn_mems.hcx_memory.push_back(src_iter_memory_n); DType* dst_layer_n = src_iter_n + (L - 1) * nstates * N * H; // T * N * D * H auto dst_layer_memory_n = mkldnn::memory({ dst_layer_md, cpu_engine }, dst_layer_n); - op.y_memory.push_back(dst_layer_memory_n); + op.mkldnn_mems.y_memory.push_back(dst_layer_memory_n); mkldnn::memory::dims dst_iter_tz_n = {L - 1, 1, nstates, N, H}; // ldsnc auto dst_iter_md_n = mkldnn::memory::desc( @@ -538,13 +542,14 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* dst_iter_n = dst_layer_n + T * N * D * H; // (L - 1) * nstates * N * H auto dst_iter_memory_n = mkldnn::memory({ dst_iter_md_n, cpu_engine }, dst_iter_n); - op.hcy_memory.push_back(dst_iter_memory_n); + op.mkldnn_mems.hcy_memory.push_back(dst_iter_memory_n); + workptr = dst_iter_n + (L - 1) * 
nstates * N * H; } if (L > 1 && D == 2) { mkldnn::memory::dims weights_layer_tz = {1, D, H * D, ngates, H}; // ldigo mkldnn::memory::dims weights_iter_tz = {1, D, H, ngates, H}; // ldigo - mkldnn::memory::dims bias_tz = {1, D, ngates, H}; + mkldnn::memory::dims bias_tz = {1, D, nbias, H}; auto user_weight_layer_md = mkldnn::memory::desc( { weights_layer_tz }, mkldnn_dtype, mkldnn::memory::format::ldigo); auto user_weight_iter_md = mkldnn::memory::desc( @@ -555,7 +560,7 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, auto user_src_layer_md = mkldnn::memory::desc( { src_layer_tz }, mkldnn_dtype, mkldnn::memory::format::tnc); auto user_src_layer_memory = mkldnn::memory({ user_src_layer_md, cpu_engine }); - op.x_memory.push_back(user_src_layer_memory); + op.mkldnn_mems.x_memory.push_back(user_src_layer_memory); auto wx_md_n = mkldnn::memory::desc( { weights_layer_tz }, mkldnn_dtype, mkldnn::memory::format::ldgoi); @@ -566,19 +571,20 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* weight_layer_n = workptr; // D * (H * D) * ngates * H auto user_weight_layer_memory_n = mkldnn::memory({ user_weight_layer_md, cpu_engine }, weight_layer_n); - op.wx_memory.push_back(user_weight_layer_memory_n); + op.mkldnn_mems.wx_memory.push_back(user_weight_layer_memory_n); DType* weight_iter_n = weight_layer_n + D * (H * D) * ngates * H; // D * H * ngates * H auto user_weight_iter_memory_n = mkldnn::memory({ user_weight_iter_md, cpu_engine }, weight_iter_n); - op.wh_memory.push_back(user_weight_iter_memory_n); + op.mkldnn_mems.wh_memory.push_back(user_weight_iter_memory_n); - DType* bias_n = weight_iter_n + D * H * ngates * H; // D * ngates * H + DType* bias_n = weight_iter_n + D * H * ngates * H; // Generally, D * ngates * H + // LBR-Gru, D * (ngates + 1) * H auto user_bias_memory_n = mkldnn::memory({ user_bias_md, cpu_engine }, bias_n); - op.bias_memory.push_back(user_bias_memory_n); - workptr = bias_n + D * ngates * H; + op.mkldnn_mems.bias_memory.push_back(user_bias_memory_n); + workptr = bias_n + D * nbias * H; } DType* wx_n = workptr; // D * ngates * (D * H) * H @@ -587,8 +593,8 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, mkldnn::memory({ wx_md_n, cpu_engine }, wx_n); auto wh_memory_n = mkldnn::memory({ wh_md_n, cpu_engine }, wh_n); - op.concat_weight_memory.push_back(wx_memory_n); - op.concat_weight_memory.push_back(wh_memory_n); + op.mkldnn_mems.concat_weight_memory.push_back(wx_memory_n); + op.mkldnn_mems.concat_weight_memory.push_back(wh_memory_n); mkldnn::memory::dims src_iter_undi_tz = {1, 1, nstates, N, H}; // ldsnc auto src_iter_undi_md = mkldnn::memory::desc( @@ -596,12 +602,12 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* src_iter_undi = wh_n + D * ngates * H * H; // nstates * N * H auto src_iter_undi_memory = mkldnn::memory({ src_iter_undi_md, cpu_engine }, src_iter_undi); - op.concat_iter_memory.push_back(src_iter_undi_memory_0); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_undi_memory_0); DType* src_iter_undi2 = src_iter_undi + nstates * N * H; // nstates * N * H auto src_iter_undi2_memory = mkldnn::memory({ src_iter_undi_md, cpu_engine }, src_iter_undi2); - op.concat_iter_memory.push_back(src_iter_undi2_memory); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_undi2_memory); mkldnn::memory::dims src_iter_tz = {1, D, nstates, N, H}; // ldsnc auto src_iter_md = mkldnn::memory::desc( @@ -609,13 +615,13 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* src_iter = 
src_iter_undi2 + nstates * N * H; // D * nstates * N * H auto src_iter_memory = mkldnn::memory({ src_iter_md, cpu_engine }, src_iter); - op.concat_iter_memory.push_back(src_iter_memory); - op.hcx_memory.push_back(src_iter_memory); + op.mkldnn_mems.concat_iter_memory.push_back(src_iter_memory); + op.mkldnn_mems.hcx_memory.push_back(src_iter_memory); DType* dst_layer_n = src_iter + D * nstates * N * H; // T * N * D * H auto dst_layer_memory_n = mkldnn::memory({ dst_layer_md, cpu_engine }, dst_layer_n); - op.y_memory.push_back(dst_layer_memory_n); + op.mkldnn_mems.y_memory.push_back(dst_layer_memory_n); mkldnn::memory::dims dst_iter_tz_n = {1, D, nstates, N, H}; // ldsnc auto dst_iter_md_n = mkldnn::memory::desc( @@ -623,7 +629,8 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, DType* dst_iter_n = dst_layer_n + T * N * D * H; // D * nstates * N * H auto dst_iter_memory_n = mkldnn::memory({ dst_iter_md_n, cpu_engine }, dst_iter_n); - op.hcy_memory.push_back(dst_iter_memory_n); + op.mkldnn_mems.hcy_memory.push_back(dst_iter_memory_n); + workptr = dst_iter_n + D * nstates * N * H; } } } From 1e1f799c322a7886f826a2a765f770c9ee443afd Mon Sep 17 00:00:00 2001 From: zixuanweeei Date: Thu, 25 Jul 2019 18:56:57 +0800 Subject: [PATCH 02/24] Readable params and UT supplement --- src/operator/nn/mkldnn/mkldnn_rnn_impl.h | 86 +++----- src/operator/rnn.cc | 36 ++-- tests/python/unittest/test_operator.py | 255 +++++++++++++---------- 3 files changed, 191 insertions(+), 186 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h index 2db46ea84fc7..ecb4b536516a 100644 --- a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h +++ b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h @@ -127,43 +127,35 @@ static void ConcatData(mkldnn::memory::format src_format, * For bidirectional, it will fused as data + back_data (weight, bias, * iter etc) * - * @param L Number of Layers - * @param D Direction of the RNN implement. It should be 1 or 2. - * @param T The maximum sequence length. - * @param N Batch size. - * @param I Input channel. Also the dimension of the input feature. - * @param H Hidden state size. + * @param num_layer Number of Layers + * @param direction Direction of the RNN implement. It should be 1 or 2. + * @param seq_len The maximum sequence length. + * @param batch_size Batch size. + * @param input_size Input channel. Also the dimension of the input feature. + * @param hidden_size Hidden state size. * @return The required cache size. 
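+ *
+ * Illustrative example (parameter values picked arbitrarily, not taken from
+ * the patch): a 1-layer unidirectional LSTM (n_gates = 4, n_states = 2,
+ * n_bias = 4) with seq_len = 1, batch_size = 1, input_size = 2 and
+ * hidden_size = 2 gives, per the formula below,
+ * (1*(2+2)*2)*4*2 + (1*2)*1*4 + (1*1*2)*1*2*2 + (1*1*2)*1*2 + (1*1*1*2)*2
+ * = 64 + 8 + 8 + 4 + 4 = 88 elements.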
*/ -static size_t GetMKLDNNRNNCacheMemorySize(int L, - int D, - int T, - int N, - int I, - int H, +static size_t GetMKLDNNRNNCacheMemorySize(int num_layer, + int direction, + int seq_len, + int batch_size, + int input_size, + int hidden_size, int mode) { - size_t size = 0; - switch (mode) { - case rnn_enum::kLstm: - size = 2 * (D * (I + H) * 4 * H + (L - 1) * D * (D * H + H) * 4 * H + - L * D * 2 * N * H) + T * N * D * H + L * 2 * D * 4 * H + (L + 2) * D * 2 * N * H + - 6 * D * (I + H + 2) * 4 * H + T * N * I * 2; - break; - case rnn_enum::kGru: - size = 2 * (D * (I + H) * 3 * H + (L - 1) * D * (D * H + H) * 3 * H + - L * D * 2 * N * H) + T * N * D * H + L * 2 * D * 4 * H + (L + 2) * D * 2 * N * H + - 6 * D * (I + H + 2) * 3 * H + T * N * I * 2; - break; - case rnn_enum::kRnnRelu: - case rnn_enum::kRnnTanh: - size = 2 * (D * (I + H) * 1 * H + (L - 1) * D * (D * H + H) * 1 * H + - L * D * 2 * N * H) + T * N * D * H + L * 2 * D * 1 * H + (L + 2) * D * 2 * N * H + - 6 * D * (I + H + 2) * 1 * H + T * N * I * 2; - break; - default: - LOG(FATAL) << "unknown RNN mode " << mode; - break; - } + int n_gates = 0, n_states = 0; + GetMKLDNNRNNAlgo(mode, &n_gates, &n_states); + int n_bias = mode == rnn_enum::kGru ? n_gates + 1 : n_gates; + // sizes of single gates from a single cell + const size_t weights_size_0 = direction * (input_size + hidden_size) * hidden_size; + const size_t weights_size_n = direction * (direction * hidden_size + hidden_size) * hidden_size; + const size_t bias_size = direction * hidden_size; + const size_t src_iter_size = direction * batch_size * hidden_size; + const size_t dst_iter_size = direction * batch_size * hidden_size; + const size_t dst_layer_size = seq_len * batch_size * direction * hidden_size; + + size_t size = (weights_size_0 + weights_size_n * (num_layer - 1)) * n_gates * 2 + + bias_size * num_layer * n_bias + src_iter_size * num_layer * n_states * 2 + + dst_iter_size * num_layer * n_states + dst_layer_size * 2; return size; } @@ -221,7 +213,6 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, std::vector *rnn_forward_prim, int layer_index, bool *has_cache, - int lvalue, int dtype, bool is_train, int mode) { @@ -277,15 +268,12 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, ConcatData(mkldnn::memory::format::ldgoi, mkldnn::memory::format::ldgoi, {weights_iter_r_tz, weights_iter_r_tz}, weights_iter_tz, mkldnn_dtype, 1, srcs_data1, src_wh, &(mkldnn_mems->weight_iter_mems)); - int tmpvalue = 0; - if (lvalue > 0) { - tmpvalue = lvalue + 1; - } - MKLDNNStream::Get()->RegisterPrim(reorder(src_wx, mkldnn_mems->wx_memory[tmpvalue])); - MKLDNNStream::Get()->RegisterPrim(reorder(src_wh, mkldnn_mems->wh_memory[tmpvalue])); + + MKLDNNStream::Get()->RegisterPrim(reorder(src_wx, mkldnn_mems->wx_memory[layer_index])); + MKLDNNStream::Get()->RegisterPrim(reorder(src_wh, mkldnn_mems->wh_memory[layer_index])); DType* user_bias = reinterpret_cast - (mkldnn_mems->bias_memory[tmpvalue].get_data_handle()); + (mkldnn_mems->bias_memory[layer_index].get_data_handle()); if (mode == rnn_enum::kGru) { // While mxnet gru gate order is reset, update and new gates, // mkldnn gru gate order is update, reset and new gates. 
So @@ -313,14 +301,6 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, } } } - if (lvalue > 0) { - mkldnn_mems->wx_memory[layer_index].set_data_handle( - mkldnn_mems->wx_memory[lvalue + 1].get_data_handle()); - mkldnn_mems->wh_memory[layer_index].set_data_handle( - mkldnn_mems->wh_memory[lvalue + 1].get_data_handle()); - mkldnn_mems->bias_memory[layer_index].set_data_handle( - mkldnn_mems->bias_memory[lvalue + 1].get_data_handle()); - } auto src_layer_md = mkldnn::memory::desc( { src_layer_tz }, mkldnn_dtype, mkldnn::memory::format::tnc); @@ -577,7 +557,7 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, user_bias_f[g + l * single_b_size] = b_ptr[g + H + l * mx_single_b_sz * 2] + b_ptr[g + H + l * mx_single_b_sz * 2 + mx_single_b_sz]; - } + } #pragma omp parallel for num_threads(omp_threads) for (int g = 2 * H; g < 3 * H; g++) { user_bias_f[g + l * single_b_size] = b_ptr[g + l * mx_single_b_sz * 2]; @@ -692,7 +672,7 @@ static void MKLDNNRNNForward(bool state_outputs, MKLDNNRNNForwardSingleLayerBi(state_outputs, T, N, I, H, x_ptr, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, mkldnn_mems, rnn_forward_prim, - 0, has_cache, 0, dtype, is_train, mode); + 0, has_cache, dtype, is_train, mode); } else { MKLDNNRNNForwardUnidi(state_outputs, 1, T, N, I, H, x_ptr, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, @@ -721,7 +701,7 @@ static void MKLDNNRNNForward(bool state_outputs, MKLDNNRNNForwardSingleLayerBi(state_outputs, T, N, D * H, H, tmpNull, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, mkldnn_mems, rnn_forward_prim, - 1, has_cache, l + 1, dtype, is_train, mode); + 1, has_cache, dtype, is_train, mode); mkldnn_mems->user_src_layer_memory_l = mkldnn_mems->y_memory[1]; w_ptr += w_size; b_ptr += b_size; diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 7edcbe5c61a9..c6d6890fd3d3 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -567,25 +567,23 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, auto wh_md_n = mkldnn::memory::desc( { weights_iter_tz }, mkldnn_dtype, mkldnn::memory::format::ldgoi); - for (int l = 0; l < L; l++) { - DType* weight_layer_n = workptr; // D * (H * D) * ngates * H - auto user_weight_layer_memory_n - = mkldnn::memory({ user_weight_layer_md, cpu_engine }, weight_layer_n); - op.mkldnn_mems.wx_memory.push_back(user_weight_layer_memory_n); - - DType* weight_iter_n = weight_layer_n + - D * (H * D) * ngates * H; // D * H * ngates * H - auto user_weight_iter_memory_n - = mkldnn::memory({ user_weight_iter_md, cpu_engine }, weight_iter_n); - op.mkldnn_mems.wh_memory.push_back(user_weight_iter_memory_n); - - DType* bias_n = weight_iter_n + D * H * ngates * H; // Generally, D * ngates * H - // LBR-Gru, D * (ngates + 1) * H - auto user_bias_memory_n = - mkldnn::memory({ user_bias_md, cpu_engine }, bias_n); - op.mkldnn_mems.bias_memory.push_back(user_bias_memory_n); - workptr = bias_n + D * nbias * H; - } + DType* weight_layer_n = workptr; // D * (H * D) * ngates * H + auto user_weight_layer_memory_n + = mkldnn::memory({ user_weight_layer_md, cpu_engine }, weight_layer_n); + op.mkldnn_mems.wx_memory.push_back(user_weight_layer_memory_n); + + DType* weight_iter_n = weight_layer_n + + D * (H * D) * ngates * H; // D * H * ngates * H + auto user_weight_iter_memory_n + = mkldnn::memory({ user_weight_iter_md, cpu_engine }, weight_iter_n); + op.mkldnn_mems.wh_memory.push_back(user_weight_iter_memory_n); + + DType* bias_n = weight_iter_n + D * H * ngates * H; // Generally, D * ngates * H + // LBR-Gru, D * (ngates + 1) 
* H + auto user_bias_memory_n = + mkldnn::memory({ user_bias_md, cpu_engine }, bias_n); + op.mkldnn_mems.bias_memory.push_back(user_bias_memory_n); + workptr = bias_n + D * nbias * H; DType* wx_n = workptr; // D * ngates * (D * H) * H DType* wh_n = wx_n + D * ngates * (D * H) * H; // D * ngates * H * H diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index d195ea9ef2f3..d875b224cd75 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -79,148 +79,175 @@ def check_rnn_consistency(cell1, cell2, T, N, I, H, grad_req, rtol=1e-2, atol=1e @with_seed() @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_lstm_sym(): - T, N, I, H = 5, 32, 800, 800 - fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='lstm', get_next_state=True, prefix='') - stack = mx.rnn.SequentialRNNCell() - stack.add(mx.rnn.LSTMCell(H, prefix='l0_')) - stack.add(mx.rnn.LSTMCell(H, prefix='l1_')) - stack.add(mx.rnn.LSTMCell(H, prefix='l2_')) - - check_rnn_consistency(fused, stack, T, N, I, H, 'write') - check_rnn_consistency(fused, stack, T, N, I, H, 'add') - check_rnn_consistency(fused, stack, T, N, I, H, 'null') + Ts = [1, 5] + Ns = [1, 32] + Is = [32, 128, 512] + Hs = [32, 128, 512] + for T, N, I, H in itertools.product(Ts, Ns, Is, Hs): + fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='lstm', get_next_state=True, prefix='') + stack = mx.rnn.SequentialRNNCell() + stack.add(mx.rnn.LSTMCell(H, prefix='l0_')) + stack.add(mx.rnn.LSTMCell(H, prefix='l1_')) + stack.add(mx.rnn.LSTMCell(H, prefix='l2_')) + + check_rnn_consistency(fused, stack, T, N, I, H, 'write') + check_rnn_consistency(fused, stack, T, N, I, H, 'add') + check_rnn_consistency(fused, stack, T, N, I, H, 'null') @with_seed() @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_lstm_bidirectional(): - T, N, I, H = 5, 20, 800, 800 - fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='lstm', - bidirectional=True, get_next_state=True, prefix='') - - stack = mx.rnn.SequentialRNNCell() - stack.add(mx.rnn.BidirectionalCell( - mx.rnn.LSTMCell(H, prefix='l0_'), - mx.rnn.LSTMCell(H, prefix='r0_'), - output_prefix='bi_lstm_0_')) - stack.add(mx.rnn.BidirectionalCell( - mx.rnn.LSTMCell(H, prefix='l1_'), - mx.rnn.LSTMCell(H, prefix='r1_'), - output_prefix='bi_lstm_1_')) - - check_rnn_consistency(fused, stack, T, N, I, H, 'write') - check_rnn_consistency(fused, stack, T, N, I, H, 'add') - check_rnn_consistency(fused, stack, T, N, I, H, 'null') + Ts = [1, 5] + Ns = [1, 32] + Is = [32, 128, 512] + Hs = [32, 128, 512] + for T, N, I, H in itertools.product(Ts, Ns, Is, Hs): + fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='lstm', + bidirectional=True, get_next_state=True, prefix='') + + stack = mx.rnn.SequentialRNNCell() + stack.add(mx.rnn.BidirectionalCell( + mx.rnn.LSTMCell(H, prefix='l0_'), + mx.rnn.LSTMCell(H, prefix='r0_'), + output_prefix='bi_lstm_0_')) + stack.add(mx.rnn.BidirectionalCell( + mx.rnn.LSTMCell(H, prefix='l1_'), + mx.rnn.LSTMCell(H, prefix='r1_'), + output_prefix='bi_lstm_1_')) + + check_rnn_consistency(fused, stack, T, N, I, H, 'write') + check_rnn_consistency(fused, stack, T, N, I, H, 'add') + check_rnn_consistency(fused, stack, T, N, I, H, 'null') @with_seed() @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_gru_sym(): - T, N, I, H = 5, 32, 800, 800 - fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='gru', get_next_state=True, prefix='') - stack = mx.rnn.SequentialRNNCell() - stack.add(mx.rnn.GRUCell(H, prefix='l0_')) - 
stack.add(mx.rnn.GRUCell(H, prefix='l1_')) - stack.add(mx.rnn.GRUCell(H, prefix='l2_')) - - check_rnn_consistency(fused, stack, T, N, I, H, 'write') - check_rnn_consistency(fused, stack, T, N, I, H, 'add') - check_rnn_consistency(fused, stack, T, N, I, H, 'null') + Ts = [1, 5] + Ns = [1, 32] + Is = [32, 128, 512] + Hs = [32, 128, 512] + for T, N, I, H in itertools.product(Ts, Ns, Is, Hs): + fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='gru', get_next_state=True, prefix='') + stack = mx.rnn.SequentialRNNCell() + stack.add(mx.rnn.GRUCell(H, prefix='l0_')) + stack.add(mx.rnn.GRUCell(H, prefix='l1_')) + stack.add(mx.rnn.GRUCell(H, prefix='l2_')) + + check_rnn_consistency(fused, stack, T, N, I, H, 'write') + check_rnn_consistency(fused, stack, T, N, I, H, 'add') + check_rnn_consistency(fused, stack, T, N, I, H, 'null') @with_seed() @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_gru_bidirectional(): - T, N, I, H = 5, 20, 800, 800 - - fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='gru', - bidirectional=True, get_next_state=True, prefix='') - - stack = mx.rnn.SequentialRNNCell() - stack.add(mx.rnn.BidirectionalCell( - mx.rnn.GRUCell(H, prefix='l0_'), - mx.rnn.GRUCell(H, prefix='r0_'), - output_prefix='bi_gru_0_')) - - stack.add(mx.rnn.BidirectionalCell( - mx.rnn.GRUCell(H, prefix='l1_'), - mx.rnn.GRUCell(H, prefix='r1_'), - output_prefix='bi_gru_1_')) - - check_rnn_consistency(fused, stack, T, N, I, H, 'write') - check_rnn_consistency(fused, stack, T, N, I, H, 'add') - check_rnn_consistency(fused, stack, T, N, I, H, 'null') + Ts = [1, 5] + Ns = [1, 32] + Is = [32, 128, 512] + Hs = [32, 128, 512] + for T, N, I, H in itertools.product(Ts, Ns, Is, Hs): + fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='gru', + bidirectional=True, get_next_state=True, prefix='') + + stack = mx.rnn.SequentialRNNCell() + stack.add(mx.rnn.BidirectionalCell( + mx.rnn.GRUCell(H, prefix='l0_'), + mx.rnn.GRUCell(H, prefix='r0_'), + output_prefix='bi_gru_0_')) + + stack.add(mx.rnn.BidirectionalCell( + mx.rnn.GRUCell(H, prefix='l1_'), + mx.rnn.GRUCell(H, prefix='r1_'), + output_prefix='bi_gru_1_')) + + check_rnn_consistency(fused, stack, T, N, I, H, 'write') + check_rnn_consistency(fused, stack, T, N, I, H, 'add') + check_rnn_consistency(fused, stack, T, N, I, H, 'null') @with_seed() @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_rnntanh_sym(): - T, N, I, H = 5, 32, 800, 800 - - fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='rnn_tanh', get_next_state=True, prefix='') - stack = mx.rnn.SequentialRNNCell() - stack.add(mx.rnn.RNNCell(H, activation='tanh', prefix='l0_')) - stack.add(mx.rnn.RNNCell(H, activation='tanh', prefix='l1_')) - stack.add(mx.rnn.RNNCell(H, activation='tanh', prefix='l2_')) - - check_rnn_consistency(fused, stack, T, N, I, H, 'write') - check_rnn_consistency(fused, stack, T, N, I, H, 'add') - check_rnn_consistency(fused, stack, T, N, I, H, 'null') + Ts = [1, 5] + Ns = [1, 32] + Is = [32, 128, 512] + Hs = [32, 128, 512] + for T, N, I, H in itertools.product(Ts, Ns, Is, Hs): + fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='rnn_tanh', get_next_state=True, prefix='') + stack = mx.rnn.SequentialRNNCell() + stack.add(mx.rnn.RNNCell(H, activation='tanh', prefix='l0_')) + stack.add(mx.rnn.RNNCell(H, activation='tanh', prefix='l1_')) + stack.add(mx.rnn.RNNCell(H, activation='tanh', prefix='l2_')) + + check_rnn_consistency(fused, stack, T, N, I, H, 'write') + check_rnn_consistency(fused, stack, T, N, I, H, 'add') + check_rnn_consistency(fused, stack, T, N, I, H, 
'null') @with_seed() @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_rnntanh_bidirectional(): - T, N, I, H = 5, 20, 800, 800 - - fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='rnn_tanh', - bidirectional=True, get_next_state=True, prefix='') - - stack = mx.rnn.SequentialRNNCell() - stack.add(mx.rnn.BidirectionalCell( - mx.rnn.RNNCell(H, activation='tanh', prefix='l0_'), - mx.rnn.RNNCell(H, activation='tanh', prefix='r0_'), - output_prefix='bi_rnntanh_0_')) - stack.add(mx.rnn.BidirectionalCell( - mx.rnn.RNNCell(H, activation='tanh', prefix='l1_'), - mx.rnn.RNNCell(H, activation='tanh', prefix='r1_'), - output_prefix='bi_rnntanh_1_')) - - check_rnn_consistency(fused, stack, T, N, I, H, 'write') - check_rnn_consistency(fused, stack, T, N, I, H, 'add') - check_rnn_consistency(fused, stack, T, N, I, H, 'null') + Ts = [1, 5] + Ns = [1, 32] + Is = [32, 128, 512] + Hs = [32, 128, 512] + for T, N, I, H in itertools.product(Ts, Ns, Is, Hs): + fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='rnn_tanh', + bidirectional=True, get_next_state=True, prefix='') + + stack = mx.rnn.SequentialRNNCell() + stack.add(mx.rnn.BidirectionalCell( + mx.rnn.RNNCell(H, activation='tanh', prefix='l0_'), + mx.rnn.RNNCell(H, activation='tanh', prefix='r0_'), + output_prefix='bi_rnntanh_0_')) + stack.add(mx.rnn.BidirectionalCell( + mx.rnn.RNNCell(H, activation='tanh', prefix='l1_'), + mx.rnn.RNNCell(H, activation='tanh', prefix='r1_'), + output_prefix='bi_rnntanh_1_')) + + check_rnn_consistency(fused, stack, T, N, I, H, 'write') + check_rnn_consistency(fused, stack, T, N, I, H, 'add') + check_rnn_consistency(fused, stack, T, N, I, H, 'null') @with_seed() @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_rnnrelu_sym(): - T, N, I, H = 5, 32, 200, 200 - - fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='rnn_relu', get_next_state=True, prefix='') - stack = mx.rnn.SequentialRNNCell() - stack.add(mx.rnn.RNNCell(H, activation='relu', prefix='l0_')) - stack.add(mx.rnn.RNNCell(H, activation='relu', prefix='l1_')) - stack.add(mx.rnn.RNNCell(H, activation='relu', prefix='l2_')) - - check_rnn_consistency(fused, stack, T, N, I, H, 'write') - check_rnn_consistency(fused, stack, T, N, I, H, 'add') - check_rnn_consistency(fused, stack, T, N, I, H, 'null') + Ts = [1, 5] + Ns = [1, 32] + Is = [32, 128, 512] + Hs = [32, 128, 512] + for T, N, I, H in itertools.product(Ts, Ns, Is, Hs): + fused = mx.rnn.FusedRNNCell(H, num_layers=3, mode='rnn_relu', get_next_state=True, prefix='') + stack = mx.rnn.SequentialRNNCell() + stack.add(mx.rnn.RNNCell(H, activation='relu', prefix='l0_')) + stack.add(mx.rnn.RNNCell(H, activation='relu', prefix='l1_')) + stack.add(mx.rnn.RNNCell(H, activation='relu', prefix='l2_')) + + check_rnn_consistency(fused, stack, T, N, I, H, 'write') + check_rnn_consistency(fused, stack, T, N, I, H, 'add') + check_rnn_consistency(fused, stack, T, N, I, H, 'null') @with_seed() @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_rnnrelu_bidirectional(): - T, N, I, H = 5, 20, 200, 200 - - fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='rnn_relu', - bidirectional=True, get_next_state=True, prefix='') - - stack = mx.rnn.SequentialRNNCell() - stack.add(mx.rnn.BidirectionalCell( - mx.rnn.RNNCell(H, activation='relu', prefix='l0_'), - mx.rnn.RNNCell(H, activation='relu', prefix='r0_'), - output_prefix='bi_rnnrelu_0_')) - stack.add(mx.rnn.BidirectionalCell( - mx.rnn.RNNCell(H, activation='relu', prefix='l1_'), - mx.rnn.RNNCell(H, activation='relu', prefix='r1_'), - 
output_prefix='bi_rnnrelu_1_')) - - check_rnn_consistency(fused, stack, T, N, I, H, 'write', rtol=1e-2, atol=1e-2) - check_rnn_consistency(fused, stack, T, N, I, H, 'add', rtol=1e-2, atol=1e-2) - check_rnn_consistency(fused, stack, T, N, I, H, 'null', rtol=1e-2, atol=1e-2) + Ts = [1, 5] + Ns = [1, 32] + Is = [32, 128, 512] + Hs = [32, 128, 512] + for T, N, I, H in itertools.product(Ts, Ns, Is, Hs): + fused = mx.rnn.FusedRNNCell(H, num_layers=2, mode='rnn_relu', + bidirectional=True, get_next_state=True, prefix='') + + stack = mx.rnn.SequentialRNNCell() + stack.add(mx.rnn.BidirectionalCell( + mx.rnn.RNNCell(H, activation='relu', prefix='l0_'), + mx.rnn.RNNCell(H, activation='relu', prefix='r0_'), + output_prefix='bi_rnnrelu_0_')) + stack.add(mx.rnn.BidirectionalCell( + mx.rnn.RNNCell(H, activation='relu', prefix='l1_'), + mx.rnn.RNNCell(H, activation='relu', prefix='r1_'), + output_prefix='bi_rnnrelu_1_')) + + check_rnn_consistency(fused, stack, T, N, I, H, 'write', rtol=1e-2, atol=1e-2) + check_rnn_consistency(fused, stack, T, N, I, H, 'add', rtol=1e-2, atol=1e-2) + check_rnn_consistency(fused, stack, T, N, I, H, 'null', rtol=1e-2, atol=1e-2) @with_seed() def test_lstm_dropout(): From 49ebe014037b87b335706c91b13b4948a4172d7a Mon Sep 17 00:00:00 2001 From: zixuanweeei Date: Thu, 25 Jul 2019 19:18:51 +0800 Subject: [PATCH 03/24] Fix lint errors --- src/operator/nn/mkldnn/mkldnn_rnn_impl.h | 12 ++++++++---- src/operator/rnn.cc | 5 +++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h index ecb4b536516a..5422636f82f6 100644 --- a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h +++ b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h @@ -374,7 +374,8 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, MKLDNNStream::Get()->Submit(); if (state_outputs) { - DType* dst_hcy = reinterpret_cast(mkldnn_mems->hcy_memory[layer_index].get_data_handle()); + DType* dst_hcy = reinterpret_cast( + mkldnn_mems->hcy_memory[layer_index].get_data_handle()); if (mode == rnn_enum::kLstm) { offset1 = nstates * single_cell_size; offset2 = (nstates + 1) * single_cell_size; @@ -542,7 +543,8 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, MKLDNNStream::Get()->RegisterPrim(reorder(src_wx_f, mkldnn_mems->wx_memory[layer_index])); MKLDNNStream::Get()->RegisterPrim(reorder(src_wh_f, mkldnn_mems->wh_memory[layer_index])); - DType* user_bias_f = reinterpret_cast(mkldnn_mems->bias_memory[layer_index].get_data_handle()); + DType* user_bias_f = reinterpret_cast( + mkldnn_mems->bias_memory[layer_index].get_data_handle()); if (mode == rnn_enum::kGru) { const int mx_single_b_sz = ngates * H; for (int l = 0; l < L; l++) { @@ -569,7 +571,8 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, #pragma omp parallel for num_threads(omp_threads) for (int j = 0; j < L * single_b_size; j++) { int k = j / single_b_size; - user_bias_f[j] = b_ptr[j + k * single_b_size] + b_ptr[j + k * single_b_size + single_b_size]; + user_bias_f[j] = b_ptr[j + k * single_b_size] + + b_ptr[j + k * single_b_size + single_b_size]; } } } @@ -604,7 +607,8 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, MKLDNNStream::Get()->Submit(); if (state_outputs) { - DType* dst_hcy = reinterpret_cast(mkldnn_mems->hcy_memory[layer_index].get_data_handle()); + DType* dst_hcy = reinterpret_cast( + mkldnn_mems->hcy_memory[layer_index].get_data_handle()); if (mode == rnn_enum::kLstm) { for (int l = 0; l < L; l++) { offset1 = l * single_cell_size; diff --git 
a/src/operator/rnn.cc b/src/operator/rnn.cc index c6d6890fd3d3..cc70beeb8e79 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -491,8 +491,9 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, = mkldnn::memory({ user_weight_iter_md, cpu_engine }, weight_iter_n); op.mkldnn_mems.wh_memory.push_back(user_weight_iter_memory_n); - DType* bias_n = weight_iter_n + (L - 1) * H * ngates * H; // Generally, (L - 1) * ngates * H - // LBR-Gru, (L -1) * (ngates + 1) * H + DType* bias_n = weight_iter_n + (L - 1) * H * ngates * H; // Generally, (L - 1) * + // ngates * H. LBR-Gru, + // (L -1) * (ngates + 1) * H auto user_bias_memory_n = mkldnn::memory({ user_bias_md, cpu_engine }, bias_n); op.mkldnn_mems.bias_memory.push_back(user_bias_memory_n); From 71a822a14a8fb6059677952d875007aa35b485f6 Mon Sep 17 00:00:00 2001 From: zixuanweeei Date: Mon, 29 Jul 2019 15:11:21 +0800 Subject: [PATCH 04/24] Retrigger CI From 2facb29459191c35d5231e432bf9a91cc4122bb7 Mon Sep 17 00:00:00 2001 From: zixuanweeei Date: Wed, 31 Jul 2019 09:26:30 +0800 Subject: [PATCH 05/24] Enable re-initialization with training path --- src/operator/nn/mkldnn/mkldnn_rnn_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h index 5422636f82f6..89f871719031 100644 --- a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h +++ b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h @@ -508,7 +508,7 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, std::vector src_l_dim_h; bool has_adjusted = false; - if (!initialized) { + if (!initialized || is_train) { if (L == 1) { DType* wx = w_ptr; DType* wh = wx + I * H * ngates; From a6ee56c1736122b2cf4beba50be3a6c704e6da30 Mon Sep 17 00:00:00 2001 From: zixuanweeei Date: Wed, 31 Jul 2019 18:43:55 +0800 Subject: [PATCH 06/24] Trigger CI From 302f8dc350030ec72503109fa45d60889ca0b8ec Mon Sep 17 00:00:00 2001 From: zixuanweeei Date: Fri, 2 Aug 2019 16:35:31 +0800 Subject: [PATCH 07/24] Type refine and meaningful params --- src/operator/nn/mkldnn/mkldnn_rnn_impl.h | 370 +++++++++++------------ 1 file changed, 178 insertions(+), 192 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h index 89f871719031..d71cbdfeacdf 100644 --- a/src/operator/nn/mkldnn/mkldnn_rnn_impl.h +++ b/src/operator/nn/mkldnn/mkldnn_rnn_impl.h @@ -92,13 +92,13 @@ static void ConcatData(mkldnn::memory::format src_format, std::vector srcs_cds, mkldnn::memory::dims dst_cds, mkldnn::memory::data_type mkldnn_dtype, - int concat_dimension, + const int concat_dimension, const std::vector &srcs_data, const mkldnn::memory &dst, std::vector *tmp_src_mems) { auto cpu_engine = CpuEngine::Get()->get_engine(); std::vector srcs_pd; - bool initialized = tmp_src_mems->size() > 0; + const bool initialized = tmp_src_mems->size() > 0; for (size_t i = 0; i < srcs_cds.size(); i++) { auto desc = mkldnn::memory::desc(srcs_cds[i], mkldnn_dtype, src_format); auto mpd = mkldnn::memory::primitive_desc(desc, cpu_engine); @@ -119,13 +119,13 @@ static void ConcatData(mkldnn::memory::format src_format, /** * Size of cached memory * - * Cache memory of wx, wh from the first layer and next L - 1 layers + * Cache memory of wx, wh from the first layer and next num_layer - 1 layers * seperately, as well as the layer and iter memory for src and dst. * Output states memory hx, hc and bias memory are also cached. It * will prepare memory on before and after reorder and concat. 
For - * unidirectional, it will fused as dim like 1 + (L - 1) when I != H. - * For bidirectional, it will fused as data + back_data (weight, bias, - * iter etc) + * unidirectional, it will fused as dim like 1 + (num_layer - 1) when + * input_size != hidden_size. For bidirectional, it will fused as data + + * back_data (weight, bias, iter etc) * * @param num_layer Number of Layers * @param direction Direction of the RNN implement. It should be 1 or 2. @@ -135,16 +135,16 @@ static void ConcatData(mkldnn::memory::format src_format, * @param hidden_size Hidden state size. * @return The required cache size. */ -static size_t GetMKLDNNRNNCacheMemorySize(int num_layer, - int direction, - int seq_len, - int batch_size, - int input_size, - int hidden_size, - int mode) { +static size_t GetMKLDNNRNNCacheMemorySize(const size_t num_layer, + const size_t direction, + const size_t seq_len, + const size_t batch_size, + const size_t input_size, + const size_t hidden_size, + const size_t mode) { int n_gates = 0, n_states = 0; GetMKLDNNRNNAlgo(mode, &n_gates, &n_states); - int n_bias = mode == rnn_enum::kGru ? n_gates + 1 : n_gates; + const size_t n_bias = mode == rnn_enum::kGru ? n_gates + 1 : n_gates; // sizes of single gates from a single cell const size_t weights_size_0 = direction * (input_size + hidden_size) * hidden_size; const size_t weights_size_n = direction * (direction * hidden_size + hidden_size) * hidden_size; @@ -161,46 +161,31 @@ static size_t GetMKLDNNRNNCacheMemorySize(int num_layer, template static void AdjustGruWeightGateOrder(DType* weight, - const int I, - const int H) { + const int input_size, + const int hidden_size) { // mxnet gru gate order is reset, update and new gates // mkldnn gru gate order is update, reset and new gates const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); DType* weight_reset = weight; - DType* weight_update = weight + I * H; + DType* weight_update = weight + input_size * hidden_size; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < I * H; i++) { + for (int i = 0; i < input_size * hidden_size; i++) { DType tmp = weight_update[i]; weight_update[i] = weight_reset[i]; weight_reset[i] = tmp; } } -template -static void AdjustGruBiasGateOrder(DType* bias, - const int H) { - // mxnet gru gate order is reset, update and new gates - // mkldnn gru gate order is update, reset and new gates - const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - DType* bias_reset = bias; - DType* bias_update = bias + H; - #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H; i++) { - DType tmp = bias_update[i]; - bias_update[i] = bias_reset[i]; - bias_reset[i] = tmp; - } -} // since there is different sematics of MKLDNN's Fused RNN and MXNet FusedRNN, // bidirectional will be fused layer by layer, -// unidirectional will be done by fused 1 + fused (L - 1) layers or fused L layers(when I = H) - +// unidirectional will be done by fused 1 + fused (num_layer - 1) layers or fused num_layer +// layers(when input_size = hidden_size) template static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, - const int T, - const int N, - const int I, - const int H, + const int seq_len, + const int batch_size, + const int input_size, + const int hidden_size, DType* x_ptr, DType* hx_ptr, DType* cx_ptr, @@ -220,42 +205,42 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, algorithm nalgorithm = GetMKLDNNRNNAlgo(mode, &ngates, &nstates); const int nbias = mode == rnn_enum::kGru ? 
ngates + 1 : ngates; mkldnn::memory::data_type mkldnn_dtype = get_mkldnn_type(dtype); - const int single_cell_size = N * H; - const int mx_single_b_sz = ngates * H; - DType* wx = w_ptr; // ngates * H, I - DType* wh = w_ptr + I * H * ngates; // ngates * H, H - DType* back_wx = w_ptr + ngates * H * (I + H); - DType* back_wh = back_wx + I * H * ngates; + const int single_cell_size = batch_size * hidden_size; + const int mx_single_b_sz = ngates * hidden_size; + DType* wx = w_ptr; // ngates * hidden_size, input_size + DType* wh = w_ptr + input_size * hidden_size * ngates; // ngates * hidden_size, hidden_size + DType* back_wx = w_ptr + ngates * hidden_size * (input_size + hidden_size); + DType* back_wh = back_wx + input_size * hidden_size * ngates; DType* bx = b_ptr; - DType* bh = b_ptr + H * ngates; + DType* bh = b_ptr + hidden_size * ngates; DType* back_bx = b_ptr + mx_single_b_sz * 2; - DType* back_bh = back_bx + H * ngates; + DType* back_bh = back_bx + hidden_size * ngates; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); auto cpu_engine = CpuEngine::Get()->get_engine(); auto null_memory_ = null_memory(cpu_engine); int offset1 = 0, offset2 = 0; bool initialized = *has_cache; - mkldnn::memory::dims src_layer_tz = {T, N, I}; - mkldnn::memory::dims dst_layer_tz = {T, N, 2 * H}; - mkldnn::memory::dims weights_layer_tz = {1, 2, I, ngates, H}; // ldigo - mkldnn::memory::dims weights_layer_r_tz = {1, 1, I, ngates, H}; // ldigo for reorder - mkldnn::memory::dims weights_iter_tz = {1, 2, H, ngates, H}; // ldigo - mkldnn::memory::dims weights_iter_r_tz = {1, 1, H, ngates, H}; // ldigo for reorder - mkldnn::memory::dims bias_tz = {1, 2, nbias, H}; // ldgo - mkldnn::memory::dims src_iter_tz = {1, 2, nstates, N, H}; // ldsnc - mkldnn::memory::dims dst_iter_tz = {1, 2, nstates, N, H}; // ldsnc + mkldnn::memory::dims src_layer_tz = {seq_len, batch_size, input_size}; + mkldnn::memory::dims dst_layer_tz = {seq_len, batch_size, 2 * hidden_size}; + mkldnn::memory::dims weights_layer_tz = {1, 2, input_size, ngates, hidden_size}; // ldigo + mkldnn::memory::dims weights_iter_tz = {1, 2, hidden_size, ngates, hidden_size}; // ldigo + mkldnn::memory::dims bias_tz = {1, 2, nbias, hidden_size}; // ldgo + mkldnn::memory::dims src_iter_tz = {1, 2, nstates, batch_size, hidden_size}; // ldsnc + mkldnn::memory::dims dst_iter_tz = {1, 2, nstates, batch_size, hidden_size}; // ldsnc + mkldnn::memory::dims weights_layer_r_tz = {1, 1, input_size, ngates, hidden_size}; + mkldnn::memory::dims weights_iter_r_tz = {1, 1, hidden_size, ngates, hidden_size}; bool has_adjusted = false; if (!initialized || is_train) { if (mode == rnn_enum::kGru) { - AdjustGruWeightGateOrder(wx, I, H); - AdjustGruWeightGateOrder(back_wx, I, H); - AdjustGruWeightGateOrder(wh, H, H); - AdjustGruWeightGateOrder(back_wh, H, H); + AdjustGruWeightGateOrder(wx, input_size, hidden_size); + AdjustGruWeightGateOrder(back_wx, input_size, hidden_size); + AdjustGruWeightGateOrder(wh, hidden_size, hidden_size); + AdjustGruWeightGateOrder(back_wh, hidden_size, hidden_size); has_adjusted = true; } - auto src_wx = mkldnn_mems->concat_weight_memory[2 * layer_index]; - auto src_wh = mkldnn_mems->concat_weight_memory[2 * layer_index + 1]; + mkldnn::memory& src_wx = mkldnn_mems->concat_weight_memory[2 * layer_index]; + mkldnn::memory& src_wh = mkldnn_mems->concat_weight_memory[2 * layer_index + 1]; std::vector srcs_data1; srcs_data1.push_back(wx); srcs_data1.push_back(back_wx); @@ -278,20 +263,19 @@ static void 
MKLDNNRNNForwardSingleLayerBi(bool state_outputs, // While mxnet gru gate order is reset, update and new gates, // mkldnn gru gate order is update, reset and new gates. So // we need to swap the order of reset and update from mxnet. - const index_t single_b_sz = nbias * H; - #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < H; j++) { - user_bias[j + H] = bx[j] + bh[j]; - user_bias[single_b_sz + j + H] = back_bx[j] + back_bh[j]; - user_bias[j] = bx[j + H] + bh[j + H]; - user_bias[single_b_sz + j] = back_bx[j + H] + back_bh[j + H]; - } + const index_t single_b_sz = nbias * hidden_size; #pragma omp parallel for num_threads(omp_threads) - for (int j = 2 * H; j < 3 * H; j++) { - user_bias[j] = bx[j]; - user_bias[j + H] = bh[j]; - user_bias[single_b_sz + j] = back_bx[j]; - user_bias[single_b_sz + j + H] = back_bh[j]; + for (int j = 0; j < hidden_size; j++) { + user_bias[j + hidden_size] = bx[j] + bh[j]; + user_bias[single_b_sz + j + hidden_size] = back_bx[j] + back_bh[j]; + + user_bias[j] = bx[j + hidden_size] + bh[j + hidden_size]; + user_bias[single_b_sz + j] = back_bx[j + hidden_size] + back_bh[j + hidden_size]; + + user_bias[j + 2 * hidden_size] = bx[j + 2 * hidden_size]; + user_bias[j + 3 * hidden_size] = bh[j + 2 * hidden_size]; + user_bias[single_b_sz + j + 2 * hidden_size] = back_bx[j + 2 * hidden_size]; + user_bias[single_b_sz + j + 3 * hidden_size] = back_bh[j + 2 * hidden_size]; } } else { #pragma omp parallel for num_threads(omp_threads) @@ -313,32 +297,35 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, auto dst_iter_md = mkldnn::memory::desc( { dst_iter_tz }, mkldnn_dtype, mkldnn::memory::format::ldsnc); auto src_iter_md = mkldnn::memory::desc( - {src_iter_tz}, mkldnn_dtype, mkldnn::memory::format::ldsnc); - auto bias_md = mkldnn::memory::desc({bias_tz}, - mkldnn_dtype, mkldnn::memory::format::ldgo); + { src_iter_tz }, mkldnn_dtype, mkldnn::memory::format::ldsnc); + auto bias_md = mkldnn::memory::desc( + { bias_tz }, mkldnn_dtype, mkldnn::memory::format::ldgo); - auto user_src_iter_memory = mkldnn_mems->concat_iter_memory[2]; + mkldnn::memory& user_src_iter_memory = mkldnn_mems->concat_iter_memory[2]; if (mode == rnn_enum::kLstm) { std::vector srcs_data1; srcs_data1.push_back(hx_ptr); srcs_data1.push_back(cx_ptr); - auto tmp1_src_iter_memory = mkldnn_mems->concat_iter_memory[0]; + mkldnn::memory& tmp1_src_iter_memory = mkldnn_mems->concat_iter_memory[0]; ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, - {{1, 1, 1, N, H}, {1, 1, 1, N, H}}, {1, 1, nstates, N, H}, mkldnn_dtype, 2, - srcs_data1, tmp1_src_iter_memory, &(mkldnn_mems->uni_states_memory)); + {{1, 1, 1, batch_size, hidden_size}, {1, 1, 1, batch_size, hidden_size}}, + {1, 1, nstates, batch_size, hidden_size}, mkldnn_dtype, 2, srcs_data1, + tmp1_src_iter_memory, &(mkldnn_mems->uni_states_memory)); std::vector srcs_data2; srcs_data2.push_back(hx_ptr + single_cell_size); srcs_data2.push_back(cx_ptr + single_cell_size); - auto tmp2_src_iter_memory = mkldnn_mems->concat_iter_memory[1]; + mkldnn::memory& tmp2_src_iter_memory = mkldnn_mems->concat_iter_memory[1]; ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, - {{1, 1, 1, N, H}, {1, 1, 1, N, H}}, {1, 1, nstates, N, H}, mkldnn_dtype, 2, - srcs_data2, tmp2_src_iter_memory, &(mkldnn_mems->uni_states_memory)); + {{1, 1, 1, batch_size, hidden_size}, {1, 1, 1, batch_size, hidden_size}}, + {1, 1, nstates, batch_size, hidden_size}, mkldnn_dtype, 2, srcs_data2, + tmp2_src_iter_memory, 
&(mkldnn_mems->uni_states_memory)); std::vector srcs_data3; srcs_data3.push_back(reinterpret_cast(tmp1_src_iter_memory.get_data_handle())); srcs_data3.push_back(reinterpret_cast(tmp2_src_iter_memory.get_data_handle())); ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, - {{1, 1, nstates, N, H}, {1, 1, nstates, N, H}}, {1, 2, nstates, N, H}, - mkldnn_dtype, 1, srcs_data3, user_src_iter_memory, &(mkldnn_mems->concat_states_memory)); + {{1, 1, nstates, batch_size, hidden_size}, {1, 1, nstates, batch_size, hidden_size}}, + {1, 2, nstates, batch_size, hidden_size}, mkldnn_dtype, 1, srcs_data3, + user_src_iter_memory, &(mkldnn_mems->concat_states_memory)); } else { user_src_iter_memory.set_data_handle(hx_ptr); } @@ -394,21 +381,21 @@ static void MKLDNNRNNForwardSingleLayerBi(bool state_outputs, } } if (has_adjusted) { - AdjustGruWeightGateOrder(wx, I, H); - AdjustGruWeightGateOrder(back_wx, I, H); - AdjustGruWeightGateOrder(wh, H, H); - AdjustGruWeightGateOrder(back_wh, H, H); + AdjustGruWeightGateOrder(wx, input_size, hidden_size); + AdjustGruWeightGateOrder(back_wx, input_size, hidden_size); + AdjustGruWeightGateOrder(wh, hidden_size, hidden_size); + AdjustGruWeightGateOrder(back_wh, hidden_size, hidden_size); } } template -static void MKLDNNRNNForwardUnidi(bool state_outputs, - const int L, - const int T, - const int N, - const int I, - const int H, +static void MKLDNNRNNForwardUnidi(const bool state_outputs, + const int num_layer, + const int seq_len, + const int batch_size, + const int input_size, + const int hidden_size, DType* x_ptr, DType* hx_ptr, DType* cx_ptr, @@ -428,25 +415,25 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, algorithm nalgorithm = GetMKLDNNRNNAlgo(mode, &ngates, &nstates); const int nbias = (mode == rnn_enum::kGru ? 
ngates + 1 : ngates); mkldnn::memory::data_type mkldnn_dtype = get_mkldnn_type(dtype); - const int cell_size = N * H; - const int single_cell_size = N * H; - const int single_b_size = nbias * H; - int w_size = (I + H) * H * ngates; + const int cell_size = batch_size * hidden_size; + const int single_cell_size = batch_size * hidden_size; + const int single_b_size = nbias * hidden_size; + const int w_size = (input_size + hidden_size) * hidden_size * ngates; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); auto cpu_engine = CpuEngine::Get()->get_engine(); auto null_memory_ = null_memory(cpu_engine); int offset1 = 0, offset2 = 0; bool initialized = *has_cache; - mkldnn::memory::dims src_layer_tz = {T, N, I}; - mkldnn::memory::dims dst_layer_tz = {T, N, H}; - mkldnn::memory::dims weights_layer_tz = {L, 1, I, ngates, H}; // ldigo - mkldnn::memory::dims weights_iter_tz = {L, 1, H, ngates, H}; // ldigo - mkldnn::memory::dims bias_tz = {L, 1, nbias, H}; // ldgo - mkldnn::memory::dims src_iter_tz = {L, 1, nstates, N, H}; // ldsnc - mkldnn::memory::dims dst_iter_tz = {L, 1, nstates, N, H}; // ldsnc - mkldnn::memory::dims weights_layer_r_tz = {1, 1, I, ngates, H}; // ldigo for reorder - mkldnn::memory::dims weights_iter_r_tz = {1, 1, H, ngates, H}; // ldigo for reorder + mkldnn::memory::dims src_layer_tz = {seq_len, batch_size, input_size}; + mkldnn::memory::dims dst_layer_tz = {seq_len, batch_size, hidden_size}; + mkldnn::memory::dims weights_layer_tz = {num_layer, 1, input_size, ngates, hidden_size}; // ldigo + mkldnn::memory::dims weights_iter_tz = {num_layer, 1, hidden_size, ngates, hidden_size}; // ldigo + mkldnn::memory::dims bias_tz = {num_layer, 1, nbias, hidden_size}; // ldgo + mkldnn::memory::dims src_iter_tz = {num_layer, 1, nstates, batch_size, hidden_size}; // ldsnc + mkldnn::memory::dims dst_iter_tz = {num_layer, 1, nstates, batch_size, hidden_size}; // ldsnc + mkldnn::memory::dims weights_layer_r_tz = {1, 1, input_size, ngates, hidden_size}; + mkldnn::memory::dims weights_iter_r_tz = {1, 1, hidden_size, ngates, hidden_size}; auto weight_layer_md = mkldnn::memory::desc( { weights_layer_tz }, mkldnn_dtype, mkldnn::memory::format::ldigo); @@ -463,15 +450,16 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, auto dst_iter_md = mkldnn::memory::desc( {dst_iter_tz}, mkldnn_dtype, mkldnn::memory::format::ldsnc); - for (int l = 0; l < L; l++) { + for (int l = 0; l < num_layer; l++) { if (mode == rnn_enum::kLstm) { std::vector srcs_data; srcs_data.push_back(hx_ptr); srcs_data.push_back(cx_ptr); - auto tmp_src_iter_memory = mkldnn_mems->concat_iter_memory[l + layer_index]; + mkldnn::memory& tmp_src_iter_memory = mkldnn_mems->concat_iter_memory[l + layer_index]; ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, - {{1, 1, 1, N, H}, {1, 1, 1, N, H}}, {1, 1, nstates, N, H}, mkldnn_dtype, - 2, srcs_data, tmp_src_iter_memory, &(mkldnn_mems->uni_states_memory)); + {{1, 1, 1, batch_size, hidden_size}, {1, 1, 1, batch_size, hidden_size}}, + {1, 1, nstates, batch_size, hidden_size}, mkldnn_dtype, 2, srcs_data, + tmp_src_iter_memory, &(mkldnn_mems->uni_states_memory)); } else { mkldnn_mems->concat_iter_memory[l + layer_index].set_data_handle(hx_ptr); } @@ -481,26 +469,26 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, } } - auto user_src_iter_memory = null_memory_; - if (L == 1) { - user_src_iter_memory = mkldnn_mems->concat_iter_memory[layer_index]; + mkldnn::memory* user_src_iter_memory; + if (num_layer == 1) { + 
user_src_iter_memory = &(mkldnn_mems->concat_iter_memory[layer_index]); } else { - user_src_iter_memory = mkldnn_mems->concat_iter_memory[L + layer_index]; + user_src_iter_memory = &(mkldnn_mems->concat_iter_memory[num_layer + layer_index]); std::vector src_l_data; std::vector src_l_dim; - for (int l = 0; l < L; l++) { + for (int l = 0; l < num_layer; l++) { src_l_data.push_back(reinterpret_cast (mkldnn_mems->concat_iter_memory[l + layer_index].get_data_handle())); - src_l_dim.push_back({1, 1, nstates, N, H}); + src_l_dim.push_back({1, 1, nstates, batch_size, hidden_size}); } ConcatData(mkldnn::memory::format::ldsnc, mkldnn::memory::format::ldsnc, src_l_dim, - {L, 1, nstates, N, H}, mkldnn_dtype, 0, src_l_data, user_src_iter_memory, - &(mkldnn_mems->concat_states_memory)); + {num_layer, 1, nstates, batch_size, hidden_size}, mkldnn_dtype, 0, src_l_data, + *user_src_iter_memory, &(mkldnn_mems->concat_states_memory)); } - mkldnn_mems->hcx_memory[layer_index].set_data_handle(user_src_iter_memory.get_data_handle()); + mkldnn_mems->hcx_memory[layer_index].set_data_handle(user_src_iter_memory->get_data_handle()); - auto src_wx_f = mkldnn_mems->concat_weight_memory[2 * layer_index]; - auto src_wh_f = mkldnn_mems->concat_weight_memory[2 * layer_index + 1]; + mkldnn::memory& src_wx_f = mkldnn_mems->concat_weight_memory[2 * layer_index]; + mkldnn::memory& src_wh_f = mkldnn_mems->concat_weight_memory[2 * layer_index + 1]; std::vector srcs_data_x; std::vector srcs_data_h; @@ -509,23 +497,23 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, bool has_adjusted = false; if (!initialized || is_train) { - if (L == 1) { + if (num_layer == 1) { DType* wx = w_ptr; - DType* wh = wx + I * H * ngates; + DType* wh = wx + input_size * hidden_size * ngates; if (mode == rnn_enum::kGru) { - AdjustGruWeightGateOrder(wx, I, H); - AdjustGruWeightGateOrder(wh, H, H); + AdjustGruWeightGateOrder(wx, input_size, hidden_size); + AdjustGruWeightGateOrder(wh, hidden_size, hidden_size); has_adjusted = true; } src_wx_f.set_data_handle(wx); src_wh_f.set_data_handle(wh); } else { - for (int l = 0; l < L; l++) { + for (int l = 0; l < num_layer; l++) { DType* wx = w_ptr + l * w_size; - DType* wh = wx + I * H * ngates; + DType* wh = wx + input_size * hidden_size * ngates; if (mode == rnn_enum::kGru) { - AdjustGruWeightGateOrder(wx, I, H); - AdjustGruWeightGateOrder(wh, H, H); + AdjustGruWeightGateOrder(wx, input_size, hidden_size); + AdjustGruWeightGateOrder(wh, hidden_size, hidden_size); has_adjusted = true; } srcs_data_x.push_back(wx); @@ -546,30 +534,30 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, DType* user_bias_f = reinterpret_cast( mkldnn_mems->bias_memory[layer_index].get_data_handle()); if (mode == rnn_enum::kGru) { - const int mx_single_b_sz = ngates * H; - for (int l = 0; l < L; l++) { + const int mx_single_b_sz = ngates * hidden_size; + for (int l = 0; l < num_layer; l++) { #pragma omp parallel for num_threads(omp_threads) - for (int g = 0; g < H; g++) { + for (int g = 0; g < hidden_size; g++) { // While mxnet gru gate order is reset, update and new gates, // mkldnn gru gate order is update, reset and new gates. So // we need to swap the order of reset and update from mxnet. 
- user_bias_f[g + H + l * single_b_size] = + user_bias_f[g + hidden_size + l * single_b_size] = b_ptr[g + l * mx_single_b_sz * 2] + b_ptr[g + l * mx_single_b_sz * 2 + mx_single_b_sz]; + user_bias_f[g + l * single_b_size] = - b_ptr[g + H + l * mx_single_b_sz * 2] - + b_ptr[g + H + l * mx_single_b_sz * 2 + mx_single_b_sz]; - } - #pragma omp parallel for num_threads(omp_threads) - for (int g = 2 * H; g < 3 * H; g++) { - user_bias_f[g + l * single_b_size] = b_ptr[g + l * mx_single_b_sz * 2]; - user_bias_f[g + l * single_b_size + H] = - b_ptr[g + l * mx_single_b_sz * 2 + mx_single_b_sz]; + b_ptr[g + hidden_size + l * mx_single_b_sz * 2] + + b_ptr[g + hidden_size + l * mx_single_b_sz * 2 + mx_single_b_sz]; + + user_bias_f[g + l * single_b_size + 2 * hidden_size] = + b_ptr[g + l * mx_single_b_sz * 2 + 2 * hidden_size]; + user_bias_f[g + l * single_b_size + 3 * hidden_size] = + b_ptr[g + 2 * hidden_size + l * mx_single_b_sz * 2 + mx_single_b_sz]; } } } else { #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < L * single_b_size; j++) { + for (int j = 0; j < num_layer * single_b_size; j++) { int k = j / single_b_size; user_bias_f[j] = b_ptr[j + k * single_b_size] + b_ptr[j + k * single_b_size + single_b_size]; @@ -610,7 +598,7 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, DType* dst_hcy = reinterpret_cast( mkldnn_mems->hcy_memory[layer_index].get_data_handle()); if (mode == rnn_enum::kLstm) { - for (int l = 0; l < L; l++) { + for (int l = 0; l < num_layer; l++) { offset1 = l * single_cell_size; offset2 = l * nstates * single_cell_size; #pragma omp parallel for num_threads(omp_threads) @@ -621,29 +609,29 @@ static void MKLDNNRNNForwardUnidi(bool state_outputs, } } else { #pragma omp parallel for num_threads(omp_threads) - for (int n = 0; n < L * single_cell_size; n++) { + for (int n = 0; n < num_layer * single_cell_size; n++) { hy_ptr[n] = dst_hcy[n]; } } } if (has_adjusted) { - for (int l = 0; l < L; l++) { + for (int l = 0; l < num_layer; l++) { DType* wx = w_ptr + l * w_size; - DType* wh = wx + I * H * ngates; - AdjustGruWeightGateOrder(wx, I, H); - AdjustGruWeightGateOrder(wh, H, H); + DType* wh = wx + input_size * hidden_size * ngates; + AdjustGruWeightGateOrder(wx, input_size, hidden_size); + AdjustGruWeightGateOrder(wh, hidden_size, hidden_size); } } } template -static void MKLDNNRNNForward(bool state_outputs, - const int L, - const int D, - const int T, - const int N, - const int I, - const int H, +static void MKLDNNRNNForward(const bool state_outputs, + const int num_layer, + const int direction, + const int seq_len, + const int batch_size, + const int input_size, + const int hidden_size, DType* x_ptr, DType* hx_ptr, DType* cx_ptr, @@ -660,38 +648,35 @@ static void MKLDNNRNNForward(bool state_outputs, int mode) { int ngates = 0, nstates = 0; GetMKLDNNRNNAlgo(mode, &ngates, &nstates); - const int b_size = 2 * H * ngates * D; - const int cell_size = N * H * D; + const int b_size = 2 * hidden_size * ngates * direction; + const int cell_size = batch_size * hidden_size * direction; // First layer - int w_size = (I + H) * H * ngates * D; + int w_size = (input_size + hidden_size) * hidden_size * ngates * direction; DType* tmpNull = NULL; - // when D = 1 and I == H, L layers can be fused together - if (D == 1 && I == H) { - MKLDNNRNNForwardUnidi(state_outputs, L, T, N, I, H, x_ptr, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, - mkldnn_mems, rnn_forward_prim, - 0, has_cache, dtype, is_train, mode); + // when direction = 1 and input_size == hidden_size, 
num_layer layers can be fused together + if (direction == 1 && input_size == hidden_size) { + MKLDNNRNNForwardUnidi(state_outputs, num_layer, seq_len, batch_size, input_size, + hidden_size, x_ptr, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, + mkldnn_mems, rnn_forward_prim, 0, has_cache, dtype, is_train, mode); } else { - if (D == 2) { - MKLDNNRNNForwardSingleLayerBi(state_outputs, T, N, I, H, x_ptr, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, - mkldnn_mems, rnn_forward_prim, - 0, has_cache, dtype, is_train, mode); + if (direction == 2) { + MKLDNNRNNForwardSingleLayerBi(state_outputs, seq_len, batch_size, input_size, + hidden_size, x_ptr, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, + mkldnn_mems, rnn_forward_prim, 0, has_cache, dtype, is_train, mode); } else { - MKLDNNRNNForwardUnidi(state_outputs, 1, T, N, I, H, x_ptr, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, - mkldnn_mems, rnn_forward_prim, + MKLDNNRNNForwardUnidi(state_outputs, 1, seq_len, batch_size, input_size, hidden_size, x_ptr, + hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, mkldnn_mems, rnn_forward_prim, 0, has_cache, dtype, is_train, mode); } - if (L > 1) { + if (num_layer > 1) { mkldnn_mems->user_src_layer_memory_l = mkldnn_mems->y_memory[0]; - // go to next L - 1 layers. - // If D = 2, do it layer by layer. If D = 1, fused L - 1 layers + // go to next num_layer - 1 layers. + // If direction = 2, do it layer by layer. If direction = 1, fused num_layer - 1 layers w_ptr += w_size; b_ptr += b_size; - if (D == 2) { - w_size = (H * D + H) * H * ngates * D; - for (int l = 0; l < L - 1; l++) { + if (direction == 2) { + w_size = (hidden_size * direction + hidden_size) * hidden_size * ngates * direction; + for (int l = 0; l < num_layer - 1; l++) { if (state_outputs) { hy_ptr += cell_size; if (mode == rnn_enum::kLstm) { @@ -702,26 +687,27 @@ static void MKLDNNRNNForward(bool state_outputs, if (mode == rnn_enum::kLstm) { cx_ptr += cell_size; } - MKLDNNRNNForwardSingleLayerBi(state_outputs, T, N, D * H, H, tmpNull, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, - cy_ptr, mkldnn_mems, rnn_forward_prim, - 1, has_cache, dtype, is_train, mode); + MKLDNNRNNForwardSingleLayerBi(state_outputs, seq_len, batch_size, + direction * hidden_size, hidden_size, tmpNull, hx_ptr, cx_ptr, w_ptr, b_ptr, + y_ptr, hy_ptr, cy_ptr, mkldnn_mems, rnn_forward_prim, 1, has_cache, dtype, + is_train, mode); mkldnn_mems->user_src_layer_memory_l = mkldnn_mems->y_memory[1]; w_ptr += w_size; b_ptr += b_size; } } - if (D == 1) { + if (direction == 1) { if (state_outputs) { hy_ptr += cell_size; if (mode == rnn_enum::kLstm) { cy_ptr += cell_size; } } - w_size = (H + H) * H * ngates; - MKLDNNRNNForwardUnidi(state_outputs, L - 1, T, N, H, H, tmpNull, - hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, mkldnn_mems, - rnn_forward_prim, 1, has_cache, dtype, is_train, mode); + w_size = (hidden_size + hidden_size) * hidden_size * ngates; + MKLDNNRNNForwardUnidi(state_outputs, num_layer - 1, seq_len, batch_size, + hidden_size, hidden_size, tmpNull, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, + hy_ptr, cy_ptr, mkldnn_mems, rnn_forward_prim, 1, has_cache, dtype, + is_train, mode); } } } @@ -729,7 +715,7 @@ static void MKLDNNRNNForward(bool state_outputs, } template -static void MKLDNNRNNForwardInference(bool state_outputs, +static void MKLDNNRNNForwardInference(const bool state_outputs, const int num_layers, const int direction, const int seq_length, From 794b1909a86ebd762584beb857e954250344054c Mon Sep 17 00:00:00 2001 From: Leonard Lausen 
Date: Wed, 31 Jul 2019 20:26:55 +0200 Subject: [PATCH 08/24] Add missing default axis value to symbol.squeeze op (#15707) * Add missing default arg * Add test * add test --- python/mxnet/symbol/symbol.py | 2 +- tests/python/unittest/test_gluon.py | 24 ++++++++++++++++++++---- tests/python/unittest/test_symbol.py | 1 + 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index deedf0fe83d2..1e2defab3713 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -2539,7 +2539,7 @@ def softmin(self, *args, **kwargs): """ return op.softmin(self, *args, **kwargs) - def squeeze(self, axis, inplace=False, **kwargs): # pylint: disable=unused-argument + def squeeze(self, axis=None, inplace=False, **kwargs): # pylint: disable=unused-argument """Convenience fluent method for :py:func:`squeeze`. The arguments are the same as for :py:func:`squeeze`, with diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index b59ce2d0864c..af30980b10ea 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -115,7 +115,7 @@ def test_parameter_dict(): params1.get('w1', shape=(10, 10), stype='row_sparse') params1.load('test_parameter_dict.params', ctx) trainer1 = mx.gluon.Trainer(params1, 'sgd') - + # compare the values before and after save/load cur_w0 = params1.get('w0').data(ctx) cur_w1 = params1.get('w1').row_sparse_data(all_row_ids) @@ -134,7 +134,7 @@ def test_parameter_dict(): cur_w1 = params2.get('w1').data(ctx) mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) - + # test the dtype casting functionality params0 = gluon.ParameterDict('') params0.get('w0', shape=(10, 10), dtype='float32') @@ -386,7 +386,7 @@ def hybrid_forward(self, F, x): if 'conv' in param_name and 'weight' in param_name: break assert np.dtype(net_fp64.params[param_name].dtype) == np.dtype(np.float64) - + # 3.b Verify same functionnality with the imports API net_fp_64 = mx.gluon.SymbolBlock.imports(sym_file, 'data', params_file, ctx=ctx) @@ -2788,7 +2788,7 @@ def test_gluon_param_load(): net.cast('float16') net.load_parameters('test_gluon_param_load.params', cast_dtype=True) mx.nd.waitall() - + @with_seed() def test_gluon_param_load_dtype_source(): net = mx.gluon.nn.Dense(10, in_units=10) @@ -2800,6 +2800,22 @@ def test_gluon_param_load_dtype_source(): assert net.weight.dtype == np.float16 mx.nd.waitall() +@with_seed() +def test_squeeze_consistency(): + class Foo(gluon.HybridBlock): + def __init__(self, inplace, **kwargs): + super(Foo, self).__init__(**kwargs) + self.inplace = inplace + + def forward(self, x): + return x.squeeze(inplace=self.inplace) + + for inplace in (True, False): + block = Foo(inplace) + block.hybridize() + shape = (np.random.randint(1, 10), np.random.randint(1, 10), 1) + block(mx.nd.ones(shape)) + if __name__ == '__main__': import nose nose.runmodule() diff --git a/tests/python/unittest/test_symbol.py b/tests/python/unittest/test_symbol.py index 2dfe3e44eedb..0c97c68b0880 100644 --- a/tests/python/unittest/test_symbol.py +++ b/tests/python/unittest/test_symbol.py @@ -242,6 +242,7 @@ def check_fluent_regular(func, kwargs, shape=(5, 17, 1), equal_nan=False): check_fluent_regular('reshape', {'shape': (17, 1, 5)}) check_fluent_regular('broadcast_to', {'shape': (5, 17, 47)}) check_fluent_regular('squeeze', {'axis': (1, 3)}, shape=(2, 1, 3, 1, 4)) + 
check_fluent_regular('squeeze', {}, shape=(2, 1, 3, 1, 4)) def check_symbol_consistency(sym1, sym2, ctx, skip_grad=False, equal_nan=False): assert sym1.list_arguments() == sym2.list_arguments() From 0042c49aa91ac2c9e2f721336beb7b708fa5e806 Mon Sep 17 00:00:00 2001 From: Haohuan Wang Date: Wed, 31 Jul 2019 14:36:14 -0700 Subject: [PATCH 09/24] add deconv in TRT subgraph (#15666) --- .../subgraph/tensorrt/nnvm_to_onnx-inl.h | 19 +++++- .../subgraph/tensorrt/nnvm_to_onnx.cc | 46 ++++++++++---- src/operator/subgraph/tensorrt/tensorrt-inl.h | 2 + tests/python/tensorrt/test_tensorrt_deconv.py | 63 +++++++++++++++++++ 4 files changed, 116 insertions(+), 14 deletions(-) create mode 100644 tests/python/tensorrt/test_tensorrt_deconv.py diff --git a/src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h b/src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h index 55b3d938df0a..5a433f1d9820 100644 --- a/src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h +++ b/src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h @@ -41,6 +41,8 @@ namespace mxnet { namespace op { namespace nnvm_to_onnx { +enum ConvDeconvType {Convolution, Deconvolution}; + using namespace nnvm; using namespace ::onnx; using int64 = ::google::protobuf::int64; @@ -48,8 +50,7 @@ using int64 = ::google::protobuf::int64; std::unordered_map GetPlaceholderShapes(const ShapeVector& shape_inputs, const nnvm::IndexedGraph& ig); -std::unordered_map GetPlaceholderDTypes(const DTypeVector& -dtype_inputs, +std::unordered_map GetPlaceholderDTypes(const DTypeVector& dtype_inputs, const nnvm::IndexedGraph& ig); std::unordered_map GetOutputLookup(const nnvm::IndexedGraph& ig); @@ -74,12 +75,25 @@ typedef void (*ConverterFunction)(NodeProto *node_proto, const nnvm::IndexedGraph &ig, const array_view &inputs); +template +void ConvDeconvConvertHelper(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs, + const ConvDeconvParam& param, + ConvDeconvType type); + // Forward declarations void ConvertConvolution(NodeProto *node_proto, const NodeAttrs &attrs, const nnvm::IndexedGraph &ig, const array_view &inputs); +void ConvertDeconvolution(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + void ConvertPooling(NodeProto *node_proto, const NodeAttrs &attrs, const nnvm::IndexedGraph &ig, @@ -158,6 +172,7 @@ static const std::unordered_map converter_map = {"BatchNorm", ConvertBatchNorm}, {"clip", ConvertClip}, {"Convolution", ConvertConvolution}, + {"Deconvolution", ConvertDeconvolution}, {"Concat", ConvertConcatenate}, {"Dropout", ConvertDropout}, {"elemwise_add", ConvertElementwiseAdd}, diff --git a/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc b/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc index 6116f296e300..84580d0b05d0 100644 --- a/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc +++ b/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc @@ -31,6 +31,7 @@ #include #include #include +#include #include "../../../common/utils.h" #include "../../../ndarray/ndarray_function.h" @@ -170,20 +171,25 @@ std::string ConvertNnvmGraphToOnnx( return serialized_onnx_graph; } -void ConvertConvolution(NodeProto* node_proto, const NodeAttrs& attrs, - const nnvm::IndexedGraph& /*ig*/, - const array_view& /*inputs*/) { - const auto& conv_param = nnvm::get(attrs.parsed); - - node_proto->set_op_type("Conv"); +template +void ConvDeconvConvertHelper(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*input*/, + const ConvDeconvParam& 
param, + ConvDeconvType type) { + if (type == ConvDeconvType::Convolution) { + node_proto->set_op_type("Conv"); + } else { + node_proto->set_op_type("ConvTranspose"); + } - const mxnet::TShape kernel = conv_param.kernel; - const mxnet::TShape stride = conv_param.stride; - const mxnet::TShape dilate = conv_param.dilate; - const mxnet::TShape pad = conv_param.pad; - const uint32_t num_group = conv_param.num_group; + const mxnet::TShape kernel = param.kernel; + const mxnet::TShape stride = param.stride; + const mxnet::TShape dilate = param.dilate; + const mxnet::TShape pad = param.pad; + const uint32_t num_group = param.num_group; // const bool no_bias = conv_param.no_bias; - const dmlc::optional layout = conv_param.layout; + const dmlc::optional layout = param.layout; // dilations AttributeProto* const dilations = node_proto->add_attribute(); @@ -226,8 +232,24 @@ void ConvertConvolution(NodeProto* node_proto, const NodeAttrs& attrs, for (const dim_t kval : stride) { strides->add_ints(static_cast(kval)); } +} + +void ConvertConvolution(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& ig, + const array_view& inputs) { + const auto& conv_param = nnvm::get(attrs.parsed); + ConvDeconvConvertHelper(node_proto, attrs, ig, inputs, conv_param, + ConvDeconvType::Convolution); } // end ConvertConvolution +void ConvertDeconvolution(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& ig, + const array_view& inputs) { + const auto& deconv_param = nnvm::get(attrs.parsed); + ConvDeconvConvertHelper(node_proto, attrs, ig, inputs, deconv_param, + ConvDeconvType::Deconvolution); +} // end ConvertDeconvolution + void ConvertPooling(NodeProto* node_proto, const NodeAttrs& attrs, const nnvm::IndexedGraph& /*ig*/, const array_view& /*inputs*/) { diff --git a/src/operator/subgraph/tensorrt/tensorrt-inl.h b/src/operator/subgraph/tensorrt/tensorrt-inl.h index e258d892aaba..a6b93f10598a 100644 --- a/src/operator/subgraph/tensorrt/tensorrt-inl.h +++ b/src/operator/subgraph/tensorrt/tensorrt-inl.h @@ -88,6 +88,7 @@ class TensorrtSelector : public SubgraphSelector { "clip", "Concat", "Convolution", + "Deconvolution", "Dropout", "elemwise_add", "elemwise_sub", @@ -104,6 +105,7 @@ class TensorrtSelector : public SubgraphSelector { const std::unordered_set withWeightsOps = { "BatchNorm", "Convolution", + "Deconvolution", "FullyConnected" }; diff --git a/tests/python/tensorrt/test_tensorrt_deconv.py b/tests/python/tensorrt/test_tensorrt_deconv.py new file mode 100644 index 000000000000..ef567d1dae3c --- /dev/null +++ b/tests/python/tensorrt/test_tensorrt_deconv.py @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import mxnet as mx +from mxnet.test_utils import assert_almost_equal + +def get_params(): + arg_params = {} + aux_params = {} + arg_params["trt_bn_test_conv_weight"] = mx.nd.ones((1, 1, 3, 3)) + arg_params["trt_bn_test_deconv_weight"] = mx.nd.ones((1, 1, 3, 3)) + return arg_params, aux_params + +def get_symbol(): + data = mx.sym.Variable("data") + conv = mx.sym.Convolution(data=data, kernel=(3,3), no_bias=True, num_filter=1, num_group=1, + name="trt_bn_test_conv") + deconv = mx.sym.Deconvolution(data=conv, kernel=(3, 3), no_bias=True, num_filter=1, + num_group=1, name="trt_bn_test_deconv") + return deconv + +def test_deconvolution_produce_same_output_as_tensorrt(): + arg_params, aux_params = get_params() + arg_params_trt, aux_params_trt = get_params() + + sym = get_symbol() + sym_trt = get_symbol().get_backend_symbol("TensorRT") + + mx.contrib.tensorrt.init_tensorrt_params(sym_trt, arg_params_trt, aux_params_trt) + + executor = sym.simple_bind(ctx=mx.gpu(), data=(1, 1, 3, 3), grad_req='null', force_rebind=True) + executor.copy_params_from(arg_params, aux_params) + + executor_trt = sym_trt.simple_bind(ctx=mx.gpu(), data=(1, 1, 3, 3), grad_req='null', + force_rebind=True) + executor_trt.copy_params_from(arg_params_trt, aux_params_trt) + + input_data = mx.nd.random.uniform(low=0, high=1, shape=(1, 1, 3, 3)) + + y = executor.forward(is_train=False, data=input_data) + y_trt = executor_trt.forward(is_train=False, data=input_data) + + print(y[0].asnumpy()) + print(y_trt[0].asnumpy()) + assert_almost_equal(y[0].asnumpy(), y_trt[0].asnumpy(), 1e-4, 1e-4) + +if __name__ == '__main__': + import nose + nose.runmodule() From 0bfac7d11a39358ba1ec8ff578d9388b0bcad53a Mon Sep 17 00:00:00 2001 From: Cody Allen Date: Wed, 31 Jul 2019 16:09:23 -0700 Subject: [PATCH 10/24] Fix Scala Symbolic API some/Some typo (#15687) --- docs/api/scala/symbol.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api/scala/symbol.md b/docs/api/scala/symbol.md index aaddc2a8a2f0..f92548e4820d 100644 --- a/docs/api/scala/symbol.md +++ b/docs/api/scala/symbol.md @@ -41,7 +41,7 @@ The following example configures a two-layer neural network. 
val data = Symbol.Variable("data") val fc1 = Symbol.api.FullyConnected(Some(data), num_hidden = 128, name = "fc1") val act1 = Symbol.api.Activation(Some(fc1), "relu", "relu1") - val fc2 = Symbol.api.FullyConnected(some(act1), num_hidden = 64, name = "fc2") + val fc2 = Symbol.api.FullyConnected(Some(act1), num_hidden = 64, name = "fc2") val net = Symbol.api.SoftmaxOutput(Some(fc2), name = "out") :type net // org.apache.mxnet.Symbol From d6c17faf2f2d1cab51b6a0700581174df115e059 Mon Sep 17 00:00:00 2001 From: Xinyu Chen Date: Thu, 1 Aug 2019 12:43:36 +0800 Subject: [PATCH 11/24] Add MKLDNN 4c layout to fix gluoncv se_resnext101_64x4d (#15692) * add 4c type * trigger --- src/operator/nn/mkldnn/mkldnn_base.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc index e36a0f008821..a13337b122c3 100644 --- a/src/operator/nn/mkldnn/mkldnn_base.cc +++ b/src/operator/nn/mkldnn/mkldnn_base.cc @@ -329,6 +329,7 @@ mkldnn_memory_format_t GetDefaultFormat(const mkldnn::memory::desc &desc) { case mkldnn_nchw: case mkldnn_nhwc: case mkldnn_chwn: + case mkldnn_nChw4c: case mkldnn_nChw8c: case mkldnn_nChw16c: return mkldnn_nchw; @@ -338,6 +339,7 @@ mkldnn_memory_format_t GetDefaultFormat(const mkldnn::memory::desc &desc) { case mkldnn_iohw: case mkldnn_oIhw8i: case mkldnn_oIhw16i: + case mkldnn_OIhw4i4o: case mkldnn_OIhw8i8o: case mkldnn_hwio_s8s8: case mkldnn_OIhw16i16o: @@ -376,6 +378,7 @@ mkldnn_memory_format_t GetDefaultFormat(const mkldnn::memory::desc &desc) { case mkldnn_giohw: case mkldnn_hwigo: case mkldnn_hwigo_s8s8: + case mkldnn_gOIhw4i4o: case mkldnn_gOIhw8i8o: case mkldnn_gOIhw16i16o: case mkldnn_gOIhw4i16o4i: @@ -383,6 +386,7 @@ mkldnn_memory_format_t GetDefaultFormat(const mkldnn::memory::desc &desc) { case mkldnn_gOIhw8i16o2i: case mkldnn_gOIhw8o16i2o: case mkldnn_gOIhw8o8i: + case mkldnn_gOIhw4o4i: case mkldnn_gOIhw16o16i: case mkldnn_gIOhw16o16i: case mkldnn_gOihw8o: From 29ba4fb3662eccd2e383fe77127b4acfb8e7dbdd Mon Sep 17 00:00:00 2001 From: Zhennan Qin Date: Thu, 1 Aug 2019 13:54:15 +0800 Subject: [PATCH 12/24] Fix _copy_to on MKLDNN backend (#15637) * Fix _copy_to * Add comment --- src/imperative/imperative_utils.h | 34 ++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index b867162abc9b..07fe04782bd0 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -419,7 +419,14 @@ inline void PushFCompute(const FCompute& fn, // mapping from index in input_blobs to index in pre_temp_dst std::unordered_map in_temp_idx_map; #if MXNET_USE_MKLDNN == 1 - InvalidateOutputs(outputs, req); + if (exec_type != ExecType::kCrossDeviceCopy) { + // kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in + // its FCcomputeEx, but AsyncPush the copy operation to engine. + // So for the case that A is holding mkldnn memory, and then copy A to B, and then copy B + // back to A, we shouldn't invalidate outputs for copying B back to A, because at this time, + // copying A to B may not happen, and will corrupt A's memory. 
+ InvalidateOutputs(outputs, req); + } #endif std::vector tmp_req = req; // setup blobs @@ -461,7 +468,14 @@ inline void PushFComputeEx(const FComputeEx& fn, const auto& run = [=](RunContext rctx) { OpContext opctx{need_grad, is_train, rctx, engine::CallbackOnComplete(), requested}; #if MXNET_USE_MKLDNN == 1 - InvalidateOutputs(outputs, req); + if (exec_type != ExecType::kCrossDeviceCopy) { + // kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in + // its FCcomputeEx, but AsyncPush the copy operation to engine. + // So for the case that A is holding mkldnn memory, and then copy A to B, and then copy B + // back to A, we shouldn't invalidate outputs for copying B back to A, because at this time, + // copying A to B may not happen, and will corrupt A's memory. + InvalidateOutputs(outputs, req); + } #endif fn(attrs, opctx, inputs, req, outputs); if (ctx.dev_mask() == gpu::kDevMask && exec_type == ExecType::kSync && !rctx.is_bulk) { @@ -508,7 +522,14 @@ inline void PushOperator(const OpStatePtr& state, engine::CallbackOnComplete on_complete) { OpContext opctx{need_grad, is_train, rctx, on_complete, requested}; #if MXNET_USE_MKLDNN == 1 - InvalidateOutputs(outputs, req); + if (exec_type != ExecType::kCrossDeviceCopy) { + // kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in + // its FCcomputeEx, but AsyncPush the copy operation to engine. + // So for the case that A is holding mkldnn memory, and then copy A to B, and then copy B + // back to A, we shouldn't invalidate outputs for copying B back to A, because at this time, + // copying A to B may not happen, and will corrupt A's memory. + InvalidateOutputs(outputs, req); + } #endif fcompute_ex(state, opctx, inputs, req, outputs); if (ctx.dev_mask() == gpu::kDevMask && exec_type == ExecType::kSync @@ -547,7 +568,14 @@ inline void PushOperator(const OpStatePtr& state, // mapping from index in input_blobs to index in pre_temp_dst std::unordered_map in_temp_idx_map; #if MXNET_USE_MKLDNN == 1 + if (exec_type != ExecType::kCrossDeviceCopy) { + // kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in + // its FCcomputeEx, but AsyncPush the copy operation to engine. + // So for the case that A is holding mkldnn memory, and then copy A to B, and then copy B + // back to A, we shouldn't invalidate outputs for copying B back to A, because at this time, + // copying A to B may not happen, and will corrupt A's memory. InvalidateOutputs(outputs, req); + } #endif std::vector tmp_req = req; // populate input blobs and output blobs From 862423a70eaa79feea3e40872dc2b33587974c7a Mon Sep 17 00:00:00 2001 From: Pedro Larroy Date: Thu, 1 Aug 2019 12:59:15 -0700 Subject: [PATCH 13/24] [DOC] refine autograd docs (#15109) * refine autograd docs * CR comments * Fix examples * CR comments * Followup CR * CR --- docs/api/python/autograd/autograd.md | 75 +++++++++++++++++++++++++--- python/mxnet/autograd.py | 3 ++ python/mxnet/ndarray/ndarray.py | 2 + 3 files changed, 74 insertions(+), 6 deletions(-) diff --git a/docs/api/python/autograd/autograd.md b/docs/api/python/autograd/autograd.md index 1905831b16d9..82da4eac05b5 100644 --- a/docs/api/python/autograd/autograd.md +++ b/docs/api/python/autograd/autograd.md @@ -42,16 +42,28 @@ to allocate space for the gradient. Then, start a `with autograd.record()` block and do some computation. 
Finally, call `backward()` on the result: ```python ->>> x = mx.nd.array([1,2,3,4]) ->>> x.attach_grad() ->>> with mx.autograd.record(): -... y = x * x + 1 ->>> y.backward() ->>> print(x.grad) +import mxnet as mx +x = mx.nd.array([1,2,3,4]) +x.attach_grad() +with mx.autograd.record(): + y = x * x + 1 +y.backward() +print(x.grad) +``` + +Which outputs: + +``` [ 2. 4. 6. 8.] ``` +Gradient recording is enabled during the scope of the `with mx.autograd.record():` statement, then +disabled when we go out of that scope. + +It can be also set manually by executing `mx.autograd.set_recording(True)`, and turning it off after +we no longer want to record operations with `mx.autograd.set_recording(False)`. + ## Train mode and Predict Mode @@ -76,8 +88,59 @@ Detailed tutorials are available in Part 1 of [the MXNet gluon book](http://gluon.mxnet.io/). +# Higher order gradient + +Some operators support higher order gradients. Some operators support differentiating multiple +times, and others two, most just once. + +For calculating higher order gradients, we can use the `mx.autograd.grad` function while recording +and then call backward, or call `mx.autograd.grad` two times. If we do the latter, is important that +the first call uses `create_graph=True` and `retain_graph=True` and the second call uses +`create_graph=False` and `retain_graph=True`. Otherwise we will not get the results that we want. If +we would be to recreate the graph in the second call, we would end up with a graph of just the +backward nodes, not the full initial graph that includes the forward nodes. + +The pattern to calculate higher order gradients is the following: + +```python +from mxnet import ndarray as nd +from mxnet import autograd as ag +x = nd.array([1,2,3]) +x.attach_grad() +def f(x): + # Any function which supports higher oder gradient + return nd.log(x) +``` + +If the operators used in `f` don't support higher order gradients you will get an error like +`operator ... is non-differentiable because it didn't register FGradient attribute.`. This means +that it doesn't support getting the gradient of the gradient. Which is, running backward on +the backward graph. + +Using mxnet.autograd.grad multiple times: + +```python +with ag.record(): + y = f(x) + x_grad = ag.grad(heads=y, variables=x, create_graph=True, retain_graph=True)[0] + x_grad_grad = ag.grad(heads=x_grad, variables=x, create_graph=False, retain_graph=False)[0] +``` + +Running backward on the backward graph: + +```python +with ag.record(): + y = f(x) + x_grad = ag.grad(heads=y, variables=x, create_graph=True, retain_graph=True)[0] +x_grad.backward() +x_grad_grad = x.grad +``` +Both methods are equivalent, except that in the second case, retain_graph on running backward is set +to False by default. But both calls are running a backward pass as on the graph as usual to get the +gradient of the first gradient `x_grad` with respect to `x` evaluated at the value of `x`. +For more examples, check the [higher order gradient unit tests](https://github.com/apache/incubator-mxnet/blob/master/tests/python/unittest/test_higher_order_grad.py). diff --git a/python/mxnet/autograd.py b/python/mxnet/autograd.py index f461b77e2818..6f1cc4367821 100644 --- a/python/mxnet/autograd.py +++ b/python/mxnet/autograd.py @@ -197,6 +197,9 @@ def predict_mode(): def mark_variables(variables, gradients, grad_reqs='write'): """Mark NDArrays as variables to compute gradient for autograd. 
+ This is equivalent to the function .attach_grad() in a variable, but with this + call we can set the gradient to any value. + Parameters ---------- variables: NDArray or list of NDArray diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 53c563854511..3d8a7aa98c94 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -2243,6 +2243,8 @@ def attach_grad(self, grad_req='write', stype=None): """Attach a gradient buffer to this NDArray, so that `backward` can compute gradient with respect to it. + The gradient is initialized to zeros. + Parameters ---------- grad_req : {'write', 'add', 'null'} From 4f6f124f55ee20f988e73f6b650f364324fd0ba1 Mon Sep 17 00:00:00 2001 From: Zhennan Qin Date: Fri, 2 Aug 2019 04:37:58 +0800 Subject: [PATCH 14/24] Fix quantized concat when inputs are mixed int8 and uint8 (#15693) --- .../quantization/mkldnn/mkldnn_quantized_concat.cc | 12 +++++++++++- tests/python/mkl/test_subgraph.py | 11 +++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_concat.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_concat.cc index d9e884e82806..2a4c6d612e65 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantized_concat.cc +++ b/src/operator/quantization/mkldnn/mkldnn_quantized_concat.cc @@ -64,22 +64,32 @@ static void MKLDNNQuantizedConcatForward(const nnvm::NodeAttrs& attrs, const OpC std::vector data_mem; // new_data_mem is for auto-free new created mkldnn memory std::vector> new_data_mem; + const auto out_dtype = out_data[quantized_concat_enum::kOut].dtype(); for (int i = 0; i < param_.num_args; ++i) { auto i_scale = GetScale(in_data[i], data_min[i], data_max[i]); if (i_scale == out_scale) { + CHECK(in_data[i].dtype() == out_dtype); auto mem = in_data[i].GetMKLDNNData(); data_mem.push_back(mem); data_md.push_back(mem->get_primitive_desc()); } else { auto mem = in_data[i].GetMKLDNNData(); auto pd = mem->get_primitive_desc(); + if (in_data[i].dtype() != out_dtype) { + auto mem_desc = pd.desc(); + mkldnn::memory::desc new_md( + mkldnn::memory::dims(mem_desc.data.dims, mem_desc.data.dims + mem_desc.data.ndims), + get_mkldnn_type(out_dtype), static_cast(mem_desc.data.format)); + pd = mkldnn::memory::primitive_desc(new_md, CpuEngine::Get()->get_engine()); + } const auto rescaled_mem = std::make_shared(pd); new_data_mem.push_back(rescaled_mem); std::vector reorder_scale = {out_scale / i_scale}; primitive_attr reorder_attr; reorder_attr.set_int_output_round_mode(round_mode::round_nearest); reorder_attr.set_output_scales(0, reorder_scale); - const auto reorder_pd = mkldnn::reorder::primitive_desc(pd, pd, reorder_attr); + const auto reorder_pd = + mkldnn::reorder::primitive_desc(mem->get_primitive_desc(), pd, reorder_attr); MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *mem, *rescaled_mem)); data_mem.push_back(rescaled_mem.get()); data_md.push_back(pd); diff --git a/tests/python/mkl/test_subgraph.py b/tests/python/mkl/test_subgraph.py index b25fefc6cc0e..563fff1a6aa1 100644 --- a/tests/python/mkl/test_subgraph.py +++ b/tests/python/mkl/test_subgraph.py @@ -401,6 +401,15 @@ def single_concat(data_shape, input_num, dim): concat = mx.symbol.Concat(*inputs, name="concat", dim=dim) return concat +def single_concat_pos_neg(data_shape): + data, weight = head_symbol(data_shape) + conv = mx.symbol.Convolution(data=data, weight=weight, name='conv', num_filter=4, + kernel=(1, 1), stride=(1, 1), no_bias=True) + relu = 
mx.symbol.Activation(data=conv, name='relu', act_type='relu') + inputs = [data, relu] + concat = mx.symbol.Concat(*inputs, name="concat", dim=1) + return concat + # concat scale alignment case def concat_scale_align(data_shape): data, weight = head_symbol(data_shape) @@ -738,6 +747,8 @@ def test_pos_single_concat(): net = single_concat(data_shape, 4, 3) check_quantize(net, data_shape, out_type, name='conv', check_calibration=False) check_quantize(net, data_shape, out_type, name='conv', check_calibration=False, gluon_forward=True) + net = single_concat_pos_neg(data_shape) + check_quantize(net, data_shape, out_type, name='', check_calibration=False) @with_seed() def test_pos_concat_scale_align(): From ffcfce587a3085ae221b4458a900c26a48e92bfc Mon Sep 17 00:00:00 2001 From: Xinyu Chen Date: Fri, 2 Aug 2019 04:41:35 +0800 Subject: [PATCH 15/24] [MKLDNN]Enhance Quantization APIs and Tutorial (#15448) * enhance api and new tutorial * Update MKLDNN_QUANTIZATION.md update * fix lint * modify pics * skip test * add quantize layer in graph * update * remove center css flag * change requantize color * fix markdown pics * change to use png * Update MKLDNN_QUANTIZATION.md update * enable ipython script * fix png * fix lint * Update MKLDNN_QUANTIZATION.md * change title * trigger * use lower case * some typo * some typo * use dmlc web data * trigger * trigger --- docs/tutorials/index.md | 3 +- docs/tutorials/mkldnn/mkldnn_quantization.md | 259 ++++++++++++++++++ example/quantization/README.md | 95 +++++-- .../quantization/imagenet_gen_qsym_mkldnn.py | 168 ++++++------ python/mxnet/contrib/quantization.py | 237 ++++++++++++++++ tests/tutorials/test_tutorials.py | 3 + 6 files changed, 657 insertions(+), 108 deletions(-) create mode 100644 docs/tutorials/mkldnn/mkldnn_quantization.md diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index 6e31e825e2ca..e01a30dbe68c 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -105,6 +105,8 @@ Select API:  * [Module to Gluon API](/tutorials/python/module_to_gluon.html) * [Gluon end to end from training to inference](/tutorials/gluon/gluon_from_experiment_to_deployment.html) * [Automatic Mixed Precision in Gluon](/tutorials/amp/amp_tutorial.html) + * [How to build and install MXNet with MKL-DNN backend](/tutorials/mkldnn/MKLDNN_README.html) + * [How to quantize custom models with MKL-DNN backend](/tutorials/mkldnn/mkldnn_quantization.html) (new!) * API Guides * Core APIs * NDArray @@ -157,7 +159,6 @@ Select API:  * [Large-Scale Multi-Host Multi-GPU Image Classification](/tutorials/vision/large_scale_classification.html) * [Importing an ONNX model into MXNet](/tutorials/onnx/super_resolution.html) * [Optimizing Deep Learning Computation Graphs with TensorRT](/tutorials/tensorrt/inference_with_trt.html) - * [How to build and install MXNet with MKL-DNN backend](/tutorials/mkldnn/MKLDNN_README.html) * API Guides * Core APIs * NDArray diff --git a/docs/tutorials/mkldnn/mkldnn_quantization.md b/docs/tutorials/mkldnn/mkldnn_quantization.md new file mode 100644 index 000000000000..459bf2a17d40 --- /dev/null +++ b/docs/tutorials/mkldnn/mkldnn_quantization.md @@ -0,0 +1,259 @@ + + + + + + + + + + + + + + + + + + +# Quantize custom models with MKL-DNN backend + +This document is to introduce how to quantize the customer models from FP32 to INT8 with Apache/MXNet toolkit and APIs under Intel CPU. 
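For orientation, the sketch below condenses the whole flow into its simplest form, using `calib_mode='none'` (no calibration). It is illustrative only: the prefix `my_model` is a placeholder for your own exported symbol/params files, and every call is explained step by step in the sections that follow.

```python
import mxnet as mx
from mxnet.contrib.quantization import *

# Load an exported FP32 symbolic model (placeholder prefix).
sym, arg_params, aux_params = mx.model.load_checkpoint('my_model', 0)

# Fuse the FP32 graph for the MKL-DNN backend.
sym = sym.get_backend_symbol('MKLDNN_QUANTIZE')

# Quantize without calibration; requantization thresholds are then computed at runtime.
qsym, qarg_params, aux_params, _ = quantize_graph(
    sym=sym, arg_params=arg_params, aux_params=aux_params,
    excluded_sym_names=[], calib_mode='none', quantized_dtype='auto')

# Fuse the quantized graph and save the INT8 model.
qsym = qsym.get_backend_symbol('MKLDNN_QUANTIZE')
mx.model.save_checkpoint('my_model-quantized', 0, qsym, qarg_params, aux_params)
```

For deployment, calibrated quantization (covered later in this tutorial) is recommended, since it avoids computing thresholds at runtime.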
+ +If you are not familiar with Apache/MXNet quantization flow, please reference [quantization blog](https://medium.com/apache-mxnet/model-quantization-for-production-level-neural-network-inference-f54462ebba05) first, and the performance data is shown in [Apache/MXNet C++ interface](https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference) and [GluonCV](https://gluon-cv.mxnet.io/build/examples_deployment/int8_inference.html). + +## Installation and Prerequisites + +Installing MXNet with MKLDNN backend is an easy and essential process. You can follow [How to build and install MXNet with MKL-DNN backend](https://mxnet.incubator.apache.org/tutorials/mkldnn/MKLDNN_README.html) to build and install MXNet from source. Also, you can install the release or nightly version via PyPi and pip directly by running: + +``` +# release version +pip install mxnet-mkl +# nightly version +pip install mxnet-mkl --pre +``` + +## Image Classification Demo + +A quantization script [imagenet_gen_qsym_mkldnn.py](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/imagenet_gen_qsym_mkldnn.py) has been designed to launch quantization for image-classification models. This script is integrated with [Gluon-CV modelzoo](https://gluon-cv.mxnet.io/model_zoo/classification.html), so that all pre-trained models can be downloaded from Gluon-CV and then converted for quantization. For details, you can refer [Model Quantization with Calibration Examples](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/README.md). + +## Integrate Quantization Flow to Your Project + +Quantization flow works for both symbolic and Gluon models. If you're using Gluon, you can first refer [Saving and Loading Gluon Models](https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/save_load_params.html) to hybridize your computation graph and export it as a symbol before running quantization. + +In general, the quantization flow includes 4 steps. The user can get the acceptable accuracy from step 1 to 3 with minimum effort. Most of thing in this stage is out-of-box and the data scientists and researchers only need to focus on how to represent data and layers in their model. After a quantized model is generated, you may want to deploy it online and the performance will be the next key point. Thus, step 4, calibration, can improve the performance a lot by reducing lots of runtime calculation. + +![quantization flow](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/quantization.png) + +Now, we are going to take Gluon ResNet18 as an example to show how each step work. + +### Initialize Model + +```python +import logging +import mxnet as mx +from mxnet.gluon.model_zoo import vision +from mxnet.contrib.quantization import * + +logging.basicConfig() +logger = logging.getLogger('logger') +logger.setLevel(logging.INFO) + +batch_shape = (1, 3, 224, 224) +resnet18 = vision.resnet18_v1(pretrained=True) +resnet18.hybridize() +resnet18.forward(mx.nd.zeros(batch_shape)) +resnet18.export('resnet18_v1') +sym, arg_params, aux_params = mx.model.load_checkpoint('resnet18_v1', 0) +# (optional) visualize float32 model +mx.viz.plot_network(sym) +``` +First, we download resnet18-v1 model from gluon modelzoo and export it as a symbol. You can visualize float32 model. Below is a raw residual block. 
+ +![float32 model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/fp32_raw.png) + +#### Model Fusion + +```python +sym = sym.get_backend_symbol('MKLDNN_QUANTIZE') +# (optional) visualize fused float32 model +mx.viz.plot_network(sym) +``` +It's important to add this line to enable graph fusion before quantization to get better performance. Below is a fused residual block. Batchnorm, Activation and elemwise_add are fused into Convolution. + +![float32 fused model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/fp32_fusion.png) + +### Quantize Model + +A python interface `quantize_graph` is provided for the user. Thus, it is very flexible for the data scientist to construct the expected models based on different requirements in a real deployment. + +```python +# quantize configs +# set exclude layers +excluded_names = [] +# set calib mode. +calib_mode = 'none' +# set calib_layer +calib_layer = None +# set quantized_dtype +quantized_dtype = 'auto' +logger.info('Quantizing FP32 model Resnet18-V1') +qsym, qarg_params, aux_params, collector = quantize_graph(sym=sym, arg_params=arg_params, aux_params=aux_params, + excluded_sym_names=excluded_names, + calib_mode=calib_mode, calib_layer=calib_layer, + quantized_dtype=quantized_dtype, logger=logger) +# (optional) visualize quantized model +mx.viz.plot_network(qsym) +# save quantized model +mx.model.save_checkpoint('quantized-resnet18_v1', 0, qsym, qarg_params, aux_params) +``` + +By applying `quantize_graph` to the symbolic model, a new quantized model can be generated, named `qsym` along with its parameters. We can see `_contrib_requantize` operators are inserted after `Convolution` to convert the INT32 output to FP32. + +![none calibrated model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/none_calib.png) + +Below table gives some descriptions. + +| param | type | description| +|--------------------|-----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| excluded_sym_names | list of strings | A list of strings representing the names of the symbols that users want to excluding from being quantized.| +| calib_mode | str | If calib_mode='none', no calibration will be used and the thresholds for requantization after the corresponding layers will be calculated at runtime by calling min and max operators. The quantized models generated in this mode are normally 10-20% slower than those with calibrations during inference.
If calib_mode='naive', the min and max values of the layer outputs from a calibration dataset will be directly taken as the thresholds for quantization.<br>If calib_mode='entropy', the thresholds for quantization will be derived such that the KL divergence between the distributions of FP32 layer outputs and quantized layer outputs is minimized based upon the calibration dataset. |
+| calib_layer | function | Given a layer's output name in string, return True or False for deciding whether to calibrate this layer.<br>If yes, the statistics of the layer's output will be collected; otherwise, no information of the layer's output will be collected.<br>If not provided, all the layers' outputs that need requantization will be collected.|
+| quantized_dtype | str | The quantized destination type for input data. Currently support 'int8', 'uint8' and 'auto'.
'auto' means automatically select output type according to calibration result.| + +### Evaluate & Tune + +Now, you get a pair of quantized symbol and params file for inference. For Gluon inference, only difference is to load model and params by a SymbolBlock as below example: + +```python +quantized_net = mx.gluon.SymbolBlock.imports('quantized-resnet18_v1-symbol.json', 'data', 'quantized-resnet18_v1-0000.params') +quantized_net.hybridize(static_shape=True, static_alloc=True) +batch_size = 1 +data = mx.nd.ones((batch_size,3,224,224)) +quantized_net(data) +``` + +Now, you can get the accuracy from a quantized network. Furthermore, you can try to select different layers or OPs to be quantized by `excluded_sym_names` parameter and figure out an acceptable accuracy. + +### Calibrate Model (optional for performance) + +The quantized model generated in previous steps can be very slow during inference since it will calculate min and max at runtime. We recommend using offline calibration for better performance by setting `calib_mode` to `naive` or `entropy`. And then calling `set_monitor_callback` api to collect layer information with a subset of the validation datasets before int8 inference. + +```python +# quantization configs +# set exclude layers +excluded_names = [] +# set calib mode. +calib_mode = 'naive' +# set calib_layer +calib_layer = None +# set quantized_dtype +quantized_dtype = 'auto' +logger.info('Quantizing FP32 model resnet18-V1') +cqsym, cqarg_params, aux_params, collector = quantize_graph(sym=sym, arg_params=arg_params, aux_params=aux_params, + excluded_sym_names=excluded_names, + calib_mode=calib_mode, calib_layer=calib_layer, + quantized_dtype=quantized_dtype, logger=logger) + +# download imagenet validation dataset +mx.test_utils.download('http://data.mxnet.io/data/val_256_q90.rec', 'dataset.rec') +# set rgb info for data +mean_std = {'mean_r': 123.68, 'mean_g': 116.779, 'mean_b': 103.939, 'std_r': 58.393, 'std_g': 57.12, 'std_b': 57.375} +# set batch size +batch_size = 16 +# create DataIter +data = mx.io.ImageRecordIter(path_imgrec='dataset.rec', batch_size=batch_size, data_shape=batch_shape[1:], rand_crop=False, rand_mirror=False, **mean_std) +# create module +mod = mx.mod.Module(symbol=sym, label_names=None, context=mx.cpu()) +mod.bind(for_training=False, data_shapes=data.provide_data, label_shapes=None) +mod.set_params(arg_params, aux_params) + +# calibration configs +# set num_calib_batches +num_calib_batches = 5 +max_num_examples = num_calib_batches * batch_size +# monitor FP32 Inference +mod._exec_group.execs[0].set_monitor_callback(collector.collect, monitor_all=True) +num_batches = 0 +num_examples = 0 +for batch in data: + mod.forward(data_batch=batch, is_train=False) + num_batches += 1 + num_examples += batch_size + if num_examples >= max_num_examples: + break +if logger is not None: + logger.info("Collected statistics from %d batches with batch_size=%d" + % (num_batches, batch_size)) +``` + +After that, layer information will be filled into the `collector` returned by `quantize_graph` api. Then, you need to write the layer information into int8 model by calling `calib_graph` api. 
+ + +```python +# write scaling factor into quantized symbol +cqsym, cqarg_params, aux_params = calib_graph(qsym=cqsym, arg_params=arg_params, aux_params=aux_params, + collector=collector, calib_mode=calib_mode, + quantized_dtype=quantized_dtype, logger=logger) +# (optional) visualize quantized model +mx.viz.plot_network(cqsym) +``` + +Below is a quantized residual block with naive calibration. We can see `min_calib_range` and `max_calib_range` are written into `_contrib_requantize` operators. + +![naive calibrated model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/naive_calib.png) + +When you get a quantized model with calibration, keeping sure to call fusion api again since this can fuse some `requantize` or `dequantize` operators for further performance improvement. + +```python +# perform post-quantization fusion +cqsym = cqsym.get_backend_symbol('MKLDNN_QUANTIZE') +# (optional) visualize post-quantized model +mx.viz.plot_network(cqsym) +# save quantized model +mx.model.save_checkpoint('quantized-resnet18_v1', 0, cqsym, cqarg_params, aux_params) +``` + +Below is a post-quantized residual block. We can see `_contrib_requantize` operators are fused into `Convolution` operators. + +![post-quantized model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/post_quantize.png) + +BTW, You can also modify the `min_calib_range` and `max_calib_range` in the JSON file directly. + +``` + { + "op": "_sg_mkldnn_conv", + "name": "quantized_sg_mkldnn_conv_bn_act_6", + "attrs": { + "max_calib_range": "3.562147", + "min_calib_range": "0.000000", + "quantized": "true", + "with_act": "true", + "with_bn": "true" + }, +...... +``` + +### Tips for Model Calibration + +#### Accuracy Tuning + +- Try to use `entropy` calib mode; + +- Try to exclude some layers which may cause obvious accuracy drop; + +- Change calibration dataset by setting different `num_calib_batches` or shuffle your validation dataset; + +#### Performance Tuning + +- Keep sure to perform graph fusion before quantization; + +- If lots of `requantize` layers exist, keep sure to perform post-quantization fusion after calibration; + +- Compare the MXNet profile or `MKLDNN_VERBOSE` of float32 and int8 inference; + +## Deploy with Python/C++ + +MXNet also supports deploy quantized models with C++. Refer [MXNet C++ Package](https://github.com/apache/incubator-mxnet/blob/master/cpp-package/README.md) for more details. + + diff --git a/example/quantization/README.md b/example/quantization/README.md index 09321beb7997..1ae58fbb3a69 100644 --- a/example/quantization/README.md +++ b/example/quantization/README.md @@ -9,13 +9,76 @@ This folder contains examples of quantizing a FP32 model with Intel® MKL-DNN or
<h2 id="1">Model Quantization with Intel® MKL-DNN</h2>

-Intel® MKL-DNN supports quantization with subgraph features on Intel® CPU Platform and can bring performance improvements on the [Intel® Xeon® Scalable Platform](https://www.intel.com/content/www/us/en/processors/xeon/scalable/xeon-scalable-platform.html). A new quantization script `imagenet_gen_qsym_mkldnn.py` has been designed to launch quantization for CNN models with Intel® MKL-DNN. This script integrates with [Gluon-CV modelzoo](https://gluon-cv.mxnet.io/model_zoo/classification.html), so that more pre-trained models can be downloaded from Gluon-CV and then converted for quantization. This script also supports custom models. - -Calibration is used for generating a calibration table for the quantized symbol. The quantization script supports three methods: - -- **none:** No calibration will be used. The thresholds for quantization will be calculated on the fly. This will result in inference speed slowdown and loss of accuracy in general. -- **naive:** Simply take min and max values of layer outputs as thresholds for quantization. In general, the inference accuracy worsens with more examples used in calibration. It is recommended to use `entropy` mode as it produces more accurate inference results. -- **entropy:** Calculate KL divergence of the fp32 output and quantized output for optimal thresholds. This mode is expected to produce the best inference accuracy of all three kinds of quantized models if the calibration dataset is representative enough of the inference dataset. +Intel® MKL-DNN supports quantization with subgraph features on Intel® CPU Platform and can bring performance improvements on the [Intel® Xeon® Scalable Platform](https://www.intel.com/content/www/us/en/processors/xeon/scalable/xeon-scalable-platform.html). A new quantization script `imagenet_gen_qsym_mkldnn.py` has been designed to launch quantization for image-classification models with Intel® MKL-DNN. This script integrates with [Gluon-CV modelzoo](https://gluon-cv.mxnet.io/model_zoo/classification.html), so that more pre-trained models can be downloaded from Gluon-CV and then converted for quantization. To apply quantization flow to your project directly, please refer [Quantize custom models with MKL-DNN backend](https://mxnet.incubator.apache.org/tutorials/mkldnn/mkldnn_quantization.html). + +``` +usage: imagenet_gen_qsym_mkldnn.py [-h] [--model MODEL] [--epoch EPOCH] + [--no-pretrained] [--batch-size BATCH_SIZE] + [--label-name LABEL_NAME] + [--calib-dataset CALIB_DATASET] + [--image-shape IMAGE_SHAPE] + [--data-nthreads DATA_NTHREADS] + [--num-calib-batches NUM_CALIB_BATCHES] + [--exclude-first-conv] [--shuffle-dataset] + [--shuffle-chunk-seed SHUFFLE_CHUNK_SEED] + [--shuffle-seed SHUFFLE_SEED] + [--calib-mode CALIB_MODE] + [--quantized-dtype {auto,int8,uint8}] + [--enable-calib-quantize ENABLE_CALIB_QUANTIZE] + +Generate a calibrated quantized model from a FP32 model with Intel MKL-DNN +support + +optional arguments: + -h, --help show this help message and exit + --model MODEL model to be quantized. + --epoch EPOCH number of epochs, default is 0 + --no-pretrained If enabled, will not download pretrained model from + MXNet or Gluon-CV modelzoo. 
+ --batch-size BATCH_SIZE + --label-name LABEL_NAME + --calib-dataset CALIB_DATASET + path of the calibration dataset + --image-shape IMAGE_SHAPE + --data-nthreads DATA_NTHREADS + number of threads for data decoding + --num-calib-batches NUM_CALIB_BATCHES + number of batches for calibration + --exclude-first-conv excluding quantizing the first conv layer since the + input data may have negative value which doesn't + support at moment + --shuffle-dataset shuffle the calibration dataset + --shuffle-chunk-seed SHUFFLE_CHUNK_SEED + shuffling chunk seed, see https://mxnet.incubator.apac + he.org/api/python/io/io.html?highlight=imager#mxnet.io + .ImageRecordIter for more details + --shuffle-seed SHUFFLE_SEED + shuffling seed, see https://mxnet.incubator.apache.org + /api/python/io/io.html?highlight=imager#mxnet.io.Image + RecordIter for more details + --calib-mode CALIB_MODE + calibration mode used for generating calibration table + for the quantized symbol; supports 1. none: no + calibration will be used. The thresholds for + quantization will be calculated on the fly. This will + result in inference speed slowdown and loss of + accuracy in general. 2. naive: simply take min and max + values of layer outputs as thresholds for + quantization. In general, the inference accuracy + worsens with more examples used in calibration. It is + recommended to use `entropy` mode as it produces more + accurate inference results. 3. entropy: calculate KL + divergence of the fp32 output and quantized output for + optimal thresholds. This mode is expected to produce + the best inference accuracy of all three kinds of + quantized models if the calibration dataset is + representative enough of the inference dataset. + --quantized-dtype {auto,int8,uint8} + quantization destination data type for input data + --enable-calib-quantize ENABLE_CALIB_QUANTIZE + If enabled, the quantize op will be calibrated offline + if calibration mode is enabled +``` Use the following command to install [Gluon-CV](https://gluon-cv.mxnet.io/): @@ -23,12 +86,13 @@ Use the following command to install [Gluon-CV](https://gluon-cv.mxnet.io/): pip install gluoncv ``` -The following models have been tested on Linux systems. +Below are some quantization demos. These models have been tested on Linux systems. 
| Model | Source | Dataset | FP32 Accuracy (top-1/top-5)| INT8 Accuracy (top-1/top-5)| |:---|:---|---|:---:|:---:| | [ResNet18-V1](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) |70.15%/89.38%|69.92%/89.26%| | [ResNet50-V1](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) | 76.34%/93.13% | 75.91%/92.95% | +| [ResNet50-V1b](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) | 76.82%/93.38% | 76.39%/93.24% | | [ResNet101-V1](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) | 77.33%/93.59% | 77.05%/93.43% | |[Squeezenet 1.0](#4)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|56.98%/79.20%|52.98%/77.21%| |[MobileNet 1.0](#5)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|72.23%/90.64%|72.03%/90.42%| @@ -39,7 +103,7 @@ The following models have been tested on Linux systems. | [SSD-VGG16](#10) | [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd) | VOC2007/2012 | 0.8366 mAP | 0.8364 mAP | | [SSD-VGG16](#10) | [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd) | COCO2014 | 0.2552 mAP | 0.253 mAP | -
<h3 id='3'>ResNet18/50/101-V1</h3>
+<h3 id='3'>ResNetV1</h3>
The following command is to download the pre-trained model from Gluon-CV and transfer it into the symbolic model which would be finally quantized. The [validation dataset](http://data.mxnet.io/data/val_256_q90.rec) is available for testing the pre-trained models: @@ -47,7 +111,7 @@ The following command is to download the pre-trained model from Gluon-CV and tra python imagenet_gen_qsym_mkldnn.py --model=resnet50_v1 --num-calib-batches=5 --calib-mode=naive ``` -The model would be automatically replaced in fusion and quantization format. It is then saved as the quantized symbol and parameter files in the `./model` directory. The following command is to launch inference. +The model would be automatically replaced in fusion and quantization format. It is then saved as the quantized symbol and parameter files in the `./model` directory. Set `--model` to `resnet18_v1/resnet50_v1b/resnet101_v1` to quantize other models. The following command is to launch inference. ``` # Launch FP32 Inference @@ -204,17 +268,14 @@ SSD model is located in [example/ssd](https://github.com/apache/incubator-mxnet/ This script also supports custom symbolic models. You can easily add some quantization layer configs in `imagenet_gen_qsym_mkldnn.py` like below: ``` -elif args.model == 'custom': +else: + logger.info('Please set proper RGB configs for model %s' % args.model) # add rgb mean/std of your model. rgb_mean = '0,0,0' rgb_std = '0,0,0' - calib_layer = lambda name: name.endswith('_output') # add layer names you donnot want to quantize. - # add conv/pool layer names that has negative inputs - # since Intel® MKL-DNN only support uint8 quantization temporary. - # add all fc layer names since Intel® MKL-DNN does not support temporary. + logger.info('Please set proper excluded_sym_names for model %s' % args.model) excluded_sym_names += ['layers'] - # add your first conv layer names since Intel® MKL-DNN only support uint8 quantization temporary. if exclude_first_conv: excluded_sym_names += ['layers'] ``` @@ -230,7 +291,7 @@ Some tips on quantization configs: python imagenet_inference.py --symbol-file=./model/custom-symbol.json --param-file=./model/custom-0000.params --rgb-mean=* --rgb-std=* --num-skipped-batches=* --batch-size=* --num-inference-batches=*--dataset=./data/* --ctx=cpu ``` -3. Then, you should add `rgb_mean`, `rgb_std` and `excluded_sym_names` in this script. Notice that you should exclude conv/pool layers that have negative data since Intel® MKL-DNN only supports `uint8` quantization temporarily. You should also exclude all fc layers in your model. +3. Then, you should add `rgb_mean`, `rgb_std` and `excluded_sym_names` in this script. 4. 
Then, you can run the following command for quantization: diff --git a/example/quantization/imagenet_gen_qsym_mkldnn.py b/example/quantization/imagenet_gen_qsym_mkldnn.py index 482127ba355c..302a04449885 100644 --- a/example/quantization/imagenet_gen_qsym_mkldnn.py +++ b/example/quantization/imagenet_gen_qsym_mkldnn.py @@ -92,21 +92,12 @@ def save_params(fname, arg_params, aux_params, logger=None): if __name__ == '__main__': parser = argparse.ArgumentParser(description='Generate a calibrated quantized model from a FP32 model with Intel MKL-DNN support') - parser.add_argument('--model', type=str, choices=['resnet18_v1', - 'resnet50_v1', - 'resnet101_v1', - 'inceptionv3', - 'squeezenet1.0', - 'mobilenet1.0', - 'mobilenetv2_1.0', - 'imagenet1k-resnet-152', - 'imagenet1k-inception-bn', - 'custom'], - help='currently only supports imagenet1k-resnet-50_v1, imagenet1k-resnet-152 or imagenet1k-inception-bn.' - 'you can set to custom to load your pre-trained model.') - parser.add_argument('--use-gluon-model', type=bool, default=False, - help='If enabled, will download pretrained model from Gluon-CV ' - 'and convert to symbolic model ') + parser.add_argument('--model', type=str, default='resnet50_v1', + help='model to be quantized.') + parser.add_argument('--epoch', type=int, default=0, + help='number of epochs, default is 0') + parser.add_argument('--no-pretrained', action='store_true', default=False, + help='If enabled, will not download pretrained model from MXNet or Gluon-CV modelzoo.') parser.add_argument('--batch-size', type=int, default=32) parser.add_argument('--label-name', type=str, default='softmax_label') parser.add_argument('--calib-dataset', type=str, default='data/val_256_q90.rec', @@ -155,6 +146,7 @@ def save_params(fname, arg_params, aux_params, logger=None): logger = logging.getLogger('logger') logger.setLevel(logging.INFO) + logger.info(args) logger.info('shuffle_dataset=%s' % args.shuffle_dataset) calib_mode = args.calib_mode @@ -165,29 +157,24 @@ def save_params(fname, arg_params, aux_params, logger=None): download_calib_dataset('http://data.mxnet.io/data/val_256_q90.rec', args.calib_dataset) # download model - if args.model in ['resnet18_v1', - 'resnet50_v1', - 'resnet101_v1', - 'squeezenet1.0', - 'mobilenet1.0', - 'mobilenetv2_1.0', - 'inceptionv3']: - logger.info('model %s is converted from GluonCV' % args.model) - args.use_gluon_model = True - if args.use_gluon_model == True: - prefix = convert_from_gluon(model_name=args.model, image_shape=args.image_shape, classes=1000, logger=logger) - epoch = 0 - sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) - elif args.model == 'custom': + if not args.no_pretrained: + logger.info('Get pre-trained model from MXNet or Gluoncv modelzoo.') + logger.info('If you want to use custom model, please set --no-pretrained.') + if args.model in ['imagenet1k-resnet-152', 'imagenet1k-inception-bn']: + logger.info('model %s is downloaded from MXNet modelzoo' % args.model) + prefix, epoch = download_model(model_name=args.model, logger=logger) + else: + logger.info('model %s is converted from GluonCV' % args.model) + prefix = convert_from_gluon(model_name=args.model, image_shape=args.image_shape, classes=1000, logger=logger) + rgb_mean = '123.68,116.779,103.939' + rgb_std = '58.393, 57.12, 57.375' + epoch = 0 + else: dir_path = os.path.dirname(os.path.realpath(__file__)) prefix = os.path.join(dir_path, 'model', args.model) - epoch = 0 - sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) - else: - prefix, epoch = 
download_model(model_name=args.model, logger=logger) - sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + epoch = args.epoch - sym = sym.get_backend_symbol('MKLDNN_QUANTIZE') + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) # get batch size batch_size = args.batch_size @@ -212,57 +199,59 @@ def save_params(fname, arg_params, aux_params, logger=None): logger.info('quantized dtype is set to uint8, will exclude first conv.') exclude_first_conv = True excluded_sym_names = [] - if args.model == 'imagenet1k-resnet-152': - rgb_mean = '0,0,0' - rgb_std = '1,1,1' - excluded_sym_names += ['flatten0'] - if exclude_first_conv: - excluded_sym_names += ['conv0'] - elif args.model == 'imagenet1k-inception-bn': - rgb_mean = '123.68,116.779,103.939' - rgb_std = '1,1,1' - excluded_sym_names += ['flatten'] - if exclude_first_conv: - excluded_sym_names += ['conv_1'] - elif args.model in ['resnet18_v1', 'resnet50_v1', 'resnet101_v1']: - rgb_mean = '123.68,116.779,103.939' - rgb_std = '58.393, 57.12, 57.375' - if exclude_first_conv: - excluded_sym_names += ['resnetv10_conv0_fwd'] - elif args.model == 'squeezenet1.0': - rgb_mean = '123.68,116.779,103.939' - rgb_std = '58.393, 57.12, 57.375' - excluded_sym_names += ['squeezenet0_flatten0_flatten0'] - if exclude_first_conv: - excluded_sym_names += ['squeezenet0_conv0_fwd'] - elif args.model == 'mobilenet1.0': - rgb_mean = '123.68,116.779,103.939' - rgb_std = '58.393, 57.12, 57.375' - excluded_sym_names += ['mobilenet0_flatten0_flatten0', - 'mobilenet0_pool0_fwd'] - if exclude_first_conv: - excluded_sym_names += ['mobilenet0_conv0_fwd'] - elif args.model == 'mobilenetv2_1.0': - rgb_mean = '123.68,116.779,103.939' - rgb_std = '58.393, 57.12, 57.375' - excluded_sym_names += ['mobilenetv20_output_flatten0_flatten0'] - if exclude_first_conv: - excluded_sym_names += ['mobilenetv20_conv0_fwd'] - elif args.model == 'inceptionv3': - rgb_mean = '123.68,116.779,103.939' - rgb_std = '58.393, 57.12, 57.375' - if exclude_first_conv: - excluded_sym_names += ['inception30_conv0_fwd'] - elif args.model == 'custom': + if not args.no_pretrained: + if args.model == 'imagenet1k-resnet-152': + rgb_mean = '0,0,0' + rgb_std = '1,1,1' + excluded_sym_names += ['flatten0'] + if exclude_first_conv: + excluded_sym_names += ['conv0'] + elif args.model == 'imagenet1k-inception-bn': + rgb_mean = '123.68,116.779,103.939' + rgb_std = '1,1,1' + excluded_sym_names += ['flatten'] + if exclude_first_conv: + excluded_sym_names += ['conv_1'] + elif args.model.find('resnet') != -1 and args.model.find('v1') != -1: + if exclude_first_conv: + excluded_sym_names += ['resnetv10_conv0_fwd'] + elif args.model.find('resnet') != -1 and args.model.find('v2') != -1: + excluded_sym_names += ['resnetv20_flatten0_flatten0'] + if exclude_first_conv: + excluded_sym_names += ['resnetv20_conv0_fwd'] + elif args.model.find('vgg') != -1: + if exclude_first_conv: + excluded_sym_names += ['vgg0_conv0_fwd'] + elif args.model.find('squeezenet1') != -1: + excluded_sym_names += ['squeezenet0_flatten0_flatten0'] + if exclude_first_conv: + excluded_sym_names += ['squeezenet0_conv0_fwd'] + elif args.model.find('mobilenet') != -1 and args.model.find('v2') == -1: + excluded_sym_names += ['mobilenet0_flatten0_flatten0', + 'mobilenet0_pool0_fwd'] + if exclude_first_conv: + excluded_sym_names += ['mobilenet0_conv0_fwd'] + elif args.model.find('mobilenet') != -1 and args.model.find('v2') != -1: + excluded_sym_names += ['mobilenetv20_output_flatten0_flatten0'] + if exclude_first_conv: + 
excluded_sym_names += ['mobilenetv20_conv0_fwd'] + elif args.model == 'inceptionv3': + if exclude_first_conv: + excluded_sym_names += ['inception30_conv0_fwd'] + else: + raise ValueError('Currently, model %s is not supported in this script' % args.model) + else: + logger.info('Please set proper RGB configs for model %s' % args.model) # add rgb mean/std of your model. rgb_mean = '0,0,0' rgb_std = '0,0,0' # add layer names you donnot want to quantize. + logger.info('Please set proper excluded_sym_names for model %s' % args.model) excluded_sym_names += ['layers'] if exclude_first_conv: excluded_sym_names += ['layers'] - else: - raise ValueError('model %s is not supported in this script' % args.model) + + logger.info('These layers have been excluded %s' % excluded_sym_names) label_name = args.label_name logger.info('label_name = %s' % label_name) @@ -281,10 +270,10 @@ def save_params(fname, arg_params, aux_params, logger=None): combine_mean_std.update(std_args) if calib_mode == 'none': logger.info('Quantizing FP32 model %s' % args.model) - qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, - ctx=ctx, excluded_sym_names=excluded_sym_names, - calib_mode=calib_mode, quantized_dtype=args.quantized_dtype, - logger=logger) + qsym, qarg_params, aux_params = quantize_model_mkldnn(sym=sym, arg_params=arg_params, aux_params=aux_params, + ctx=ctx, excluded_sym_names=excluded_sym_names, + calib_mode=calib_mode, quantized_dtype=args.quantized_dtype, + logger=logger) sym_name = '%s-symbol.json' % (prefix + '-quantized') else: logger.info('Creating ImageRecordIter for reading calibration dataset') @@ -301,12 +290,12 @@ def save_params(fname, arg_params, aux_params, logger=None): seed=args.shuffle_seed, **combine_mean_std) - qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, - ctx=ctx, excluded_sym_names=excluded_sym_names, - calib_mode=calib_mode, calib_data=data, - num_calib_examples=num_calib_batches * batch_size, - calib_layer=calib_layer, quantized_dtype=args.quantized_dtype, - label_names=(label_name,), logger=logger) + qsym, qarg_params, aux_params = quantize_model_mkldnn(sym=sym, arg_params=arg_params, aux_params=aux_params, + ctx=ctx, excluded_sym_names=excluded_sym_names, + calib_mode=calib_mode, calib_data=data, + num_calib_examples=num_calib_batches * batch_size, + calib_layer=calib_layer, quantized_dtype=args.quantized_dtype, + label_names=(label_name,), logger=logger) if calib_mode == 'entropy': suffix = '-quantized-%dbatches-entropy' % num_calib_batches elif calib_mode == 'naive': @@ -315,7 +304,6 @@ def save_params(fname, arg_params, aux_params, logger=None): raise ValueError('unknow calibration mode %s received, only supports `none`, `naive`, and `entropy`' % calib_mode) sym_name = '%s-symbol.json' % (prefix + suffix) - qsym = qsym.get_backend_symbol('MKLDNN_QUANTIZE') save_symbol(sym_name, qsym, logger) param_name = '%s-%04d.params' % (prefix + '-quantized', epoch) save_params(param_name, qarg_params, aux_params, logger) diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py index b94b5a8da32a..fa2ab1842f5f 100644 --- a/python/mxnet/contrib/quantization.py +++ b/python/mxnet/contrib/quantization.py @@ -543,3 +543,240 @@ def quantize_model(sym, arg_params, aux_params, qarg_params = _quantize_params(qsym, arg_params, th_dict) return qsym, qarg_params, aux_params + +def quantize_model_mkldnn(sym, arg_params, aux_params, + data_names=('data',), 
label_names=('softmax_label',), + ctx=cpu(), excluded_sym_names=None, calib_mode='entropy', + calib_data=None, num_calib_examples=None, calib_layer=None, + quantized_dtype='int8', logger=logging): + """User-level API for generating a fusion + quantized model from a FP32 model + w/ or w/o calibration with Intel MKL-DNN. + The backend quantized operators are only enabled for Linux systems. Please do not run + inference using the quantized models on Windows for now. + + Parameters + ---------- + sym : str or Symbol + Defines the structure of a neural network for FP32 data types. + arg_params : dict + Dictionary of name to `NDArray`. + aux_params : dict + Dictionary of name to `NDArray`. + data_names : a list of strs + Data names required for creating a Module object to run forward propagation on the + calibration dataset. + label_names : a list of strs + Label names required for creating a Module object to run forward propagation on the + calibration dataset. + ctx : Context + Defines the device that users want to run forward propagation on the calibration + dataset for collecting layer output statistics. Currently, only supports single context. + excluded_sym_names : list of strings + A list of strings representing the names of the symbols that users want to excluding + from being quantized. + calib_mode : str + If calib_mode='none', no calibration will be used and the thresholds for + requantization after the corresponding layers will be calculated at runtime by + calling min and max operators. The quantized models generated in this + mode are normally 10-20% slower than those with calibrations during inference. + If calib_mode='naive', the min and max values of the layer outputs from a calibration + dataset will be directly taken as the thresholds for quantization. + If calib_mode='entropy' (default mode), the thresholds for quantization will be + derived such that the KL divergence between the distributions of FP32 layer outputs and + quantized layer outputs is minimized based upon the calibration dataset. + calib_data : DataIter + A data iterator initialized by the calibration dataset. + num_calib_examples : int or None + The maximum number of examples that user would like to use for calibration. If not provided, + the whole calibration dataset will be used. + calib_layer : function + Given a layer's output name in string, return True or False for deciding whether to + calibrate this layer. If yes, the statistics of the layer's output will be collected; + otherwise, no information of the layer's output will be collected. If not provided, + all the layers' outputs that need requantization will be collected. + quantized_dtype : str + The quantized destination type for input data. Currently support 'int8' + , 'uint8' and 'auto'. 'auto' means automatically select output type according to calibration result. + Default value is 'int8'. + logger : Object + A logging object for printing information during the process of quantization. + + Returns + ------- + tuple + A tuple of quantized symbol, quantized arg_params, and aux_params. 
+ ------- + """ + if ctx != cpu(): + raise ValueError( + 'quantize_model_mkldnn only support Intel cpu platform with MKL-DNN Backend') + + sym = sym.get_backend_symbol('MKLDNN_QUANTIZE') + + qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, + data_names=data_names, label_names=label_names, + ctx=ctx, excluded_sym_names=excluded_sym_names, + calib_mode=calib_mode, calib_data=calib_data, + num_calib_examples=num_calib_examples, calib_layer=calib_layer, + quantized_dtype=quantized_dtype, logger=logger) + + qsym = qsym.get_backend_symbol('MKLDNN_QUANTIZE') + + return qsym, qarg_params, aux_params + +def quantize_graph(sym, arg_params, aux_params, + excluded_sym_names=None, calib_mode='entropy', + calib_layer=None, quantized_dtype='int8', logger=logging): + """User-level API for generating a quantized model from a FP32 model w/o calibration + and a collector for naive or entropy calibration. + The backend quantized operators are only enabled for Linux systems. Please do not run + inference using the quantized models on Windows for now. + The quantization implementation adopts the TensorFlow's approach: + https://www.tensorflow.org/performance/quantization. + The calibration implementation borrows the idea of Nvidia's 8-bit Inference with TensorRT: + http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf + and adapts the method to MXNet. + Parameters + ---------- + sym : str or Symbol + Defines the structure of a neural network for FP32 data types. + arg_params : dict + Dictionary of name to `NDArray`. + aux_params : dict + Dictionary of name to `NDArray`. + excluded_sym_names : list of strings + A list of strings representing the names of the symbols that users want to excluding + from being quantized. + calib_mode : str + If calib_mode='none', no calibration will be used and the thresholds for + requantization after the corresponding layers will be calculated at runtime by + calling min and max operators. The quantized models generated in this + mode are normally 10-20% slower than those with calibrations during inference. + If calib_mode='naive', the min and max values of the layer outputs from a calibration + dataset will be directly taken as the thresholds for quantization. + If calib_mode='entropy' (default mode), the thresholds for quantization will be + derived such that the KL divergence between the distributions of FP32 layer outputs and + quantized layer outputs is minimized based upon the calibration dataset. + calib_layer : function + Given a layer's output name in string, return True or False for deciding whether to + calibrate this layer. If yes, the statistics of the layer's output will be collected; + otherwise, no information of the layer's output will be collected. If not provided, + all the layers' outputs that need requantization will be collected. + quantized_dtype : str + The quantized destination type for input data. Currently support 'int8' + , 'uint8' and 'auto'. 'auto' means automatically select output type according to calibration result. + Default value is 'int8'. + logger : Object + A logging object for printing information during the process of quantization. + Returns + ------- + tuple + A tuple of quantized symbol, quantized arg_params, aux_params and collector. 
+ ------- + """ + if excluded_sym_names is None: + excluded_sym_names = [] + if not isinstance(excluded_sym_names, list): + raise ValueError('excluded_sym_names must be a list of strings representing' + ' the names of the symbols that will not be quantized,' + ' while received type %s' % str(type(excluded_sym_names))) + + logger.info('Quantizing graph') + if quantized_dtype not in ('int8', 'uint8', 'auto'): + raise ValueError('unknown quantized_dtype %s received,' + ' expected `int8`, `uint8` or `auto`' % quantized_dtype) + qsym = _quantize_symbol(sym, excluded_symbols=excluded_sym_names, + offline_params=list(arg_params.keys()), + quantized_dtype=quantized_dtype) + + th_dict = {} + collector = None + if calib_mode is not None and calib_mode != 'none': + if calib_mode == 'entropy': + collector = _LayerOutputCollector( + include_layer=calib_layer, logger=logger) + logger.info( + 'Create a layer output collector for entropy calibration.') + elif calib_mode == 'naive': + collector = _LayerOutputMinMaxCollector( + include_layer=calib_layer, logger=logger) + logger.info( + 'Create a layer output minmax collector for naive calibration') + else: + raise ValueError('unknown calibration mode %s received,' + ' expected `none`, `naive`, or `entropy`' % calib_mode) + logger.info('Collector created, please use set_monitor_callback' + ' to collect calibration information.') + + logger.info('Quantizing parameters') + qarg_params = _quantize_params(qsym, arg_params, th_dict) + + return qsym, qarg_params, aux_params, collector + +def calib_graph(qsym, arg_params, aux_params, collector, + calib_mode='entropy', quantized_dtype='int8', logger=logging): + """User-level API for calibrating a quantized model using a filled collector. + The backend quantized operators are only enabled for Linux systems. Please do not run + inference using the quantized models on Windows for now. + The quantization implementation adopts the TensorFlow's approach: + https://www.tensorflow.org/performance/quantization. + The calibration implementation borrows the idea of Nvidia's 8-bit Inference with TensorRT: + http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf + and adapts the method to MXNet. + Parameters + ---------- + qsym : str or Symbol + Defines the structure of a neural network for INT8 data types. + arg_params : dict + Dictionary of name to `NDArray`. + aux_params : dict + Dictionary of name to `NDArray`. + collector : function + layer collector for naive or entropy calibration. + calib_mode : str + If calib_mode='none', no calibration will be used and the thresholds for + requantization after the corresponding layers will be calculated at runtime by + calling min and max operators. The quantized models generated in this + mode are normally 10-20% slower than those with calibrations during inference. + If calib_mode='naive', the min and max values of the layer outputs from a calibration + dataset will be directly taken as the thresholds for quantization. + If calib_mode='entropy' (default mode), the thresholds for quantization will be + derived such that the KL divergence between the distributions of FP32 layer outputs and + quantized layer outputs is minimized based upon the calibration dataset. + calib_layer : function + Given a layer's output name in string, return True or False for deciding whether to + calibrate this layer. If yes, the statistics of the layer's output will be collected; + otherwise, no information of the layer's output will be collected. 
If not provided, + all the layers' outputs that need requantization will be collected. + quantized_dtype : str + The quantized destination type for input data. Currently support 'int8' + , 'uint8' and 'auto'. 'auto' means automatically select output type according to calibration result. + Default value is 'int8'. + logger : Object + A logging object for printing information during the process of quantization. + Returns + ------- + tuple + A tuple of calibrated symbol, quantized arg_params, aux_params. + ------- + """ + th_dict = {} + if calib_mode is not None and calib_mode != 'none': + if calib_mode == 'entropy': + logger.info('Calculating optimal thresholds for quantization') + th_dict = _get_optimal_thresholds( + collector.nd_dict, quantized_dtype, logger=logger) + elif calib_mode == 'naive': + th_dict = collector.min_max_dict + else: + raise ValueError('unknown calibration mode %s received,' + ' expected `none`, `naive`, or `entropy`' % calib_mode) + logger.info('Calibrating quantized symbol') + qsym = _calibrate_quantized_sym(qsym, th_dict) + else: + raise ValueError('please set calibration mode to naive or entropy.') + + logger.info('Quantizing parameters') + qarg_params = _quantize_params(qsym, arg_params, th_dict) + + return qsym, qarg_params, aux_params diff --git a/tests/tutorials/test_tutorials.py b/tests/tutorials/test_tutorials.py index 0c4954acbd8b..5fe6a03eae7b 100644 --- a/tests/tutorials/test_tutorials.py +++ b/tests/tutorials/test_tutorials.py @@ -213,3 +213,6 @@ def test_control_flow(): def test_amp(): assert _test_tutorial_nb('amp/amp_tutorial') + +def test_mkldnn_quantization(): + assert _test_tutorial_nb('mkldnn/mkldnn_quantization') \ No newline at end of file From 0b1c8f6d31113c6ce1b1a1c35dc03925da77a890 Mon Sep 17 00:00:00 2001 From: dtracz <41399548+dtracz@users.noreply.github.com> Date: Thu, 1 Aug 2019 14:41:13 -0700 Subject: [PATCH 16/24] make TransposeShape infer shape form both sides (#15713) * make TransposeShape infer shape form both sides * small fixes * remove redundant lines * unit tests --- src/operator/tensor/matrix_op-inl.h | 19 +++++++++++++++++-- tests/python/unittest/test_operator.py | 20 ++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index 5cd7bf6652d3..cd98cb020c6b 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -344,19 +344,34 @@ inline bool TransposeShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); mxnet::TShape& shp = (*in_attrs)[0]; + mxnet::TShape& out_shp = (*out_attrs)[0]; CHECK_LE(shp.ndim(), 6) << "Transpose support at most 6 dimensions"; - mxnet::TShape ret(shp.ndim(), -1); + CHECK_NE(shp.ndim(), 0) << "Number of dimensions cannot be 0"; + CHECK_NE(out_shp.ndim(), 0) << "Number of dimensions cannot be 0"; + if (shp.ndim() == -1 && out_shp.ndim() == -1) + return false; // none of the shapes is known + if (out_shp.ndim() > 0 && shp.ndim() > 0) + CHECK_EQ(out_shp.ndim(), shp.ndim()); + mxnet::TShape get(std::max(shp.ndim(), out_shp.ndim()), -1); + mxnet::TShape ret(std::max(shp.ndim(), out_shp.ndim()), -1); if (param.axes.ndim() == 0) { for (int i = 0; i < shp.ndim(); ++i) { ret[i] = shp[shp.ndim()-1-i]; } + for (int i = 0; i < out_shp.ndim(); ++i) { + get[shp.ndim()-1-i] = out_shp[i]; + } } else { - CHECK_EQ(shp.ndim(), param.axes.ndim()); + CHECK_EQ(std::max(shp.ndim(), out_shp.ndim()), param.axes.ndim()); for (int i = 0; i < 
shp.ndim(); ++i) { CHECK(param.axes[i] < static_cast(shp.ndim())); ret[i] = shp[param.axes[i]]; } + for (int i = 0; i < out_shp.ndim(); ++i) { + get[param.axes[i]] = out_shp[i]; + } } + SHAPE_ASSIGN_CHECK(*in_attrs, 0, get); SHAPE_ASSIGN_CHECK(*out_attrs, 0, ret); return shape_is_known(ret); } diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index adc52a1dd50f..5d7e51af7467 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -8997,6 +8997,26 @@ def test_get_operator_arguments(): ok_(operator_arguments.narg == 2) +def test_transpose_infer_shape_back(): + o1 = mx.sym.ones(shape=[2,3]) + o2 = mx.sym.ones(shape=[-1,-1]) + t = mx.sym.transpose(o2) + b = o1 + t + x = b.bind(mx.cpu(), args={}) + y = x.forward() + assert(y[0].shape == (2,3)) + + +def test_transpose_infer_shape_mixed(): + o1 = mx.sym.ones(shape=[2,-1]) + o2 = mx.sym.ones(shape=[3,-1]) + t = mx.sym.transpose(o2) + b = o1 + t + x = b.bind(mx.cpu(), args={}) + y = x.forward() + assert(y[0].shape == (2,3)) + + if __name__ == '__main__': import nose nose.runmodule() From a3d32e4e476699edcdfbdb1acba739c19069598b Mon Sep 17 00:00:00 2001 From: Lai Wei Date: Thu, 1 Aug 2019 15:09:04 -0700 Subject: [PATCH 17/24] [MXNET-1358] Fit api tutorial (#15353) * Added tutorial for FIT API * Added tests for Fit API tutorial * Updated index.md for the new tutorial to show up * Addressed PR feedback * Addressed PR feedback * Removed spurious comment for Py2 and Py3 compatibility * Address PR feedback * Addressed PR feedback * Fixed typo * Added example to showcase custom event handler * Fixed imports as estimator moved to contrib package * Added a side note to inform about estimator reference being updated by the handlers * Corrected typo * update tutorial * address comments * new line * fix import * fix cached graph * fix import * address comments * fix doc gen * add softmax * add to website index * fix doc string * Fix doc gen (#12) * fix warining * fix test * fix * fix * fix print * fix test (#13) * fix warning (#14) * fix href (#15) --- docs/api/python/gluon/contrib.md | 30 ++ docs/tutorials/gluon/fit_api_tutorial.md | 271 ++++++++++++++++++ docs/tutorials/index.md | 2 + python/mxnet/gluon/contrib/__init__.py | 2 + .../mxnet/gluon/contrib/estimator/__init__.py | 2 + .../gluon/contrib/estimator/estimator.py | 64 +++-- .../gluon/contrib/estimator/event_handler.py | 23 +- tests/python/unittest/test_gluon_estimator.py | 7 +- tests/tutorials/test_tutorials.py | 3 + 9 files changed, 367 insertions(+), 37 deletions(-) create mode 100644 docs/tutorials/gluon/fit_api_tutorial.md diff --git a/docs/api/python/gluon/contrib.md b/docs/api/python/gluon/contrib.md index a940f697de69..22cdebb53b85 100644 --- a/docs/api/python/gluon/contrib.md +++ b/docs/api/python/gluon/contrib.md @@ -114,6 +114,33 @@ In the rest of this document, we list routines provided by the `gluon.contrib` p WikiText103 ``` +### Estimator + +```eval_rst +.. currentmodule:: mxnet.gluon.contrib.estimator + +.. autosummary:: + :nosignatures: + + Estimator +``` + +#### EventHandler + +```eval_rst +.. currentmodule:: mxnet.gluon.contrib.estimator + +.. autosummary:: + :nosignatures: + + StoppingHandler + MetricHandler + ValidationHandler + LoggingHandler + CheckpointHandler + EarlyStoppingHandler +``` + ## API Reference @@ -144,6 +171,9 @@ In the rest of this document, we list routines provided by the `gluon.contrib` p :members: :imported-members: +.. 
automodule:: mxnet.gluon.contrib.estimator + :members: + :imported-members: ``` diff --git a/docs/tutorials/gluon/fit_api_tutorial.md b/docs/tutorials/gluon/fit_api_tutorial.md new file mode 100644 index 000000000000..bc50690ac1a2 --- /dev/null +++ b/docs/tutorials/gluon/fit_api_tutorial.md @@ -0,0 +1,271 @@ + + + + + + + + + + + + + + + + + + +# MXNet Gluon Fit API + +In this tutorial, you will learn how to use the [Gluon Fit API](https://cwiki.apache.org/confluence/display/MXNET/Gluon+Fit+API+-+Tech+Design) which is the easiest way to train deep learning models using the [Gluon API](http://mxnet.incubator.apache.org/versions/master/gluon/index.html) in Apache MXNet. + +With the Fit API, you can train a deep learning model with a minimal amount of code. Just specify the network, loss function and the data you want to train on. You don't need to worry about the boiler plate code to loop through the dataset in batches (often called as 'training loop'). Advanced users can train with bespoke training loops, and many of these use cases will be covered by the Fit API. + +To demonstrate the Fit API, you will train an image classification model using the [ResNet-18](https://arxiv.org/abs/1512.03385) neural network architecture. The model will be trained using the [Fashion-MNIST dataset](https://research.zalando.com/welcome/mission/research-projects/fashion-mnist/). + +## Prerequisites + +To complete this tutorial, you will need: + +- [MXNet](https://mxnet.incubator.apache.org/install/#overview) (The version of MXNet will be >= 1.5.0, you can use `pip install mxnet` to get 1.5.0 release pip package or build from source with master, refer to [MXNet installation](http://mxnet.incubator.apache.org/versions/master/install/index.html?platform=Linux&language=Python&processor=CPU) +- [Jupyter Notebook](https://jupyter.org/index.html) (For interactively running the provided .ipynb file) + + + + +```python +import mxnet as mx +from mxnet import gluon +from mxnet.gluon.model_zoo import vision +from mxnet.gluon.contrib.estimator import estimator +from mxnet.gluon.contrib.estimator.event_handler import TrainBegin, TrainEnd, EpochEnd, CheckpointHandler + +gpu_count = mx.context.num_gpus() +ctx = [mx.gpu(i) for i in range(gpu_count)] if gpu_count > 0 else mx.cpu() +``` + +## Dataset + +[Fashion-MNIST](https://research.zalando.com/welcome/mission/research-projects/fashion-mnist/) dataset consists of fashion items divided into ten categories: t-shirt/top, trouser, pullover, dress, coat, sandal, shirt, sneaker, bag and ankle boot. + +- It has 60,000 grayscale images of size 28 * 28 for training. +- It has 10,000 grayscale images of size 28 * 28 for testing/validation. + +We will use the ```gluon.data.vision``` package to directly import the Fashion-MNIST dataset and perform pre-processing on it. + + +```python +# Get the training data +fashion_mnist_train = gluon.data.vision.FashionMNIST(train=True) + +# Get the validation data +fashion_mnist_val = gluon.data.vision.FashionMNIST(train=False) +``` + + +```python +transforms = [gluon.data.vision.transforms.Resize(224), # We pick 224 as the model we use takes an input of size 224. + gluon.data.vision.transforms.ToTensor()] + +# Now we will stack all these together. 
+transforms = gluon.data.vision.transforms.Compose(transforms) +``` + + +```python +# Apply the transformations +fashion_mnist_train = fashion_mnist_train.transform_first(transforms) +fashion_mnist_val = fashion_mnist_val.transform_first(transforms) +``` + + +```python +batch_size = 256 # Batch size of the images +num_workers = 4 # The number of parallel workers for loading the data using Data Loaders. + +train_data_loader = gluon.data.DataLoader(fashion_mnist_train, batch_size=batch_size, + shuffle=True, num_workers=num_workers) +val_data_loader = gluon.data.DataLoader(fashion_mnist_val, batch_size=batch_size, + shuffle=False, num_workers=num_workers) +``` + +## Model and Optimizers + +Let's load the resnet-18 model architecture from [Gluon Model Zoo](http://mxnet.apache.org/api/python/gluon/model_zoo.html) and initialize its parameters. The Gluon Model Zoo contains a repository of pre-trained models as well the model architecture definitions. We are using the model architecture from the model zoo in order to train it from scratch. + + +```python +resnet_18_v1 = vision.resnet18_v1(pretrained=False, classes = 10) +resnet_18_v1.initialize(init = mx.init.Xavier(), ctx=ctx) +``` + +We will be using `SoftmaxCrossEntropyLoss` as the loss function since this is a multi-class classification problem. We will be using `sgd` (Stochastic Gradient Descent) as the optimizer. +You can experiment with a [different loss](http://mxnet.incubator.apache.org/versions/master/api/python/gluon/loss.html) or [optimizer](http://mxnet.incubator.apache.org/versions/master/api/python/optimization/optimization.html) as well. + + +```python +loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() +``` + +Let's define the trainer object for training the model. + + +```python +learning_rate = 0.04 # You can experiment with your own learning rate here +num_epochs = 2 # You can run training for more epochs +trainer = gluon.Trainer(resnet_18_v1.collect_params(), + 'sgd', {'learning_rate': learning_rate}) +``` + +## Train using Fit API + +As stated earlier, the Fit API greatly simplifies the boiler plate code and complexity for training using MXNet Gluon. + +In the basic usage example, with just 2 lines of code, we will set up our model for training. + +### Basic Usage + + +```python +train_acc = mx.metric.Accuracy() # Metric to monitor + +# Define the estimator, by passing to it the model, loss function, metrics, trainer object and context +est = estimator.Estimator(net=resnet_18_v1, + loss=loss_fn, + metrics=train_acc, + trainer=trainer, + context=ctx) + +# ignore warnings for nightly test on CI only +import warnings +with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # Magic line + est.fit(train_data=train_data_loader, + epochs=num_epochs) +``` + + Training begin: using optimizer SGD with current learning rate 0.0400 + Train for 2 epochs. + + [Epoch 0] finished in 25.110s: train_accuracy : 0.7877 train_softmaxcrossentropyloss0 : 0.5905 + + [Epoch 1] finished in 23.595s: train_accuracy : 0.8823 train_softmaxcrossentropyloss0 : 0.3197 + Train finished using total 48s at epoch 1. train_accuracy : 0.8823 train_softmaxcrossentropyloss0 : 0.3197 + + +### Advanced Usage + +The Fit API is also customizable with several `Event Handlers` which give a fine grained control over the steps in training and exposes callback methods that provide control over the stages involved in training. Available callback methods are: `train_begin`, `train_end`, `batch_begin`, `batch_end`, `epoch_begin` and `epoch_end`. 
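+
+As a quick illustration (this is only a sketch: it reuses `est` and `val_data_loader` from above, together with the built-in handlers described below), you can pass pre-configured handlers directly to `fit`, which overrides the default handlers of the same type:
+
+```python
+from mxnet.gluon.contrib.estimator.event_handler import ValidationHandler, LoggingHandler
+
+# Reuse the metric objects the estimator hands to its default handlers,
+# so every handler reports on the same set of metrics.
+est.prepare_loss_and_metrics()
+val_handler = ValidationHandler(val_data=val_data_loader,
+                                eval_fn=est.evaluate,
+                                val_metrics=est.val_metrics)
+log_handler = LoggingHandler(train_metrics=est.train_metrics,
+                             val_metrics=est.val_metrics)
+
+est.fit(train_data=train_data_loader,
+        epochs=num_epochs,
+        event_handlers=[val_handler, log_handler])
+```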
+
+You can use built-in event handlers such as `LoggingHandler`, `CheckpointHandler` or `EarlyStoppingHandler` to log and save the model at certain time-steps during training. You can also stop the training when the model's performance plateaus.
+There are also some utility handlers that will be added to your estimator by default. For example, `StoppingHandler` is used to control when the training ends, based on the number of epochs or the number of batches trained.
+`MetricHandler` is used to calculate training metrics at the end of each batch and epoch.
+`ValidationHandler` is used to validate your model on test data at each epoch's end and then calculate validation metrics.
+You can create these utility handlers with different configurations and pass them to the estimator. This will override the default handler configuration.
+You can create a custom handler by inheriting one or multiple
+[base event handlers](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/contrib/estimator/event_handler.py#L32)
+ including: `TrainBegin`, `TrainEnd`, `EpochBegin`, `EpochEnd`, `BatchBegin`, `BatchEnd`.
+
+
+### Custom Event Handler
+
+Here we will showcase an example custom event handler that inherits features from a few base handler classes.
+Our custom event handler is a simple one: record the loss values at the end of every epoch in our training phase.
+
+Note: For each of these methods, the `Estimator` object is passed along, so you can access training metrics.
+
+```python
+class LossRecordHandler(TrainBegin, TrainEnd, EpochEnd):
+    def __init__(self):
+        super(LossRecordHandler, self).__init__()
+        self.loss_history = {}
+
+    def train_begin(self, estimator, *args, **kwargs):
+        print("Training begin")
+
+    def train_end(self, estimator, *args, **kwargs):
+        # Print all the losses at the end of training
+        print("Training ended")
+        for loss_name in self.loss_history:
+            for i, loss_val in enumerate(self.loss_history[loss_name]):
+                print("Epoch: {}, Loss name: {}, Loss value: {}".format(i, loss_name, loss_val))
+
+    def epoch_end(self, estimator, *args, **kwargs):
+        for metric in estimator.train_metrics:
+            # look for train Loss in training metrics
+            # we wrapped loss value as a metric to record it
+            if isinstance(metric, mx.metric.Loss):
+                loss_name, loss_val = metric.get()
+                # append loss value for this epoch
+                self.loss_history.setdefault(loss_name, []).append(loss_val)
+```
+
+
+```python
+# Let's reset the model, trainer and accuracy objects from above
+
+resnet_18_v1.initialize(force_reinit=True, init=mx.init.Xavier(), ctx=ctx)
+trainer = gluon.Trainer(resnet_18_v1.collect_params(),
+                        'sgd', {'learning_rate': learning_rate})
+train_acc = mx.metric.Accuracy()
+```
+
+
+```python
+# Define the estimator, by passing to it the model, loss function, metrics, trainer object and context
+est = estimator.Estimator(net=resnet_18_v1,
+                          loss=loss_fn,
+                          metrics=train_acc,
+                          trainer=trainer,
+                          context=ctx)
+
+# Define the handlers; let's use the built-in CheckpointHandler
+checkpoint_handler = CheckpointHandler(model_dir='./',
+                                       model_prefix='my_model',
+                                       monitor=train_acc,  # Monitors a metric
+                                       save_best=True)  # Save the best model in terms of
+# Let's instantiate another handler which we defined above
+loss_record_handler = LossRecordHandler()
+# ignore warnings for nightly test on CI only
+import warnings
+with warnings.catch_warnings():
+    warnings.simplefilter("ignore")
+    # Magic line
+    est.fit(train_data=train_data_loader,
+            val_data=val_data_loader,
+            epochs=num_epochs,
+            event_handlers=[checkpoint_handler, loss_record_handler])  # Add the event handlers
+```
+
+    Training begin: using optimizer SGD with current learning rate 0.0400
+    Train for 2 epochs.
+
+    [Epoch 0] finished in 25.236s: train_accuracy : 0.7917 train_softmaxcrossentropyloss0 : 0.5741 val_accuracy : 0.6612 val_softmaxcrossentropyloss0 : 0.8627
+
+    [Epoch 1] finished in 24.892s: train_accuracy : 0.8826 train_softmaxcrossentropyloss0 : 0.3229 val_accuracy : 0.8474 val_softmaxcrossentropyloss0 : 0.4262
+
+    Train finished using total 50s at epoch 1. train_accuracy : 0.8826 train_softmaxcrossentropyloss0 : 0.3229 val_accuracy : 0.8474 val_softmaxcrossentropyloss0 : 0.4262
+
+    Training begin
+    Epoch 1, loss 0.5741
+    Epoch 2, loss 0.3229
+
+You can load the saved model by using the `load_parameters` API in Gluon. For more details, refer to the [Loading model parameters from file tutorial](save_load_params.html#saving-model-parameters-to-file).
+
+
+```python
+resnet_18_v1 = vision.resnet18_v1(pretrained=False, classes=10)
+resnet_18_v1.load_parameters('./my_model-best.params', ctx=ctx)
+```
+
+## Summary
+
+- To learn more about deep learning with MXNet, see [Dive Into Deep Learning](http://gluon.io)
+
+## Next Steps
+
+- For more hands-on learning about deep learning, check out [Dive into Deep Learning](https://d2l.ai)
+
diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md
index e01a30dbe68c..f773a79f63a7 100644
--- a/docs/tutorials/index.md
+++ b/docs/tutorials/index.md
@@ -139,6 +139,8 @@ Select API:
 * [Data Transforms](/tutorials/gluon/transforms.html)
 * [Applying Data Augmentation](/tutorials/gluon/data_augmentation.html)
 * [Data Augmentation with Masks (for Object Segmentation)](https://mxnet.incubator.apache.org/tutorials/python/data_augmentation_with_masks.html)
+    * Fit API
+    * [Using Fit API](/tutorials/gluon/fit_api_tutorial.html)
diff --git a/python/mxnet/gluon/contrib/__init__.py b/python/mxnet/gluon/contrib/__init__.py index 83be8a39ba32..7590eb740f67 100644 --- a/python/mxnet/gluon/contrib/__init__.py +++ b/python/mxnet/gluon/contrib/__init__.py @@ -25,3 +25,5 @@ from . import cnn from . import data + +from . import estimator diff --git a/python/mxnet/gluon/contrib/estimator/__init__.py b/python/mxnet/gluon/contrib/estimator/__init__.py index 58600dadffb4..bb0a0917c363 100644 --- a/python/mxnet/gluon/contrib/estimator/__init__.py +++ b/python/mxnet/gluon/contrib/estimator/__init__.py @@ -17,5 +17,7 @@ # pylint: disable=wildcard-import """Gluon Estimator Module""" +from . import estimator +from . import event_handler from .estimator import * from .event_handler import * diff --git a/python/mxnet/gluon/contrib/estimator/estimator.py b/python/mxnet/gluon/contrib/estimator/estimator.py index da1a3915caec..b6142e100d96 100644 --- a/python/mxnet/gluon/contrib/estimator/estimator.py +++ b/python/mxnet/gluon/contrib/estimator/estimator.py @@ -24,9 +24,15 @@ from .event_handler import MetricHandler, ValidationHandler, LoggingHandler, StoppingHandler from .event_handler import TrainBegin, EpochBegin, BatchBegin, BatchEnd, EpochEnd, TrainEnd -from .... import gluon, autograd +from ...data import DataLoader +from ...loss import SoftmaxCrossEntropyLoss +from ...loss import Loss as gluon_loss +from ...trainer import Trainer +from ...utils import split_and_load +from .... import autograd from ....context import Context, cpu, gpu, num_gpus -from ....metric import EvalMetric, Loss, Accuracy +from ....metric import EvalMetric, Accuracy +from ....metric import Loss as metric_loss __all__ = ['Estimator'] @@ -69,9 +75,9 @@ def __init__(self, net, self.trainer = self._check_trainer(trainer) def _check_loss(self, loss): - if isinstance(loss, gluon.loss.Loss): + if isinstance(loss, gluon_loss): loss = [loss] - elif isinstance(loss, list) and all([isinstance(l, gluon.loss.Loss) for l in loss]): + elif isinstance(loss, list) and all([isinstance(l, gluon_loss) for l in loss]): loss = loss else: raise ValueError("loss must be a Loss or a list of Loss, " @@ -146,9 +152,9 @@ def _check_trainer(self, trainer): if not trainer: warnings.warn("No trainer specified, default SGD optimizer " "with learning rate 0.001 is used.") - trainer = gluon.Trainer(self.net.collect_params(), - 'sgd', {'learning_rate': 0.001}) - elif not isinstance(trainer, gluon.Trainer): + trainer = Trainer(self.net.collect_params(), + 'sgd', {'learning_rate': 0.001}) + elif not isinstance(trainer, Trainer): raise ValueError("Trainer must be a Gluon Trainer instance, refer to " "gluon.Trainer:{}".format(trainer)) return trainer @@ -165,8 +171,8 @@ def _is_initialized(self): def _get_data_and_label(self, batch, ctx, batch_axis=0): data = batch[0] label = batch[1] - data = gluon.utils.split_and_load(data, ctx_list=ctx, batch_axis=batch_axis) - label = gluon.utils.split_and_load(label, ctx_list=ctx, batch_axis=batch_axis) + data = split_and_load(data, ctx_list=ctx, batch_axis=batch_axis) + label = split_and_load(label, ctx_list=ctx, batch_axis=batch_axis) return data, label def prepare_loss_and_metrics(self): @@ -179,13 +185,13 @@ def prepare_loss_and_metrics(self): """ if any(not hasattr(self, attribute) for attribute in ['train_metrics', 'val_metrics']): - # Use default mx.metric.Accuracy() for gluon.loss.SoftmaxCrossEntropyLoss() - if not self.train_metrics and any([isinstance(l, gluon.loss.SoftmaxCrossEntropyLoss) for l in self.loss]): + # Use default mx.metric.Accuracy() 
for SoftmaxCrossEntropyLoss() + if not self.train_metrics and any([isinstance(l, SoftmaxCrossEntropyLoss) for l in self.loss]): self.train_metrics = [Accuracy()] self.val_metrics = [] for loss in self.loss: # remove trailing numbers from loss name to avoid confusion - self.train_metrics.append(Loss(loss.name.rstrip('1234567890'))) + self.train_metrics.append(metric_loss(loss.name.rstrip('1234567890'))) for metric in self.train_metrics: val_metric = copy.deepcopy(metric) metric.name = "train " + metric.name @@ -208,10 +214,10 @@ def evaluate(self, batch_axis : int, default 0 Batch axis to split the validation data into devices. """ - if not isinstance(val_data, gluon.data.DataLoader): + if not isinstance(val_data, DataLoader): raise ValueError("Estimator only support input as Gluon DataLoader. Alternatively, you " "can transform your DataIter or any NDArray into Gluon DataLoader. " - "Refer to gluon.data.dataloader") + "Refer to gluon.data.DataLoader") for metric in val_metrics: metric.reset() @@ -222,7 +228,7 @@ def evaluate(self, loss = [self.loss[0](y_hat, y) for y_hat, y in zip(pred, label)] # update metrics for metric in val_metrics: - if isinstance(metric, Loss): + if isinstance(metric, metric_loss): metric.update(0, loss) else: metric.update(label, pred) @@ -254,7 +260,7 @@ def fit(self, train_data, batch_axis : int, default 0 Batch axis to split the training data into devices. """ - if not isinstance(train_data, gluon.data.DataLoader): + if not isinstance(train_data, DataLoader): raise ValueError("Estimator only support input as Gluon DataLoader. Alternatively, you " "can transform your DataIter or any NDArray into Gluon DataLoader. " "Refer to gluon.data.dataloader") @@ -328,28 +334,36 @@ def fit(self, train_data, def _prepare_default_handlers(self, val_data, event_handlers): event_handlers = event_handlers or [] default_handlers = [] - train_metrics, val_metrics = self.prepare_loss_and_metrics() + self.prepare_loss_and_metrics() # no need to add to default handler check as StoppingHandler does not use metrics event_handlers.append(StoppingHandler(self.max_epoch, self.max_batch)) + default_handlers.append("StoppingHandler") if not any(isinstance(handler, MetricHandler) for handler in event_handlers): - event_handlers.append(MetricHandler(train_metrics=train_metrics)) + event_handlers.append(MetricHandler(train_metrics=self.train_metrics)) default_handlers.append("MetricHandler") - if val_data and not any(isinstance(handler, ValidationHandler) for handler in event_handlers): - event_handlers.append(ValidationHandler(val_data=val_data, eval_fn=self.evaluate, - val_metrics=val_metrics)) - default_handlers.append("ValidationHandler") + if not any(isinstance(handler, ValidationHandler) for handler in event_handlers): + # no validation handler + if val_data: + # add default validation handler if validation data found + event_handlers.append(ValidationHandler(val_data=val_data, eval_fn=self.evaluate, + val_metrics=self.val_metrics)) + default_handlers.append("ValidationHandler") + val_metrics = self.val_metrics + else: + # set validation metrics to None if no validation data and no validation handler + val_metrics = [] if not any(isinstance(handler, LoggingHandler) for handler in event_handlers): - event_handlers.append(LoggingHandler(train_metrics=train_metrics, + event_handlers.append(LoggingHandler(train_metrics=self.train_metrics, val_metrics=val_metrics)) default_handlers.append("LoggingHandler") # if there is a mix of user defined event handlers and default event handlers # they 
should have the same set of loss and metrics - if default_handlers: + if default_handlers and len(event_handlers) != len(default_handlers): msg = "You are training with the following default event handlers: %s. " \ "They use loss and metrics from estimator.prepare_loss_and_metrics(). " \ "Please use the same set of metrics for all your other handlers." % \ @@ -368,7 +382,7 @@ def _prepare_default_handlers(self, val_data, event_handlers): # remove None metric references references = set([ref for ref in references if ref]) for metric in references: - if metric not in train_metrics + val_metrics: + if metric not in self.train_metrics + self.val_metrics: msg = "We have added following default handlers for you: %s and used " \ "estimator.prepare_loss_and_metrics() to pass metrics to " \ "those handlers. Please use the same set of metrics " \ diff --git a/python/mxnet/gluon/contrib/estimator/event_handler.py b/python/mxnet/gluon/contrib/estimator/event_handler.py index ed97c7bc3d19..da2c84455e35 100644 --- a/python/mxnet/gluon/contrib/estimator/event_handler.py +++ b/python/mxnet/gluon/contrib/estimator/event_handler.py @@ -26,7 +26,12 @@ import numpy as np -from ....metric import EvalMetric, Loss +from ....metric import EvalMetric +from ....metric import Loss as metric_loss + +__all__ = ['TrainBegin', 'TrainEnd', 'EpochBegin', 'EpochEnd', 'BatchBegin', 'BatchEnd', + 'StoppingHandler', 'MetricHandler', 'ValidationHandler', + 'LoggingHandler', 'CheckpointHandler', 'EarlyStoppingHandler'] class TrainBegin(object): @@ -127,7 +132,7 @@ def batch_end(self, estimator, *args, **kwargs): label = kwargs['label'] loss = kwargs['loss'] for metric in self.train_metrics: - if isinstance(metric, Loss): + if isinstance(metric, metric_loss): # metric wrapper for loss values metric.update(0, loss) else: @@ -135,7 +140,7 @@ def batch_end(self, estimator, *args, **kwargs): class ValidationHandler(TrainBegin, BatchEnd, EpochEnd): - """"Validation Handler that evaluate model on validation dataset + """Validation Handler that evaluate model on validation dataset :py:class:`ValidationHandler` takes validation dataset, an evaluation function, metrics to be evaluated, and how often to run the validation. You can provide custom @@ -430,7 +435,7 @@ def train_begin(self, estimator, *args, **kwargs): self.current_epoch = 0 self.current_batch = 0 if self.save_best: - self.best = np.Inf if self.monitor_op == np.less else -np.Inf # pylint: disable=comparison-with-callable + self.best = np.Inf if self.monitor_op == np.less else -np.Inf # pylint: disable=comparison-with-callable if self.resume_from_checkpoint: error_msg = "To use resume from checkpoint, you must only specify " \ "the same type of period you used for training." 
\ @@ -506,12 +511,12 @@ def _save_checkpoint(self, estimator): def _save_symbol(self, estimator): symbol_file = os.path.join(self.model_dir, self.model_prefix + '-symbol.json') - if hasattr(estimator.net, '_cached_graph'): + if hasattr(estimator.net, '_cached_graph') and estimator.net._cached_graph: sym = estimator.net._cached_graph[1] sym.save(symbol_file) else: - self.logger.info("Model architecture(symbol file) is not saved, please use HybridBlock" - "to construct your model, can call net.hybridize() before passing to" + self.logger.info("Model architecture(symbol file) is not saved, please use HybridBlock " + "to construct your model, can call net.hybridize() before passing to " "Estimator in order to save model architecture as %s.", symbol_file) def _save_params_and_trainer(self, estimator, file_prefix): @@ -666,7 +671,7 @@ def __init__(self, "if you want otherwise", self.monitor.get()[0]) self.monitor_op = np.less - if self.monitor_op == np.greater: # pylint: disable=comparison-with-callable + if self.monitor_op == np.greater: # pylint: disable=comparison-with-callable self.min_delta *= 1 else: self.min_delta *= -1 @@ -679,7 +684,7 @@ def train_begin(self, estimator, *args, **kwargs): if self.baseline is not None: self.best = self.baseline else: - self.best = np.Inf if self.monitor_op == np.less else -np.Inf # pylint: disable=comparison-with-callable + self.best = np.Inf if self.monitor_op == np.less else -np.Inf # pylint: disable=comparison-with-callable def epoch_end(self, estimator, *args, **kwargs): monitor_name, monitor_value = self.monitor.get() diff --git a/tests/python/unittest/test_gluon_estimator.py b/tests/python/unittest/test_gluon_estimator.py index d2e8c082aa08..ae47d925670f 100644 --- a/tests/python/unittest/test_gluon_estimator.py +++ b/tests/python/unittest/test_gluon_estimator.py @@ -19,11 +19,13 @@ import sys import unittest +import warnings import mxnet as mx from mxnet import gluon from mxnet.gluon import nn from mxnet.gluon.contrib.estimator import * +from mxnet.gluon.contrib.estimator.event_handler import * from nose.tools import assert_raises @@ -335,10 +337,9 @@ def test_default_handlers(): metrics=train_acc, trainer=trainer, context=ctx) - # no handler + # no handler(all default handlers), no warning with warnings.catch_warnings(record=True) as w: est.fit(train_data=train_data, epochs=num_epochs) - assert 'You are training with the' in str(w[-1].message) # handler with prepared loss and metrics # use mix of default and user defined handlers @@ -353,7 +354,7 @@ def test_default_handlers(): # handler with all user defined metrics # use mix of default and user defined handlers metric = MetricHandler(train_metrics=[train_acc]) - logging = LoggingHandler(train_metrics=[train_acc], val_metrics=[mx.metric.RMSE("val acc")]) + logging = LoggingHandler(train_metrics=[train_acc]) est.fit(train_data=train_data, epochs=num_epochs, event_handlers=[metric, logging]) # handler with mixed metrics, some handler use metrics prepared by estimator diff --git a/tests/tutorials/test_tutorials.py b/tests/tutorials/test_tutorials.py index 5fe6a03eae7b..c2173a7dc071 100644 --- a/tests/tutorials/test_tutorials.py +++ b/tests/tutorials/test_tutorials.py @@ -133,6 +133,9 @@ def test_gluon_learning_rate_schedules_advanced(): def test_gluon_info_gan(): assert _test_tutorial_nb('gluon/info_gan') +def test_gluon_fit_api_fashion_mnist(): + assert _test_tutorial_nb('gluon/fit_api_tutorial') + def test_nlp_cnn(): assert _test_tutorial_nb('nlp/cnn') From e15605637ed9c60b00ce257b77059d4dcefa7ce5 
Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Thu, 18 Jul 2019 20:06:12 -0700 Subject: [PATCH 18/24] remove mshadow submodule --- .gitmodules | 3 --- 3rdparty/mshadow | 1 - 2 files changed, 4 deletions(-) delete mode 160000 3rdparty/mshadow diff --git a/.gitmodules b/.gitmodules index e0ffec11bfd0..90ef157f0eec 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ -[submodule "3rdparty/mshadow"] - path = 3rdparty/mshadow - url = https://github.com/dmlc/mshadow.git [submodule "3rdparty/dmlc-core"] path = 3rdparty/dmlc-core url = https://github.com/dmlc/dmlc-core.git diff --git a/3rdparty/mshadow b/3rdparty/mshadow deleted file mode 160000 index 1d79ecfdb4c9..000000000000 --- a/3rdparty/mshadow +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 1d79ecfdb4c9234537e1bf5148f44a1af54501ec From 3f60274f30ae2130af33b7cea17010fc877babd4 Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Thu, 18 Jul 2019 20:11:57 -0700 Subject: [PATCH 19/24] import mshadow source tree --- 3rdparty/mshadow/.gitignore | 21 + 3rdparty/mshadow/.travis.yml | 43 + 3rdparty/mshadow/CHANGES.md | 12 + 3rdparty/mshadow/CMakeLists.txt | 6 + 3rdparty/mshadow/LICENSE | 13 + 3rdparty/mshadow/README.md | 37 + 3rdparty/mshadow/cmake/Cuda.cmake | 324 +++ 3rdparty/mshadow/cmake/Utils.cmake | 398 +++ 3rdparty/mshadow/cmake/mshadow.cmake | 91 + 3rdparty/mshadow/cmake/mshadowUtils.cmake | 2 + 3rdparty/mshadow/doc/Doxyfile | 2358 +++++++++++++++++ 3rdparty/mshadow/doc/README.md | 321 +++ 3rdparty/mshadow/doc/mkdoc.sh | 4 + 3rdparty/mshadow/guide/.gitignore | 3 + 3rdparty/mshadow/guide/Makefile | 37 + 3rdparty/mshadow/guide/README.md | 226 ++ 3rdparty/mshadow/guide/basic.cpp | 161 ++ 3rdparty/mshadow/guide/basic_stream.cu | 35 + 3rdparty/mshadow/guide/config.mk | 39 + 3rdparty/mshadow/guide/defop.cpp | 49 + .../mshadow/guide/exp-template/.gitignore | 1 + 3rdparty/mshadow/guide/exp-template/Makefile | 20 + 3rdparty/mshadow/guide/exp-template/README.md | 340 +++ .../mshadow/guide/exp-template/exp_lazy.cpp | 45 + .../guide/exp-template/exp_template.cpp | 72 + .../guide/exp-template/exp_template_op.cpp | 92 + 3rdparty/mshadow/guide/mshadow-ps/.gitignore | 4 + .../mshadow/guide/mshadow-ps/2-levels.png | Bin 0 -> 59413 bytes 3rdparty/mshadow/guide/mshadow-ps/Makefile | 45 + 3rdparty/mshadow/guide/mshadow-ps/README.md | 227 ++ 3rdparty/mshadow/guide/mshadow-ps/config.mk | 40 + 3rdparty/mshadow/guide/mshadow-ps/dbstr.h | 35 + .../guide/mshadow-ps/dist_async_sum-inl.h | 124 + .../guide/mshadow-ps/dist_async_sum.cpp | 11 + 3rdparty/mshadow/guide/mshadow-ps/local.sh | 39 + .../mshadow/guide/mshadow-ps/local_sum-inl.h | 119 + .../mshadow/guide/mshadow-ps/local_sum.cpp | 4 + .../mshadow/guide/mshadow-ps/local_sum.cu | 4 + 3rdparty/mshadow/guide/neuralnet/Makefile | 38 + 3rdparty/mshadow/guide/neuralnet/README.md | 16 + 3rdparty/mshadow/guide/neuralnet/config.mk | 35 + 3rdparty/mshadow/guide/neuralnet/convnet.cu | 282 ++ 3rdparty/mshadow/guide/neuralnet/nnet.cu | 202 ++ 3rdparty/mshadow/guide/neuralnet/nnet_ps.cu | 312 +++ 3rdparty/mshadow/guide/neuralnet/util.h | 86 + 3rdparty/mshadow/make/README.md | 18 + 3rdparty/mshadow/make/mshadow.mk | 166 ++ 3rdparty/mshadow/mshadow-ps/.gitignore | 3 + 3rdparty/mshadow/mshadow-ps/README.md | 4 + 3rdparty/mshadow/mshadow-ps/mshadow_ps.h | 358 +++ 3rdparty/mshadow/mshadow-ps/ps_dist-inl.h | 126 + 3rdparty/mshadow/mshadow-ps/ps_local-inl.h | 814 ++++++ 3rdparty/mshadow/mshadow-ps/ps_rabit-inl.h | 113 + 3rdparty/mshadow/mshadow-ps/thread.h | 261 ++ 3rdparty/mshadow/mshadow-ps/thread_util.h | 169 ++ 
3rdparty/mshadow/mshadow/README.md | 8 + 3rdparty/mshadow/mshadow/base.h | 1110 ++++++++ 3rdparty/mshadow/mshadow/cuda/reduce.cuh | 120 + .../mshadow/mshadow/cuda/tensor_gpu-inl.cuh | 828 ++++++ 3rdparty/mshadow/mshadow/dot_engine-inl.h | 936 +++++++ 3rdparty/mshadow/mshadow/expr_engine-inl.h | 482 ++++ 3rdparty/mshadow/mshadow/expr_scalar-inl.h | 165 ++ 3rdparty/mshadow/mshadow/expression.h | 416 +++ 3rdparty/mshadow/mshadow/extension.h | 41 + .../mshadow/mshadow/extension/broadcast.h | 165 ++ .../mshadow/extension/broadcast_with_axis.h | 258 ++ .../mshadow/mshadow/extension/channel_pool.h | 108 + .../mshadow/extension/channel_unpool.h | 137 + 3rdparty/mshadow/mshadow/extension/choose.h | 90 + 3rdparty/mshadow/mshadow/extension/complex.h | 525 ++++ 3rdparty/mshadow/mshadow/extension/concat.h | 194 ++ 3rdparty/mshadow/mshadow/extension/crop.h | 119 + 3rdparty/mshadow/mshadow/extension/fill.h | 103 + 3rdparty/mshadow/mshadow/extension/flip.h | 132 + .../mshadow/mshadow/extension/implicit_gemm.h | 128 + 3rdparty/mshadow/mshadow/extension/mask.h | 97 + 3rdparty/mshadow/mshadow/extension/mirror.h | 62 + 3rdparty/mshadow/mshadow/extension/one_hot.h | 87 + .../mshadow/extension/pack_col2patch.h | 154 ++ 3rdparty/mshadow/mshadow/extension/pad.h | 111 + 3rdparty/mshadow/mshadow/extension/range.h | 118 + .../mshadow/extension/reduce_with_axis.h | 136 + .../mshadow/mshadow/extension/reduceto1d.h | 104 + 3rdparty/mshadow/mshadow/extension/reshape.h | 87 + 3rdparty/mshadow/mshadow/extension/slice.h | 156 ++ 3rdparty/mshadow/mshadow/extension/slice_ex.h | 135 + .../mshadow/mshadow/extension/spatial_pool.h | 152 ++ .../mshadow/extension/spatial_unpool.h | 135 + .../extension/spatial_upsampling_nearest.h | 71 + 3rdparty/mshadow/mshadow/extension/swapaxis.h | 110 + 3rdparty/mshadow/mshadow/extension/take.h | 99 + .../mshadow/mshadow/extension/take_grad.h | 111 + .../mshadow/mshadow/extension/transpose.h | 200 ++ .../mshadow/extension/unpack_patch2col.h | 151 ++ 3rdparty/mshadow/mshadow/half.h | 354 +++ 3rdparty/mshadow/mshadow/half2.h | 143 + 3rdparty/mshadow/mshadow/io.h | 137 + 3rdparty/mshadow/mshadow/logging.h | 234 ++ 3rdparty/mshadow/mshadow/packet-inl.h | 413 +++ 3rdparty/mshadow/mshadow/packet/plain-inl.h | 76 + 3rdparty/mshadow/mshadow/packet/sse-inl.h | 147 + 3rdparty/mshadow/mshadow/random.h | 570 ++++ 3rdparty/mshadow/mshadow/stream_gpu-inl.h | 214 ++ 3rdparty/mshadow/mshadow/tensor.h | 1081 ++++++++ 3rdparty/mshadow/mshadow/tensor_container.h | 208 ++ 3rdparty/mshadow/mshadow/tensor_cpu-inl.h | 627 +++++ 3rdparty/mshadow/mshadow/tensor_gpu-inl.h | 245 ++ 3rdparty/mshadow/scripts/travis_script.sh | 19 + 3rdparty/mshadow/test/Makefile | 35 + 3rdparty/mshadow/test/pairtest.cu | 105 + 3rdparty/mshadow/test/pool.cu | 69 + 3rdparty/mshadow/test/reshape.cu | 74 + 3rdparty/mshadow/test/test.cu | 79 + 3rdparty/mshadow/test/test.h | 67 + 3rdparty/mshadow/test/unpack.cu | 85 + 115 files changed, 21728 insertions(+) create mode 100644 3rdparty/mshadow/.gitignore create mode 100644 3rdparty/mshadow/.travis.yml create mode 100644 3rdparty/mshadow/CHANGES.md create mode 100644 3rdparty/mshadow/CMakeLists.txt create mode 100644 3rdparty/mshadow/LICENSE create mode 100644 3rdparty/mshadow/README.md create mode 100644 3rdparty/mshadow/cmake/Cuda.cmake create mode 100644 3rdparty/mshadow/cmake/Utils.cmake create mode 100644 3rdparty/mshadow/cmake/mshadow.cmake create mode 100644 3rdparty/mshadow/cmake/mshadowUtils.cmake create mode 100644 3rdparty/mshadow/doc/Doxyfile create mode 100644 
3rdparty/mshadow/doc/README.md create mode 100755 3rdparty/mshadow/doc/mkdoc.sh create mode 100644 3rdparty/mshadow/guide/.gitignore create mode 100644 3rdparty/mshadow/guide/Makefile create mode 100644 3rdparty/mshadow/guide/README.md create mode 100644 3rdparty/mshadow/guide/basic.cpp create mode 100644 3rdparty/mshadow/guide/basic_stream.cu create mode 100644 3rdparty/mshadow/guide/config.mk create mode 100644 3rdparty/mshadow/guide/defop.cpp create mode 100644 3rdparty/mshadow/guide/exp-template/.gitignore create mode 100644 3rdparty/mshadow/guide/exp-template/Makefile create mode 100644 3rdparty/mshadow/guide/exp-template/README.md create mode 100644 3rdparty/mshadow/guide/exp-template/exp_lazy.cpp create mode 100644 3rdparty/mshadow/guide/exp-template/exp_template.cpp create mode 100644 3rdparty/mshadow/guide/exp-template/exp_template_op.cpp create mode 100644 3rdparty/mshadow/guide/mshadow-ps/.gitignore create mode 100644 3rdparty/mshadow/guide/mshadow-ps/2-levels.png create mode 100644 3rdparty/mshadow/guide/mshadow-ps/Makefile create mode 100644 3rdparty/mshadow/guide/mshadow-ps/README.md create mode 100644 3rdparty/mshadow/guide/mshadow-ps/config.mk create mode 100644 3rdparty/mshadow/guide/mshadow-ps/dbstr.h create mode 100644 3rdparty/mshadow/guide/mshadow-ps/dist_async_sum-inl.h create mode 100644 3rdparty/mshadow/guide/mshadow-ps/dist_async_sum.cpp create mode 100755 3rdparty/mshadow/guide/mshadow-ps/local.sh create mode 100644 3rdparty/mshadow/guide/mshadow-ps/local_sum-inl.h create mode 100644 3rdparty/mshadow/guide/mshadow-ps/local_sum.cpp create mode 100644 3rdparty/mshadow/guide/mshadow-ps/local_sum.cu create mode 100644 3rdparty/mshadow/guide/neuralnet/Makefile create mode 100644 3rdparty/mshadow/guide/neuralnet/README.md create mode 100644 3rdparty/mshadow/guide/neuralnet/config.mk create mode 100644 3rdparty/mshadow/guide/neuralnet/convnet.cu create mode 100644 3rdparty/mshadow/guide/neuralnet/nnet.cu create mode 100644 3rdparty/mshadow/guide/neuralnet/nnet_ps.cu create mode 100644 3rdparty/mshadow/guide/neuralnet/util.h create mode 100644 3rdparty/mshadow/make/README.md create mode 100644 3rdparty/mshadow/make/mshadow.mk create mode 100644 3rdparty/mshadow/mshadow-ps/.gitignore create mode 100644 3rdparty/mshadow/mshadow-ps/README.md create mode 100644 3rdparty/mshadow/mshadow-ps/mshadow_ps.h create mode 100644 3rdparty/mshadow/mshadow-ps/ps_dist-inl.h create mode 100644 3rdparty/mshadow/mshadow-ps/ps_local-inl.h create mode 100644 3rdparty/mshadow/mshadow-ps/ps_rabit-inl.h create mode 100644 3rdparty/mshadow/mshadow-ps/thread.h create mode 100644 3rdparty/mshadow/mshadow-ps/thread_util.h create mode 100644 3rdparty/mshadow/mshadow/README.md create mode 100755 3rdparty/mshadow/mshadow/base.h create mode 100644 3rdparty/mshadow/mshadow/cuda/reduce.cuh create mode 100755 3rdparty/mshadow/mshadow/cuda/tensor_gpu-inl.cuh create mode 100644 3rdparty/mshadow/mshadow/dot_engine-inl.h create mode 100644 3rdparty/mshadow/mshadow/expr_engine-inl.h create mode 100644 3rdparty/mshadow/mshadow/expr_scalar-inl.h create mode 100644 3rdparty/mshadow/mshadow/expression.h create mode 100644 3rdparty/mshadow/mshadow/extension.h create mode 100644 3rdparty/mshadow/mshadow/extension/broadcast.h create mode 100644 3rdparty/mshadow/mshadow/extension/broadcast_with_axis.h create mode 100644 3rdparty/mshadow/mshadow/extension/channel_pool.h create mode 100644 3rdparty/mshadow/mshadow/extension/channel_unpool.h create mode 100644 3rdparty/mshadow/mshadow/extension/choose.h create mode 100644 
3rdparty/mshadow/mshadow/extension/complex.h create mode 100644 3rdparty/mshadow/mshadow/extension/concat.h create mode 100644 3rdparty/mshadow/mshadow/extension/crop.h create mode 100644 3rdparty/mshadow/mshadow/extension/fill.h create mode 100644 3rdparty/mshadow/mshadow/extension/flip.h create mode 100644 3rdparty/mshadow/mshadow/extension/implicit_gemm.h create mode 100644 3rdparty/mshadow/mshadow/extension/mask.h create mode 100644 3rdparty/mshadow/mshadow/extension/mirror.h create mode 100644 3rdparty/mshadow/mshadow/extension/one_hot.h create mode 100644 3rdparty/mshadow/mshadow/extension/pack_col2patch.h create mode 100644 3rdparty/mshadow/mshadow/extension/pad.h create mode 100644 3rdparty/mshadow/mshadow/extension/range.h create mode 100644 3rdparty/mshadow/mshadow/extension/reduce_with_axis.h create mode 100644 3rdparty/mshadow/mshadow/extension/reduceto1d.h create mode 100644 3rdparty/mshadow/mshadow/extension/reshape.h create mode 100644 3rdparty/mshadow/mshadow/extension/slice.h create mode 100644 3rdparty/mshadow/mshadow/extension/slice_ex.h create mode 100644 3rdparty/mshadow/mshadow/extension/spatial_pool.h create mode 100644 3rdparty/mshadow/mshadow/extension/spatial_unpool.h create mode 100644 3rdparty/mshadow/mshadow/extension/spatial_upsampling_nearest.h create mode 100644 3rdparty/mshadow/mshadow/extension/swapaxis.h create mode 100644 3rdparty/mshadow/mshadow/extension/take.h create mode 100644 3rdparty/mshadow/mshadow/extension/take_grad.h create mode 100644 3rdparty/mshadow/mshadow/extension/transpose.h create mode 100644 3rdparty/mshadow/mshadow/extension/unpack_patch2col.h create mode 100644 3rdparty/mshadow/mshadow/half.h create mode 100755 3rdparty/mshadow/mshadow/half2.h create mode 100644 3rdparty/mshadow/mshadow/io.h create mode 100644 3rdparty/mshadow/mshadow/logging.h create mode 100644 3rdparty/mshadow/mshadow/packet-inl.h create mode 100644 3rdparty/mshadow/mshadow/packet/plain-inl.h create mode 100644 3rdparty/mshadow/mshadow/packet/sse-inl.h create mode 100644 3rdparty/mshadow/mshadow/random.h create mode 100644 3rdparty/mshadow/mshadow/stream_gpu-inl.h create mode 100755 3rdparty/mshadow/mshadow/tensor.h create mode 100644 3rdparty/mshadow/mshadow/tensor_container.h create mode 100755 3rdparty/mshadow/mshadow/tensor_cpu-inl.h create mode 100755 3rdparty/mshadow/mshadow/tensor_gpu-inl.h create mode 100755 3rdparty/mshadow/scripts/travis_script.sh create mode 100644 3rdparty/mshadow/test/Makefile create mode 100644 3rdparty/mshadow/test/pairtest.cu create mode 100644 3rdparty/mshadow/test/pool.cu create mode 100644 3rdparty/mshadow/test/reshape.cu create mode 100644 3rdparty/mshadow/test/test.cu create mode 100644 3rdparty/mshadow/test/test.h create mode 100644 3rdparty/mshadow/test/unpack.cu diff --git a/3rdparty/mshadow/.gitignore b/3rdparty/mshadow/.gitignore new file mode 100644 index 000000000000..3da5172aeb2a --- /dev/null +++ b/3rdparty/mshadow/.gitignore @@ -0,0 +1,21 @@ +# Compiled Object files +*.slo +*.lo +*.o + +# Compiled Dynamic libraries +*.so +*.dylib + +# Compiled Static libraries +*.lai +*.la +*.a +*~ +doc/html +doc/latex +rabit +dmlc-core +*.db +*.bak +build diff --git a/3rdparty/mshadow/.travis.yml b/3rdparty/mshadow/.travis.yml new file mode 100644 index 000000000000..a4d6223d8ea7 --- /dev/null +++ b/3rdparty/mshadow/.travis.yml @@ -0,0 +1,43 @@ +# disable sudo to use container based build +sudo: false + +# Use Build Matrix to do lint and build seperately +env: + matrix: + - TASK=lint LINT_LANG=cpp + - TASK=doc + - TASK=build 
CXX=g++ + +# dependent apt packages +addons: + apt: + packages: + - doxygen + - wget + - unzip + - libblas-dev + - python3-pip + +before_install: + - git clone https://github.com/dmlc/dmlc-core + - export TRAVIS=dmlc-core/scripts/travis + - source ${TRAVIS}/travis_setup_env.sh + +install: + - pip3 install --upgrade pip --user + - pip3 install --user cpplint pylint + +script: scripts/travis_script.sh + +before_cache: + - ${TRAVIS}/travis_before_cache.sh + +cache: + directories: + - ${HOME}/.cache/usr + +notifications: + email: + on_success: change + on_failure: always + diff --git a/3rdparty/mshadow/CHANGES.md b/3rdparty/mshadow/CHANGES.md new file mode 100644 index 000000000000..03bb16936acd --- /dev/null +++ b/3rdparty/mshadow/CHANGES.md @@ -0,0 +1,12 @@ +Change Log +===== + +mshadow-1.0 +===== +* Initial release + +mshadow-2.0: in progress +===== +* Support multiple data type +* Great refactoring of code +* Parameter server interface for MultiGPU and distributed learning diff --git a/3rdparty/mshadow/CMakeLists.txt b/3rdparty/mshadow/CMakeLists.txt new file mode 100644 index 000000000000..b89e9028a77b --- /dev/null +++ b/3rdparty/mshadow/CMakeLists.txt @@ -0,0 +1,6 @@ +cmake_minimum_required(VERSION 2.8.7) + +project(mshadow C CXX) + +set(mshadow_LINT_DIRS mshadow mshadow-ps) +add_custom_target(mshadow_lint COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE} -DLINT_DIRS=${mshadow_LINT_DIRS} -DPROJECT_SOURCE_DIR=${PROJECT_SOURCE_DIR} -DPROJECT_NAME=mshadow -P ${PROJECT_SOURCE_DIR}/../dmlc-core/cmake/lint.cmake) diff --git a/3rdparty/mshadow/LICENSE b/3rdparty/mshadow/LICENSE new file mode 100644 index 000000000000..ebf9611d76cd --- /dev/null +++ b/3rdparty/mshadow/LICENSE @@ -0,0 +1,13 @@ +Copyright (c) 2014 by Contributors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/3rdparty/mshadow/README.md b/3rdparty/mshadow/README.md new file mode 100644 index 000000000000..cc18964a65f7 --- /dev/null +++ b/3rdparty/mshadow/README.md @@ -0,0 +1,37 @@ +mshadow: Matrix Shadow +====== +[![Build Status](https://travis-ci.org/dmlc/mshadow.svg?branch=master)](https://travis-ci.org/dmlc/mshadow) + +MShadow is a lightweight CPU/GPU Matrix/Tensor Template Library in C++/CUDA. The goal of mshadow is to support ***efficient***, +***device invariant*** and ***simple*** tensor library for machine learning project that aims for maximum performance and control, while also emphasize simplicity. + +MShadow also provides interface that allows writing Multi-GPU and distributed deep learning programs in an easy and unified way. + +* [Contributors](https://github.com/tqchen/mshadow/graphs/contributors) +* [Tutorial](guide) +* [Documentation](doc) +* [Parameter Server Interface for GPU Tensor](guide/mshadow-ps) + +Features +-------- +* Efficient: all the expression you write will be lazily evaluated and compiled into optimized code + - No temporal memory allocation will happen for expression you write + - mshadow will generate specific kernel for every expression you write in compile time. 
+* Device invariant: you can write one code and it will run on both CPU and GPU +* Simple: mshadow allows you to write machine learning code using expressions. +* Whitebox: put a float* into the Tensor struct and take the benefit of the package, no memory allocation is happened unless explicitly called +* Lightweight library: light amount of code to support frequently used functions in machine learning +* Extendable: user can write simple functions that plugs into mshadow and run on GPU/CPU, no experience in CUDA is required. +* MultiGPU and Distributed ML: mshadow-ps interface allows user to write efficient MultiGPU and distributed programs in an unified way. + +Version +------- +* This version mshadow-2.x, there are a lot of changes in the interface and it is not backward compatible with mshadow-1.0 + - If you use older version of cxxnet, you will need to use the legacy mshadow code +* For legacy code, refer to [Here](https://github.com/tqchen/mshadow/releases/tag/v1.1) +* Change log in [CHANGES.md](CHANGES.md) + +Projects Using MShadow +---------------------- +* [MXNet: Efficient and Flexible Distributed Deep Learning Framework](https://github.com/dmlc/mxnet) +* [CXXNet: A lightweight C++ based deep learnig framework](https://github.com/dmlc/cxxnet) diff --git a/3rdparty/mshadow/cmake/Cuda.cmake b/3rdparty/mshadow/cmake/Cuda.cmake new file mode 100644 index 000000000000..bc09a3905076 --- /dev/null +++ b/3rdparty/mshadow/cmake/Cuda.cmake @@ -0,0 +1,324 @@ +if(NOT USE_CUDA) + return() +endif() + +include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11) + +################################################################################################ +# A function for automatic detection of GPUs installed (if autodetection is enabled) +# Usage: +# mshadow_detect_installed_gpus(out_variable) +function(mshadow_detect_installed_gpus out_variable) +set(CUDA_gpu_detect_output "") + if(NOT CUDA_gpu_detect_output) + message(STATUS "Running GPU architecture autodetection") + set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) + + file(WRITE ${__cufile} "" + "#include \n" + "#include \n" + "using namespace std;\n" + "int main()\n" + "{\n" + " int count = 0;\n" + " if (cudaSuccess != cudaGetDeviceCount(&count)) { return -1; }\n" + " if (count == 0) { cerr << \"No cuda devices detected\" << endl; return -1; }\n" + " for (int device = 0; device < count; ++device)\n" + " {\n" + " cudaDeviceProp prop;\n" + " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" + " std::printf(\"%d.%d \", prop.major, prop.minor);\n" + " }\n" + " return 0;\n" + "}\n") + if(MSVC) + #find vcvarsall.bat and run it building msvc environment + get_filename_component(MY_COMPILER_DIR ${CMAKE_CXX_COMPILER} DIRECTORY) + find_file(MY_VCVARSALL_BAT vcvarsall.bat "${MY_COMPILER_DIR}/.." 
"${MY_COMPILER_DIR}/../..") + execute_process(COMMAND ${MY_VCVARSALL_BAT} && ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out + OUTPUT_STRIP_TRAILING_WHITESPACE) + else() + if(CUDA_LIBRARY_PATH) + set(CUDA_LINK_LIBRARY_PATH "-L${CUDA_LIBRARY_PATH}") + endif() + execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} ${CUDA_LINK_LIBRARY_PATH} + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out + OUTPUT_STRIP_TRAILING_WHITESPACE) + endif() + if(__nvcc_res EQUAL 0) + # nvcc outputs text containing line breaks when building with MSVC. + # The line below prevents CMake from inserting a variable with line + # breaks in the cache + message(STATUS "Found CUDA arch ${__nvcc_out}") + string(REGEX MATCH "([1-9].[0-9])" __nvcc_out "${__nvcc_out}") + string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}") + set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architetures from mshadow_detect_gpus tool" FORCE) + else() + message(WARNING "Running GPU detection script with nvcc failed: ${__nvcc_out}") + endif() + endif() + + if(NOT CUDA_gpu_detect_output) + message(WARNING "Automatic GPU detection failed. Building for all known architectures (${mshadow_known_gpu_archs}).") + set(${out_variable} ${mshadow_known_gpu_archs} PARENT_SCOPE) + else() + set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) + endif() +endfunction() + + +################################################################################################ +# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME +# Usage: +# mshadow_select_nvcc_arch_flags(out_variable) +function(mshadow_select_nvcc_arch_flags out_variable) + # List of arch names + set(__archs_names "Fermi" "Kepler" "Maxwell" "Pascal" "Volta" "All" "Manual") + set(__archs_name_default "All") + if(NOT CMAKE_CROSSCOMPILING) + list(APPEND __archs_names "Auto") + set(__archs_name_default "Auto") + endif() + + # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui) + set(CUDA_ARCH_NAME ${__archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.") + set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${__archs_names} ) + mark_as_advanced(CUDA_ARCH_NAME) + + # verify CUDA_ARCH_NAME value + if(NOT ";${__archs_names};" MATCHES ";${CUDA_ARCH_NAME};") + string(REPLACE ";" ", " __archs_names "${__archs_names}") + message(FATAL_ERROR "Only ${__archs_names} architeture names are supported.") + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Manual") + set(CUDA_ARCH_BIN ${mshadow_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") + set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) + else() + unset(CUDA_ARCH_BIN CACHE) + unset(CUDA_ARCH_PTX CACHE) + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Fermi") + set(__cuda_arch_bin "20 21(20)") + elseif(${CUDA_ARCH_NAME} STREQUAL "Kepler") + set(__cuda_arch_bin "30 35") + elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") + set(__cuda_arch_bin "50") + elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") + set(__cuda_arch_bin "60 61") + elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") + set(__cuda_arch_bin "70") + elseif(${CUDA_ARCH_NAME} STREQUAL "All") + set(__cuda_arch_bin ${mshadow_known_gpu_archs}) + elseif(${CUDA_ARCH_NAME} 
STREQUAL "Auto") + mshadow_detect_installed_gpus(__cuda_arch_bin) + else() # (${CUDA_ARCH_NAME} STREQUAL "Manual") + set(__cuda_arch_bin ${CUDA_ARCH_BIN}) + endif() + + # remove dots and convert to lists + string(REGEX REPLACE "\\." "" __cuda_arch_bin "${__cuda_arch_bin}") + string(REGEX REPLACE "\\." "" __cuda_arch_ptx "${CUDA_ARCH_PTX}") + string(REGEX MATCHALL "[0-9()]+" __cuda_arch_bin "${__cuda_arch_bin}") + string(REGEX MATCHALL "[0-9]+" __cuda_arch_ptx "${__cuda_arch_ptx}") + mshadow_list_unique(__cuda_arch_bin __cuda_arch_ptx) + + set(__nvcc_flags "") + set(__nvcc_archs_readable "") + + # Tell NVCC to add binaries for the specified GPUs + foreach(__arch ${__cuda_arch_bin}) + if(__arch MATCHES "([0-9]+)\\(([0-9]+)\\)") + # User explicitly specified PTX for the concrete BIN + list(APPEND __nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) + list(APPEND __nvcc_archs_readable sm_${CMAKE_MATCH_1}) + else() + # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN + list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=sm_${__arch}) + list(APPEND __nvcc_archs_readable sm_${__arch}) + endif() + endforeach() + + # Tell NVCC to add PTX intermediate code for the specified architectures + foreach(__arch ${__cuda_arch_ptx}) + list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=compute_${__arch}) + list(APPEND __nvcc_archs_readable compute_${__arch}) + endforeach() + + string(REPLACE ";" " " __nvcc_archs_readable "${__nvcc_archs_readable}") + set(${out_variable} ${__nvcc_flags} PARENT_SCOPE) + set(${out_variable}_readable ${__nvcc_archs_readable} PARENT_SCOPE) +endfunction() + +################################################################################################ +# Short command for cuda comnpilation +# Usage: +# mshadow_cuda_compile( ) +macro(mshadow_cuda_compile objlist_variable) + foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG) + set(${var}_backup_in_cuda_compile_ "${${var}}") + + # we remove /EHa as it generates warnings under windows + string(REPLACE "/EHa" "" ${var} "${${var}}") + + endforeach() + if(UNIX OR APPLE) + list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC) + endif() + + if(APPLE) + list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wno-unused-function) + endif() + + set(CUDA_NVCC_FLAGS_DEBUG "${CUDA_NVCC_FLAGS_DEBUG} -G") + + if(MSVC) + # disable noisy warnings: + # 4819: The file contains a character that cannot be represented in the current code page (number). 
+    list(APPEND CUDA_NVCC_FLAGS -Xcompiler "/wd4819")
+    foreach(flag_var
+        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+      if(${flag_var} MATCHES "/MD")
+        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+      endif(${flag_var} MATCHES "/MD")
+    endforeach(flag_var)
+  endif()
+
+  # If the build system is a container, make sure the nvcc intermediate files
+  # go into the build output area rather than in /tmp, which may run out of space
+  if(IS_CONTAINER_BUILD)
+    set(CUDA_NVCC_INTERMEDIATE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+    message(STATUS "Container build enabled, so nvcc intermediate files in: ${CUDA_NVCC_INTERMEDIATE_DIR}")
+    list(APPEND CUDA_NVCC_FLAGS "--keep --keep-dir ${CUDA_NVCC_INTERMEDIATE_DIR}")
+  endif()
+
+  cuda_compile(cuda_objcs ${ARGN})
+
+  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
+    set(${var} "${${var}_backup_in_cuda_compile_}")
+    unset(${var}_backup_in_cuda_compile_)
+  endforeach()
+
+  set(${objlist_variable} ${cuda_objcs})
+endmacro()
+
+################################################################################################
+# Short command for cuDNN detection. We believe it will soon be part of the CUDA toolkit
+# distribution, which is why this is just a macro here and not a FindcuDNN.cmake file.
+# Usage:
+#   detect_cuDNN()
+function(detect_cuDNN)
+  set(CUDNN_ROOT "" CACHE PATH "CUDNN root folder")
+
+  find_path(CUDNN_INCLUDE cudnn.h
+            PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDA_TOOLKIT_INCLUDE}
+            DOC "Path to cuDNN include directory." )
+
+  get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
+  find_library(CUDNN_LIBRARY NAMES libcudnn.so cudnn.lib # libcudnn_static.a
+               PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDNN_INCLUDE} ${__libpath_hist}
+               DOC "Path to cuDNN library.")
+
+  if(CUDNN_INCLUDE AND CUDNN_LIBRARY)
+    set(HAVE_CUDNN TRUE PARENT_SCOPE)
+    set(CUDNN_FOUND TRUE PARENT_SCOPE)
+
+    mark_as_advanced(CUDNN_INCLUDE CUDNN_LIBRARY CUDNN_ROOT)
+    message(STATUS "Found cuDNN (include: ${CUDNN_INCLUDE}, library: ${CUDNN_LIBRARY})")
+  endif()
+endfunction()
+
+
+################################################################################################
+### Non macro section
+################################################################################################
+
+# Try to prime CUDA_TOOLKIT_ROOT_DIR by looking for libcudart.so
+if(NOT CUDA_TOOLKIT_ROOT_DIR)
+  find_library(CUDA_LIBRARY_PATH libcudart.so PATHS ENV LD_LIBRARY_PATH PATH_SUFFIXES lib lib64)
+  if(CUDA_LIBRARY_PATH)
+    get_filename_component(CUDA_LIBRARY_PATH ${CUDA_LIBRARY_PATH} DIRECTORY)
+    set(CUDA_TOOLKIT_ROOT_DIR "${CUDA_LIBRARY_PATH}/..")
+  endif()
+endif()
+
+find_package(CUDA 5.5 QUIET REQUIRED)
+find_cuda_helper_libs(curand)  # cmake 2.8.7 compatibility which doesn't search for curand
+
+if(NOT CUDA_FOUND)
+  return()
+endif()
+
+set(HAVE_CUDA TRUE)
+message(STATUS "CUDA detected: " ${CUDA_VERSION})
+include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
+list(APPEND mshadow_LINKER_LIBS ${CUDA_CUDART_LIBRARY}
+                                ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
+
+# Known NVIDIA GPU architectures mshadow can be compiled for.
+# This list will be used for CUDA_ARCH_NAME = All option
+if(CUDA_ARCH_ALL)
+  set(mshadow_known_gpu_archs "${CUDA_ARCH_ALL}")
+else()
+  if(${CUDA_VERSION} EQUAL 9.0 OR ${CUDA_VERSION} GREATER 9.0)
+    set(mshadow_known_gpu_archs "30 35 50 52 60 61 70")
+  elseif(${CUDA_VERSION} EQUAL 8.0 OR ${CUDA_VERSION} GREATER 8.0)
+    set(mshadow_known_gpu_archs "30 35 50 52 60 61")
+  else()
+    set(mshadow_known_gpu_archs "30 35 50 52")
+  endif()
+endif()
+
+# cudnn detection
+if(USE_CUDNN)
+  detect_cuDNN()
+  if(HAVE_CUDNN)
+    add_definitions(-DUSE_CUDNN)
+    include_directories(SYSTEM ${CUDNN_INCLUDE})
+    list(APPEND mshadow_LINKER_LIBS ${CUDNN_LIBRARY})
+  endif()
+endif()
+
+# setting nvcc arch flags
+mshadow_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
+list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
+message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
+
+# Boost 1.55 workaround, see https://svn.boost.org/trac/boost/ticket/9392 or
+# https://github.com/ComputationalRadiationPhysics/picongpu/blob/master/src/picongpu/CMakeLists.txt
+if(Boost_VERSION EQUAL 105500)
+  message(STATUS "Cuda + Boost 1.55: Applying noinline work around")
+  # avoid warning for CMake >= 2.8.12
+  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} \"-DBOOST_NOINLINE=__attribute__((noinline))\" ")
+endif()
+
+# disable some nvcc diagnostics that appear in boost, glog, gflags, opencv, etc.
+foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration set_but_not_used)
+  list(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=${diag})
+endforeach()
+
+# setting default testing device
+if(NOT CUDA_TEST_DEVICE)
+  set(CUDA_TEST_DEVICE -1)
+endif()
+
+mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
+mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
+
+# Handle clang/libc++ issue
+if(APPLE)
+  mshadow_detect_darwin_version(OSX_VERSION)
+
+  # OSX 10.9 and higher uses clang/libc++ by default which is incompatible with old CUDA toolkits
+  if(OSX_VERSION VERSION_GREATER 10.8)
+    # enabled by default if and only if CUDA version is less than 7.0
+    mshadow_option(USE_libstdcpp "Use libstdc++ instead of libc++" (CUDA_VERSION VERSION_LESS 7.0))
+  endif()
+endif()
diff --git a/3rdparty/mshadow/cmake/Utils.cmake b/3rdparty/mshadow/cmake/Utils.cmake
new file mode 100644
index 000000000000..dc464f0092f5
--- /dev/null
+++ b/3rdparty/mshadow/cmake/Utils.cmake
@@ -0,0 +1,398 @@
+################################################################################################
+# Command alias for debugging messages
+# Usage:
+#   dmsg(<message>)
+function(dmsg)
+  message(STATUS ${ARGN})
+endfunction()
+
+################################################################################################
+# Removes duplicates from list(s)
+# Usage:
+#   mshadow_list_unique(<list_variable> [<list_variable>] [...])
+macro(mshadow_list_unique)
+  foreach(__lst ${ARGN})
+    if(${__lst})
+      list(REMOVE_DUPLICATES ${__lst})
+    endif()
+  endforeach()
+endmacro()
+
+################################################################################################
+# Clears variables from list
+# Usage:
+#   mshadow_clear_vars(<variables_list>)
+macro(mshadow_clear_vars)
+  foreach(_var ${ARGN})
+    unset(${_var})
+  endforeach()
+endmacro()
+
+################################################################################################
+# Removes duplicates from string
+# Usage:
+#   mshadow_string_unique(<string_variable>)
+function(mshadow_string_unique __string)
+  if(${__string})
+    set(__list ${${__string}})
+    separate_arguments(__list)
+    list(REMOVE_DUPLICATES __list)
+    foreach(__e ${__list})
+
      set(__str "${__str} ${__e}")
+    endforeach()
+    set(${__string} ${__str} PARENT_SCOPE)
+  endif()
+endfunction()
+
+################################################################################################
+# Prints list element per line
+# Usage:
+#   mshadow_print_list(<list>)
+function(mshadow_print_list)
+  foreach(e ${ARGN})
+    message(STATUS ${e})
+  endforeach()
+endfunction()
+
+################################################################################################
+# Function for merging lists of compiler flags into a single string.
+# Usage:
+#   mshadow_merge_flag_lists(out_variable [<list1>] [<list2>] ...)
+function(mshadow_merge_flag_lists out_var)
+  set(__result "")
+  foreach(__list ${ARGN})
+    foreach(__flag ${${__list}})
+      string(STRIP ${__flag} __flag)
+      set(__result "${__result} ${__flag}")
+    endforeach()
+  endforeach()
+  string(STRIP ${__result} __result)
+  set(${out_var} ${__result} PARENT_SCOPE)
+endfunction()
+
+################################################################################################
+# Converts all paths in list to absolute
+# Usage:
+#   mshadow_convert_absolute_paths(<list_variable>)
+function(mshadow_convert_absolute_paths variable)
+  set(__dlist "")
+  foreach(__s ${${variable}})
+    get_filename_component(__abspath ${__s} ABSOLUTE)
+    list(APPEND __list ${__abspath})
+  endforeach()
+  set(${variable} ${__list} PARENT_SCOPE)
+endfunction()
+
+################################################################################################
+# Reads set of version defines from the header file
+# Usage:
+#   mshadow_parse_header(<file> <file_variable> <define1> <define2> ...)
+macro(mshadow_parse_header FILENAME FILE_VAR)
+  set(vars_regex "")
+  set(__parnet_scope OFF)
+  set(__add_cache OFF)
+  foreach(name ${ARGN})
+    if("${name}" STREQUAL "PARENT_SCOPE")
+      set(__parnet_scope ON)
+    elseif("${name}" STREQUAL "CACHE")
+      set(__add_cache ON)
+    elseif(vars_regex)
+      set(vars_regex "${vars_regex}|${name}")
+    else()
+      set(vars_regex "${name}")
+    endif()
+  endforeach()
+  if(EXISTS "${FILENAME}")
+    file(STRINGS "${FILENAME}" ${FILE_VAR} REGEX "#define[ \t]+(${vars_regex})[ \t]+[0-9]+" )
+  else()
+    unset(${FILE_VAR})
+  endif()
+  foreach(name ${ARGN})
+    if(NOT "${name}" STREQUAL "PARENT_SCOPE" AND NOT "${name}" STREQUAL "CACHE")
+      if(${FILE_VAR})
+        if(${FILE_VAR} MATCHES ".+[ \t]${name}[ \t]+([0-9]+).*")
+          string(REGEX REPLACE ".+[ \t]${name}[ \t]+([0-9]+).*" "\\1" ${name} "${${FILE_VAR}}")
+        else()
+          set(${name} "")
+        endif()
+        if(__add_cache)
+          set(${name} ${${name}} CACHE INTERNAL "${name} parsed from ${FILENAME}" FORCE)
+        elseif(__parnet_scope)
+          set(${name} "${${name}}" PARENT_SCOPE)
+        endif()
+      else()
+        unset(${name} CACHE)
+      endif()
+    endif()
+  endforeach()
+endmacro()
+
+################################################################################################
+# Reads single version define from the header file and parses it
+# Usage:
+#   mshadow_parse_header_single_define(<libname> <header_path> <define_name>)
+function(mshadow_parse_header_single_define LIBNAME HDR_PATH VARNAME)
+  set(${LIBNAME}_H "")
+  if(EXISTS "${HDR_PATH}")
+    file(STRINGS "${HDR_PATH}" ${LIBNAME}_H REGEX "^#define[ \t]+${VARNAME}[ \t]+\"[^\"]*\".*$" LIMIT_COUNT 1)
+  endif()
+
+  if(${LIBNAME}_H)
+    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MAJOR "${${LIBNAME}_H}")
+    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MINOR "${${LIBNAME}_H}")
+    string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_PATCH "${${LIBNAME}_H}")
+    set(${LIBNAME}_VERSION_MAJOR
${${LIBNAME}_VERSION_MAJOR} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION_MINOR ${${LIBNAME}_VERSION_MINOR} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION_PATCH ${${LIBNAME}_VERSION_PATCH} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_MAJOR}.${${LIBNAME}_VERSION_MINOR}.${${LIBNAME}_VERSION_PATCH}" PARENT_SCOPE) + + # append a TWEAK version if it exists: + set(${LIBNAME}_VERSION_TWEAK "") + if("${${LIBNAME}_H}" MATCHES "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.[0-9]+\\.([0-9]+).*$") + set(${LIBNAME}_VERSION_TWEAK "${CMAKE_MATCH_1}" ${ARGN} PARENT_SCOPE) + endif() + if(${LIBNAME}_VERSION_TWEAK) + set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}.${${LIBNAME}_VERSION_TWEAK}" ${ARGN} PARENT_SCOPE) + else() + set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}" ${ARGN} PARENT_SCOPE) + endif() + endif() +endfunction() + +######################################################################################################## +# An option that the user can select. Can accept condition to control when option is available for user. +# Usage: +# mshadow_option( "doc string" [IF ]) +function(mshadow_option variable description value) + set(__value ${value}) + set(__condition "") + set(__varname "__value") + foreach(arg ${ARGN}) + if(arg STREQUAL "IF" OR arg STREQUAL "if") + set(__varname "__condition") + else() + list(APPEND ${__varname} ${arg}) + endif() + endforeach() + unset(__varname) + if("${__condition}" STREQUAL "") + set(__condition 2 GREATER 1) + endif() + + if(${__condition}) + if("${__value}" MATCHES ";") + if(${__value}) + option(${variable} "${description}" ON) + else() + option(${variable} "${description}" OFF) + endif() + elseif(DEFINED ${__value}) + if(${__value}) + option(${variable} "${description}" ON) + else() + option(${variable} "${description}" OFF) + endif() + else() + option(${variable} "${description}" ${__value}) + endif() + else() + unset(${variable} CACHE) + endif() +endfunction() + +################################################################################################ +# Utility macro for comparing two lists. Used for CMake debugging purposes +# Usage: +# mshadow_compare_lists( [description]) +function(mshadow_compare_lists list1 list2 desc) + set(__list1 ${${list1}}) + set(__list2 ${${list2}}) + list(SORT __list1) + list(SORT __list2) + list(LENGTH __list1 __len1) + list(LENGTH __list2 __len2) + + if(NOT ${__len1} EQUAL ${__len2}) + message(FATAL_ERROR "Lists are not equal. ${__len1} != ${__len2}. ${desc}") + endif() + + foreach(__i RANGE 1 ${__len1}) + math(EXPR __index "${__i}- 1") + list(GET __list1 ${__index} __item1) + list(GET __list2 ${__index} __item2) + if(NOT ${__item1} STREQUAL ${__item2}) + message(FATAL_ERROR "Lists are not equal. Differ at element ${__index}. 
${desc}") + endif() + endforeach() +endfunction() + +################################################################################################ +# Command for disabling warnings for different platforms (see below for gcc and VisualStudio) +# Usage: +# mshadow_warnings_disable( -Wshadow /wd4996 ..,) +macro(mshadow_warnings_disable) + set(_flag_vars "") + set(_msvc_warnings "") + set(_gxx_warnings "") + + foreach(arg ${ARGN}) + if(arg MATCHES "^CMAKE_") + list(APPEND _flag_vars ${arg}) + elseif(arg MATCHES "^/wd") + list(APPEND _msvc_warnings ${arg}) + elseif(arg MATCHES "^-W") + list(APPEND _gxx_warnings ${arg}) + endif() + endforeach() + + if(NOT _flag_vars) + set(_flag_vars CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + endif() + + if(MSVC AND _msvc_warnings) + foreach(var ${_flag_vars}) + foreach(warning ${_msvc_warnings}) + set(${var} "${${var}} ${warning}") + endforeach() + endforeach() + elseif((CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) AND _gxx_warnings) + foreach(var ${_flag_vars}) + foreach(warning ${_gxx_warnings}) + if(NOT warning MATCHES "^-Wno-") + string(REPLACE "${warning}" "" ${var} "${${var}}") + string(REPLACE "-W" "-Wno-" warning "${warning}") + endif() + set(${var} "${${var}} ${warning}") + endforeach() + endforeach() + endif() + mshadow_clear_vars(_flag_vars _msvc_warnings _gxx_warnings) +endmacro() + +################################################################################################ +# Helper function get current definitions +# Usage: +# mshadow_get_current_definitions() +function(mshadow_get_current_definitions definitions_var) + get_property(current_definitions DIRECTORY PROPERTY COMPILE_DEFINITIONS) + set(result "") + + foreach(d ${current_definitions}) + list(APPEND result -D${d}) + endforeach() + + mshadow_list_unique(result) + set(${definitions_var} ${result} PARENT_SCOPE) +endfunction() + +################################################################################################ +# Helper function get current includes/definitions +# Usage: +# mshadow_get_current_cflags() +function(mshadow_get_current_cflags cflags_var) + get_property(current_includes DIRECTORY PROPERTY INCLUDE_DIRECTORIES) + mshadow_convert_absolute_paths(current_includes) + mshadow_get_current_definitions(cflags) + + foreach(i ${current_includes}) + list(APPEND cflags "-I${i}") + endforeach() + + mshadow_list_unique(cflags) + set(${cflags_var} ${cflags} PARENT_SCOPE) +endfunction() + +################################################################################################ +# Helper function to parse current linker libs into link directories, libflags and osx frameworks +# Usage: +# mshadow_parse_linker_libs( ) +function(mshadow_parse_linker_libs mshadow_LINKER_LIBS_variable folders_var flags_var frameworks_var) + + set(__unspec "") + set(__debug "") + set(__optimized "") + set(__framework "") + set(__varname "__unspec") + + # split libs into debug, optimized, unspecified and frameworks + foreach(list_elem ${${mshadow_LINKER_LIBS_variable}}) + if(list_elem STREQUAL "debug") + set(__varname "__debug") + elseif(list_elem STREQUAL "optimized") + set(__varname "__optimized") + elseif(list_elem MATCHES "^-framework[ \t]+([^ \t].*)") + list(APPEND __framework -framework ${CMAKE_MATCH_1}) + else() + list(APPEND ${__varname} ${list_elem}) + set(__varname "__unspec") + endif() + endforeach() + + # attach debug or optimized libs to unspecified according to current configuration + if(CMAKE_BUILD_TYPE MATCHES "Debug") + set(__libs ${__unspec} ${__debug}) + else() + 
set(__libs ${__unspec} ${__optimized}) + endif() + + set(libflags "") + set(folders "") + + # convert linker libraries list to link flags + foreach(lib ${__libs}) + if(TARGET ${lib}) + list(APPEND folders $) + list(APPEND libflags -l${lib}) + elseif(lib MATCHES "^-l.*") + list(APPEND libflags ${lib}) + elseif(IS_ABSOLUTE ${lib}) + get_filename_component(name_we ${lib} NAME_WE) + get_filename_component(folder ${lib} PATH) + + string(REGEX MATCH "^lib(.*)" __match ${name_we}) + list(APPEND libflags -l${CMAKE_MATCH_1}) + list(APPEND folders ${folder}) + else() + message(FATAL_ERROR "Logic error. Need to update cmake script") + endif() + endforeach() + + mshadow_list_unique(libflags folders) + + set(${folders_var} ${folders} PARENT_SCOPE) + set(${flags_var} ${libflags} PARENT_SCOPE) + set(${frameworks_var} ${__framework} PARENT_SCOPE) +endfunction() + +################################################################################################ +# Helper function to detect Darwin version, i.e. 10.8, 10.9, 10.10, .... +# Usage: +# mshadow_detect_darwin_version() +function(mshadow_detect_darwin_version output_var) + if(APPLE) + execute_process(COMMAND /usr/bin/sw_vers -productVersion + RESULT_VARIABLE __sw_vers OUTPUT_VARIABLE __sw_vers_out + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + + set(${output_var} ${__sw_vers_out} PARENT_SCOPE) + else() + set(${output_var} "" PARENT_SCOPE) + endif() +endfunction() + +################################################################################################ +# Convenient command to setup source group for IDEs that support this feature (VS, XCode) +# Usage: +# caffe_source_group( GLOB[_RECURSE] ) +function(mshadow_source_group group) + cmake_parse_arguments(CAFFE_SOURCE_GROUP "" "" "GLOB;GLOB_RECURSE" ${ARGN}) + if(CAFFE_SOURCE_GROUP_GLOB) + file(GLOB srcs1 ${CAFFE_SOURCE_GROUP_GLOB}) + source_group(${group} FILES ${srcs1}) + endif() + + if(CAFFE_SOURCE_GROUP_GLOB_RECURSE) + file(GLOB_RECURSE srcs2 ${CAFFE_SOURCE_GROUP_GLOB_RECURSE}) + source_group(${group} FILES ${srcs2}) + endif() +endfunction() \ No newline at end of file diff --git a/3rdparty/mshadow/cmake/mshadow.cmake b/3rdparty/mshadow/cmake/mshadow.cmake new file mode 100644 index 000000000000..1ef76988d8d0 --- /dev/null +++ b/3rdparty/mshadow/cmake/mshadow.cmake @@ -0,0 +1,91 @@ +set(mshadow_LINKER_LIBS "") + +set(BLAS "Open" CACHE STRING "Selected BLAS library") +set_property(CACHE BLAS PROPERTY STRINGS "Atlas;Open;MKL") + +if(DEFINED USE_BLAS) + set(BLAS "${USE_BLAS}") +else() + if(USE_MKL_IF_AVAILABLE) + if(NOT MKL_FOUND) + find_package(MKL) + endif() + if(MKL_FOUND) + set(BLAS "MKL") + endif() + endif() +endif() + +if(BLAS STREQUAL "Atlas" OR BLAS STREQUAL "atlas") + find_package(Atlas REQUIRED) + include_directories(SYSTEM ${Atlas_INCLUDE_DIR}) + list(APPEND mshadow_LINKER_LIBS ${Atlas_LIBRARIES}) + add_definitions(-DMSHADOW_USE_CBLAS=1) + add_definitions(-DMSHADOW_USE_MKL=0) +elseif(BLAS STREQUAL "Open" OR BLAS STREQUAL "open") + find_package(OpenBLAS REQUIRED) + include_directories(SYSTEM ${OpenBLAS_INCLUDE_DIR}) + list(APPEND mshadow_LINKER_LIBS ${OpenBLAS_LIB}) + add_definitions(-DMSHADOW_USE_CBLAS=1) + add_definitions(-DMSHADOW_USE_MKL=0) +elseif(BLAS STREQUAL "MKL" OR BLAS STREQUAL "mkl") + find_package(MKL REQUIRED) + include_directories(SYSTEM ${MKL_INCLUDE_DIR}) + list(APPEND mshadow_LINKER_LIBS ${MKL_LIBRARIES}) + add_definitions(-DMSHADOW_USE_CBLAS=0) + add_definitions(-DMSHADOW_USE_MKL=1) +elseif(BLAS STREQUAL "apple") + find_package(Accelerate REQUIRED) + 
include_directories(SYSTEM ${Accelerate_INCLUDE_DIR}) + list(APPEND mshadow_LINKER_LIBS ${Accelerate_LIBRARIES}) + add_definitions(-DMSHADOW_USE_MKL=0) + add_definitions(-DMSHADOW_USE_CBLAS=1) +endif() + +if(SUPPORT_MSSE2) + add_definitions(-DMSHADOW_USE_SSE=1) +else() + add_definitions(-DMSHADOW_USE_SSE=0) +endif() + +if(NOT DEFINED SUPPORT_F16C AND NOT MSVC) + check_cxx_compiler_flag("-mf16c" COMPILER_SUPPORT_MF16C) + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + execute_process(COMMAND cat /proc/cpuinfo + COMMAND grep flags + COMMAND grep f16c + OUTPUT_VARIABLE CPU_SUPPORT_F16C) + elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + execute_process(COMMAND sysctl -a + COMMAND grep machdep.cpu.features + COMMAND grep F16C + OUTPUT_VARIABLE CPU_SUPPORT_F16C) + endif() + if(NOT CPU_SUPPORT_F16C) + message("CPU does not support F16C instructions") + endif() + if(CPU_SUPPORT_F16C AND COMPILER_SUPPORT_MF16C) + set(SUPPORT_F16C TRUE) + endif() +endif() + +if(SUPPORT_F16C) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c") +else() + add_definitions(-DMSHADOW_USE_F16C=0) +endif() + +if(USE_CUDA) + find_package(CUDA 5.5 QUIET) + find_cuda_helper_libs(curand) + if(NOT CUDA_FOUND) + message(FATAL_ERROR "-- CUDA is disabled.") + endif() + add_definitions(-DMSHADOW_USE_CUDA=1) + add_definitions(-DMSHADOW_FORCE_STREAM) + include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) + list(APPEND mshadow_LINKER_LIBS ${CUDA_CUDART_LIBRARY} + ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) +else() + add_definitions(-DMSHADOW_USE_CUDA=0) +endif() diff --git a/3rdparty/mshadow/cmake/mshadowUtils.cmake b/3rdparty/mshadow/cmake/mshadowUtils.cmake new file mode 100644 index 000000000000..d4b8bfc89b7a --- /dev/null +++ b/3rdparty/mshadow/cmake/mshadowUtils.cmake @@ -0,0 +1,2 @@ +include("${CMAKE_CURRENT_LIST_DIR}/Utils.cmake") + diff --git a/3rdparty/mshadow/doc/Doxyfile b/3rdparty/mshadow/doc/Doxyfile new file mode 100644 index 000000000000..3e83d471844c --- /dev/null +++ b/3rdparty/mshadow/doc/Doxyfile @@ -0,0 +1,2358 @@ +# Doxyfile 1.8.8 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv +# for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. 
+ +PROJECT_NAME = "mshadow" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify an logo or icon that is included in +# the documentation. The maximum height of the logo should not exceed 55 pixels +# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo +# to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = doc + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. 
Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. 
This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a +# new page for each member. If set to NO, the documentation of a member will be +# part of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 8 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = YES + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. 
In the later case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by by putting a % sign in front of the word +# or globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. 
Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. 
+ +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. When set to YES local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO these classes will be included in the various overviews. This option has +# no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = YES + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES the +# scope will be hidden. +# The default value is: NO. 
+ +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the +# todo list. This list is created by putting \todo commands in the +# documentation. +# The default value is: YES. 
+ +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the +# test list. This list is created by putting \test commands in the +# documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES the list +# will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. 
+ +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO doxygen will only warn about wrong or incomplete parameter +# documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = YES + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. +# Note: If this tag is empty the current directory is searched. 
+ +INPUT = mshadow \ + mshadow-ps + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank the +# following patterns are tested:*.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii, +# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp, +# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown, +# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, +# *.qsf, *.as and *.js. + +FILE_PATTERNS = + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = *-inl.* \ + utils.h \ + thread_util.h \ + thread.h \ + kv_array.h + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = mshadow::expr::Plan* \ + mshadow::expr::*Engine* + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. 
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER ) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# function all documented functions referencing it will be listed.
+# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES, then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see http://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES, then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# compiled with the --with-libclang option. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. 
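+#
+# Illustrative example only (this configuration leaves the option empty):
+# typical compiler flags could hypothetically be passed here as
+#   CLANG_OPTIONS = -std=c++11 -DMSHADOW_USE_CUDA=0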
+ +CLANG_OPTIONS = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefor more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra stylesheet files is of importance (e.g. the last +# stylesheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the stylesheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to NO can help when comparing the output of multiple runs. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: http://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. 
+ +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler ( hhc.exe). If non-empty +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated ( +# YES) or that it should be included in the master .chm file ( NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index ( hhk), content ( hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated ( +# YES) or a normal table of contents ( NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. 
For more information please see Qt Help Project / Namespace +# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. 
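+#
+# For illustration only (not the setting used below): a tree-only navigation
+# layout would combine
+#   DISABLE_INDEX     = YES
+#   GENERATE_TREEVIEW = YES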
+ +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# http://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using prerendered bitmaps. Use this if you do not have LaTeX +# installed or if you want to formulas look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. 
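+#
+# Illustration only (MathJax stays disabled in this configuration): a locally
+# installed copy could hypothetically be enabled with
+#   USE_MATHJAX     = YES
+#   MATHJAX_RELPATH = ../mathjax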
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH = http://www.mathjax.org/mathjax
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/