From a572198920b2ed4480085157e2751e9b6be2e815 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 6 Nov 2018 09:50:09 -0800 Subject: [PATCH 01/43] Support large integer in operators --- src/operator/mxnet_op.h | 46 +++++++++++++++--------------- src/operator/tensor/indexing_op.cc | 26 ++++++++--------- src/operator/tensor/indexing_op.h | 38 ++++++++++++------------ src/operator/tensor/init_op.h | 2 +- 4 files changed, 57 insertions(+), 55 deletions(-) diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index e77569671ebb..a3b13433eb16 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -433,51 +433,51 @@ struct op_with_req { /*! \brief input is one tensor */ template - MSHADOW_XINLINE static void Map(int i, DType *out, const DType *in) { + MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *in) { KERNEL_ASSIGN(out[i], req, OP::Map(in[i])); } /*! \brief inputs are two tensors */ template - MSHADOW_XINLINE static void Map(int i, DType *out, const DType *lhs, const DType *rhs) { + MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *lhs, const DType *rhs) { KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], rhs[i])); } /*! \brief input is tensor and a scalar value */ template - MSHADOW_XINLINE static void Map(int i, DType *out, const DType *in, const DType value) { + MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *in, const DType value) { KERNEL_ASSIGN(out[i], req, OP::Map(in[i], value)); } /*! \brief input is tensor and two scalar value */ template - MSHADOW_XINLINE static void Map(int i, DType *out, const DType *in, + MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *in, const DType value_1, const DType value_2) { KERNEL_ASSIGN(out[i], req, OP::Map(in[i], value_1, value_2)); } /*! \brief No inputs (ie fill to constant value) */ template - MSHADOW_XINLINE static void Map(int i, DType *out) { + MSHADOW_XINLINE static void Map(index_t i, DType *out) { KERNEL_ASSIGN(out[i], req, OP::Map()); } /*! \brief input is single scalar value */ template - MSHADOW_XINLINE static void Map(int i, DType *out, const DType value) { + MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType value) { KERNEL_ASSIGN(out[i], req, OP::Map(value)); } /*! \brief inputs are two tensors and a scalar value */ template - MSHADOW_XINLINE static void Map(int i, DType *out, + MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *input_1, const DType *input_2, const DType value) { KERNEL_ASSIGN(out[i], req, OP::Map(input_1[i], input_2[i], value)); } /*! \brief inputs are three tensors (ie backward grad with binary grad function) */ template - MSHADOW_XINLINE static void Map(int i, DType *out, + MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *input_1, const DType *input_2, const DType *input_3) { @@ -503,21 +503,21 @@ struct Kernel { * \param args Varargs to eventually pass to the OP::Map() functoion */ template - inline static bool Launch(mshadow::Stream *, const int N, Args... args) { + inline static bool Launch(mshadow::Stream *, const size_t N, Args... 
args) { #ifdef _OPENMP const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (omp_threads < 2) { - for (int i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { OP::Map(i, args...); } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { OP::Map(i, args...); } } #else - for (int i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { OP::Map(i, args...); } #endif @@ -536,22 +536,22 @@ struct Kernel { * \param args Varargs to eventually pass to the OP::Map() functoion */ template - static void LaunchTuned(mshadow::Stream *, const int N, Args... args) { + static void LaunchTuned(mshadow::Stream *, const size_t N, Args... args) { #ifdef _OPENMP const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (omp_threads < 2 || !tuned_op::UseOMP( - static_cast(N), static_cast(omp_threads))) { - for (int i = 0; i < N; ++i) { + N, static_cast(omp_threads))) { + for (size_t i = 0; i < N; ++i) { OP::Map(i, args...); } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { OP::Map(i, args...); } } #else - for (int i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { OP::Map(i, args...); } #endif @@ -565,15 +565,15 @@ struct Kernel { * \param args Varargs to eventually pass to the UseOMP() and OP::Map() functions */ template - inline static void LaunchEx(mshadow::Stream *s, const int N, Args... args) { + inline static void LaunchEx(mshadow::Stream *s, const size_t N, Args... args) { #ifdef _OPENMP const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (omp_threads < 2) { OP::Map(0, N, args...); } else { - const int length = (N + omp_threads - 1) / omp_threads; + const auto length = (N + omp_threads - 1) / omp_threads; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i += length) { + for (auto i = 0; i < N; i += length) { OP::Map(i, i + length > N ? N - i : length, args...); } } @@ -595,7 +595,7 @@ struct Kernel { template static MSHADOW_CINLINE typename std::enable_if::value, bool>::type - Launch(mshadow::Stream *s, const int N, DType *dest, Args... args) { + Launch(mshadow::Stream *s, const size_t N, DType *dest, Args... args) { LaunchTuned(s, N, dest, args...); return true; } @@ -613,7 +613,7 @@ struct Kernel { template static MSHADOW_CINLINE typename std::enable_if::value, bool>::type - Launch(mshadow::Stream *s, const int N, DType *dest, Args... args) { + Launch(mshadow::Stream *s, const size_t N, DType *dest, Args... 
args) { LaunchTuned(s, N, dest, args...); return true; } @@ -669,7 +669,7 @@ template struct set_to_int : public tunable { // mxnet_op version (when used directly with Kernel<>::Launch()) */ template - MSHADOW_XINLINE static void Map(int i, DType *out) { + MSHADOW_XINLINE static void Map(index_t i, DType *out) { out[i] = DType(val); } // mshadow_op version (when used with op_with_req<>) diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index 77236e068f86..c39418dbe41d 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -36,7 +36,7 @@ struct TakeCPU { // K is the number of rows of in_data // i is the index of out_data template - MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data, + MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* in_data, const IType* idx, const size_t M, const int64_t K) { int64_t j = static_cast(idx[i]); if (clip) { @@ -420,19 +420,19 @@ inline void SparseEmbeddingOpBackwardRspImpl(const bool deterministic, template inline typename std::enable_if<(!std::is_same::value), void>::type -GatherNDBackwardImpl(int N, int M, int K, +GatherNDBackwardImpl(index_t N, index_t M, index_t K, const mshadow::Shape<10> strides, DType* out, const DType* data, const IType* indices, mshadow::Stream *s) { #pragma omp parallel for - for (int i = 0; i < N; i++) { - int offset = 0; - for (int j = 0; j < M; ++j) { - offset += strides[j] * static_cast(indices[j*N + i]); + for (index_t i = 0; i < N; i++) { + index_t offset = 0; + for (index_t j = 0; j < M; ++j) { + offset += strides[j] * static_cast(indices[j*N + i]); } - for (int j = 0; j < K; ++j) { + for (index_t j = 0; j < K; ++j) { #pragma omp atomic out[offset + j] += data[i * K + j]; } @@ -441,18 +441,18 @@ GatherNDBackwardImpl(int N, int M, int K, template inline typename std::enable_if::value, void>::type -GatherNDBackwardImpl(int N, int M, int K, +GatherNDBackwardImpl(index_t N, index_t M, index_t K, const mshadow::Shape<10> strides, DType* out, const DType* data, const IType* indices, mshadow::Stream *s) { - for (int i = 0; i < N; i++) { - int offset = 0; - for (int j = 0; j < M; ++j) { - offset += strides[j] * static_cast(indices[j*N + i]); + for (index_t i = 0; i < N; i++) { + index_t offset = 0; + for (index_t j = 0; j < M; ++j) { + offset += strides[j] * static_cast(indices[j*N + i]); } - for (int j = 0; j < K; ++j) { + for (index_t j = 0; j < K; ++j) { out[offset + j] += data[i * K + j]; } } diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index 92b6e21018e5..03981574fbf5 100644 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -314,7 +314,7 @@ struct Take { * \param axis axis id */ template - MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data, const IType* idx, + MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* in_data, const IType* idx, const mshadow::Shape<10> in_stride, const mshadow::Shape<10> out_stride, const int in_ndims, const int out_ndims, const int idx_ndims, @@ -361,7 +361,7 @@ struct TakeRspKernel { * \param nnr number of non-zero rows */ template - MSHADOW_XINLINE static void Map(int i, + MSHADOW_XINLINE static void Map(index_t i, const IType* data, DType* out, const RType* weight_idx, @@ -1395,15 +1395,15 @@ inline bool ScatterNDType(const nnvm::NodeAttrs& attrs, struct scatter_nd { template - MSHADOW_XINLINE static void Map(int i, OpReqType req, int N, int M, int K, + 
MSHADOW_XINLINE static void Map(index_t i, OpReqType req, index_t N, index_t M, index_t K, const mshadow::Shape<10> strides, DType* out, const DType* data, const IType* indices) { - int offset = 0; - for (int j = 0; j < M; ++j) { - offset += strides[j] * static_cast(indices[j*N + i]); + index_t offset = 0; + for (index_t j = 0; j < M; ++j) { + offset += strides[j] * static_cast(indices[j*N + i]); } - for (int j = 0; j < K; ++j) { + for (index_t j = 0; j < K; ++j) { KERNEL_ASSIGN(out[offset+j], req, data[i*K + j]); } } @@ -1416,17 +1416,18 @@ void ScatterNDForward(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { using namespace mshadow; + using nnvm::dim_t; CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); if (req[0] == kNullOp) return; mshadow::Stream *s = ctx.get_stream(); const TShape& oshape = outputs[0].shape_; const TShape& ishape = inputs[1].shape_; - int M = ishape[0]; - int N = ishape.Size() / M; - int K = oshape.ProdShape(M, oshape.ndim()); + dim_t M = ishape[0]; + dim_t N = ishape.Size() / M; + dim_t K = oshape.ProdShape(M, oshape.ndim()); mshadow::Shape<10> strides; - for (int i = M-1, stride = K; i >= 0; stride *= oshape[i], --i) strides[i] = stride; + for (dim_t i = M-1, stride = K; i >= 0; stride *= oshape[i], --i) strides[i] = stride; if (kWriteTo == req[0]) { Fill(s, outputs[0], req[0], 0); } @@ -1441,7 +1442,7 @@ void ScatterNDForward(const nnvm::NodeAttrs& attrs, template inline typename std::enable_if<(!std::is_same::value), void>::type -GatherNDBackwardImpl(int N, int M, int K, +GatherNDBackwardImpl(index_t N, index_t M, index_t K, const mshadow::Shape<10> strides, DType* out, const DType* data, @@ -1450,7 +1451,7 @@ GatherNDBackwardImpl(int N, int M, int K, template inline typename std::enable_if::value, void>::type -GatherNDBackwardImpl(int N, int M, int K, +GatherNDBackwardImpl(index_t N, index_t M, index_t K, const mshadow::Shape<10> strides, DType* out, const DType* data, @@ -1458,7 +1459,7 @@ GatherNDBackwardImpl(int N, int M, int K, mshadow::Stream *s); template -inline void GatherNDBackwardImpl(int N, int M, int K, +inline void GatherNDBackwardImpl(index_t N, index_t M, index_t K, const mshadow::Shape<10> strides, DType* out, const DType* data, @@ -1472,17 +1473,18 @@ void GatherNDBackward(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { using namespace mshadow; + using nnvm::dim_t; CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); if (req[0] == kNullOp) return; mshadow::Stream *s = ctx.get_stream(); const TShape& oshape = outputs[0].shape_; const TShape& ishape = inputs[1].shape_; - int M = ishape[0]; - int N = ishape.Size() / M; - int K = oshape.ProdShape(M, oshape.ndim()); + dim_t M = ishape[0]; + dim_t N = ishape.Size() / M; + dim_t K = oshape.ProdShape(M, oshape.ndim()); mshadow::Shape<10> strides; - for (int i = M-1, stride = K; i >= 0; stride *= oshape[i], --i) strides[i] = stride; + for (dim_t i = M-1, stride = K; i >= 0; stride *= oshape[i], --i) strides[i] = stride; if (kWriteTo == req[0]) { Fill(s, outputs[0], req[0], 0); } diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h index 4e52b087f10a..1de623c47df1 100644 --- a/src/operator/tensor/init_op.h +++ b/src/operator/tensor/init_op.h @@ -453,7 +453,7 @@ void EyeFill(const nnvm::NodeAttrs& attrs, struct range_fwd { template - MSHADOW_XINLINE static void Map(int i, int repeat, DType start, DType step, + MSHADOW_XINLINE static void Map(index_t i, int repeat, DType start, DType step, int req, 
DType* out) { KERNEL_ASSIGN(out[i], req, start + (i/repeat) * step); } From e37a06b8d8fb90b6a2761d7bdeba89262161daee Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 7 Nov 2018 06:47:53 -0800 Subject: [PATCH 02/43] fix large array in sum --- src/operator/mxnet_op.h | 22 ++--- src/operator/tensor/broadcast_reduce-inl.h | 94 +++++++++++----------- 2 files changed, 58 insertions(+), 58 deletions(-) diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index a3b13433eb16..e1cbd47bf242 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -289,8 +289,8 @@ inline int get_num_threads(const int N) { /* \brief Compute flattened index given coordinates and shape. */ template -MSHADOW_XINLINE int ravel(const Shape& coord, const Shape& shape) { - int ret = 0; +MSHADOW_XINLINE index_t ravel(const Shape& coord, const Shape& shape) { + index_t ret = 0; #pragma unroll for (int i = 0; i < ndim; ++i) { ret = ret * shape[i] + (shape[i] > coord[i]) * coord[i]; @@ -301,11 +301,11 @@ MSHADOW_XINLINE int ravel(const Shape& coord, const Shape& shape) { /* Compute coordinates from flattened index given shape */ template -MSHADOW_XINLINE Shape unravel(const int idx, const Shape& shape) { +MSHADOW_XINLINE Shape unravel(const index_t idx, const Shape& shape) { Shape ret; #pragma unroll - for (int i = ndim-1, j = idx; i >=0; --i) { - int tmp = j / shape[i]; + for (index_t i = ndim-1, j = idx; i >=0; --i) { + auto tmp = j / shape[i]; ret[i] = j - tmp*shape[i]; j = tmp; } @@ -315,8 +315,8 @@ MSHADOW_XINLINE Shape unravel(const int idx, const Shape& shape) { /* Compute dot product of two vector */ template -MSHADOW_XINLINE int dot(const Shape& coord, const Shape& stride) { - int ret = 0; +MSHADOW_XINLINE index_t dot(const Shape& coord, const Shape& stride) { + index_t ret = 0; #pragma unroll for (int i = 0; i < ndim; ++i) { ret += coord[i] * stride[i]; @@ -327,12 +327,12 @@ MSHADOW_XINLINE int dot(const Shape& coord, const Shape& stride) { /* Combining unravel and dot */ template -MSHADOW_XINLINE int unravel_dot(const int idx, const Shape& shape, +MSHADOW_XINLINE index_t unravel_dot(const index_t idx, const Shape& shape, const Shape& stride) { - int ret = 0; + index_t ret = 0; #pragma unroll - for (int i = ndim-1, j = idx; i >=0; --i) { - int tmp = j / shape[i]; + for (index_t i = ndim-1, j = idx; i >=0; --i) { + auto tmp = j / shape[i]; ret += (j - tmp*shape[i])*stride[i]; j = tmp; } diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index 167fa34b083f..352949c86452 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -53,14 +53,14 @@ MSHADOW_XINLINE Shape calc_stride(const Shape& shape) { } template -MSHADOW_XINLINE void unravel_dot(const int idx, const Shape& shape, - const Shape& stridej, const Shape& stridek, int* j, int* k) { +MSHADOW_XINLINE void unravel_dot(const index_t idx, const Shape& shape, + const Shape& stridej, const Shape& stridek, index_t* j, index_t* k) { *j = 0; *k = 0; #pragma unroll - for (int i = ndim-1, idx_t = idx; i >=0; --i) { - const int tmp = idx_t / shape[i]; - const int coord = idx_t - tmp*shape[i]; + for (index_t i = ndim-1, idx_t = idx; i >=0; --i) { + const auto tmp = idx_t / shape[i]; + const auto coord = idx_t - tmp*shape[i]; *j += coord*stridej[i]; *k += coord*stridek[i]; idx_t = tmp; @@ -68,11 +68,11 @@ MSHADOW_XINLINE void unravel_dot(const int idx, const Shape& shape, } template -MSHADOW_XINLINE Shape unravel(const int idx, const Shape& shape) { 
+MSHADOW_XINLINE Shape unravel(const index_t idx, const Shape& shape) { Shape ret; #pragma unroll - for (int i = ndim-1, j = idx; i >=0; --i) { - int tmp = j / shape[i]; + for (index_t i = ndim-1, j = idx; i >=0; --i) { + auto tmp = j / shape[i]; ret[i] = j - tmp*shape[i]; j = tmp; } @@ -80,10 +80,10 @@ MSHADOW_XINLINE Shape unravel(const int idx, const Shape& shape) { } template -MSHADOW_XINLINE int ravel(const Shape& coord, const Shape& shape) { - int ret = 0; +MSHADOW_XINLINE index_t ravel(const Shape& coord, const Shape& shape) { + index_t ret = 0; #pragma unroll - for (int i = 0; i < ndim; ++i) { + for (index_t i = 0; i < ndim; ++i) { ret = ret * shape[i] + (shape[i] > 1) * coord[i]; } return ret; @@ -111,12 +111,12 @@ MSHADOW_XINLINE int diff(const Shape& small, const Shape& big, Shape } template -MSHADOW_XINLINE int unravel_dot(const int idx, const Shape& shape, +MSHADOW_XINLINE index_t unravel_dot(const index_t idx, const Shape& shape, const Shape& stride) { - int ret = 0; + index_t ret = 0; #pragma unroll - for (int i = ndim-1, j = idx; i >=0; --i) { - int tmp = j / shape[i]; + for (index_t i = ndim-1, j = idx; i >=0; --i) { + auto tmp = j / shape[i]; ret += (j - tmp*shape[i])*stride[i]; j = tmp; } @@ -124,8 +124,8 @@ MSHADOW_XINLINE int unravel_dot(const int idx, const Shape& shape, } template -MSHADOW_XINLINE int dot(const Shape& coord, const Shape& stride) { - int ret = 0; +MSHADOW_XINLINE index_t dot(const Shape& coord, const Shape& stride) { + index_t ret = 0; #pragma unroll for (int i = 0; i < ndim; ++i) ret += coord[i] * stride[i]; @@ -142,27 +142,27 @@ MSHADOW_XINLINE void assign(DType* dst, const bool addto, const DType src) { } template -MSHADOW_XINLINE void binary_broadcast_assign(const int idx, const bool addto, +MSHADOW_XINLINE void binary_broadcast_assign(const index_t idx, const bool addto, const DType* __restrict lhs, const DType* __restrict rhs, DType* out, const Shape& lshape, const Shape& rshape, const Shape& oshape) { const Shape coord = unravel(idx, oshape); - const int j = ravel(coord, lshape); - const int k = ravel(coord, rshape); + const index_t j = ravel(coord, lshape); + const index_t k = ravel(coord, rshape); assign(&out[idx], addto, OP::Map(lhs[j], rhs[k])); } template -MSHADOW_XINLINE void seq_reduce_assign(const int idx, const int M, const bool addto, +MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const bool addto, const DType* __restrict big, DType *small, const Shape& bshape, const Shape& sshape, const Shape& rshape, const Shape& rstride) { Shape coord = unravel(idx, sshape); - int j = ravel(coord, bshape); + index_t j = ravel(coord, bshape); DType val, residual; Reducer::SetInitValue(val, residual); - for (int k = 0; k < M; ++k) { + for (index_t k = 0; k < M; ++k) { coord = unravel(k, rshape); Reducer::Reduce(val, OP::Map(big[j + dot(coord, rstride)]), residual); } @@ -176,10 +176,10 @@ MSHADOW_XINLINE void seq_reduce_assign(const int idx, const int M, const bool ad #else template -void binary_broadcast_compute(const int N, const bool addto, const DType *lhs, +void binary_broadcast_compute(const size_t N, const bool addto, const DType *lhs, const DType *rhs, DType *out, const Shape lshape, const Shape rshape, const Shape oshape) { - for (int idx = 0; idx < N; ++idx) { + for (index_t idx = 0; idx < N; ++idx) { binary_broadcast_assign(idx, addto, lhs, rhs, out, lshape, rshape, oshape); } } @@ -188,26 +188,26 @@ template void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, const TBlob& lhs, const TBlob& rhs, 
const TBlob& out) { if (req == kNullOp) return; - int N = out.shape_.Size(); + size_t N = out.shape_.Size(); binary_broadcast_compute(N, req == kAddTo, lhs.dptr(), rhs.dptr(), out.dptr(), lhs.shape_.get(), rhs.shape_.get(), out.shape_.get()); } template -void seq_reduce_compute(const int N, const int M, const bool addto, +void seq_reduce_compute(const size_t N, const size_t M, const bool addto, const DType *big, DType *small, const Shape bshape, const Shape sshape, const Shape rshape, const Shape rstride) { #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (int idx = 0; idx < N; ++idx) { + for (size_t idx = 0; idx < N; ++idx) { seq_reduce_assign(idx, M, addto, big, small, bshape, sshape, rshape, rstride); } } template -void seq_reduce_compute_extra_mem(const int N, const int M, const bool addto, +void seq_reduce_compute_extra_mem(const size_t N, const size_t M, const bool addto, const DType* big, DType* small, const Shape bshape, const Shape sshape, @@ -215,12 +215,12 @@ void seq_reduce_compute_extra_mem(const int N, const int M, const bool addto, const Shape rstride, const index_t* ws_dptr) { #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (int idx = 0; idx < N; ++idx) { + for (index_t idx = 0; idx < N; ++idx) { Shape coord = unravel(idx, sshape); - int j = ravel(coord, bshape); + index_t j = ravel(coord, bshape); DType val, residual; Reducer::SetInitValue(val, residual); - for (int k = 0; k < M; ++k) { + for (index_t k = 0; k < M; ++k) { Reducer::Reduce(val, OP::Map(big[j + ws_dptr[k]]), residual); } assign(&small[idx], addto, val); @@ -233,7 +233,7 @@ void Reduce(Stream* s, const TBlob& small, const OpReqType req, if (req == kNullOp) return; Shape rshape, rstride; diff(small.shape_.get(), big.shape_.get(), &rshape, &rstride); - int N = small.shape_.Size(), M = rshape.Size(); + size_t N = small.shape_.Size(), M = rshape.Size(); seq_reduce_compute( N, M, req == kAddTo, big.dptr(), small.dptr(), big.shape_.get(), small.shape_.get(), rshape, rstride); @@ -247,9 +247,9 @@ void ReduceWithExtraMem(Stream* s, const TBlob& small, const OpReqType req, Shape rshape, rstride; diff(small.shape_.get(), big.shape_.get(), &rshape, &rstride); index_t* ws_dptr = reinterpret_cast(workspace.dptr_); - int N = small.shape_.Size(), M = rshape.Size(); + size_t N = small.shape_.Size(), M = rshape.Size(); #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (int k = 0; k < M; k++) { + for (index_t k = 0; k < M; k++) { Shape coord = unravel(k, rshape); ws_dptr[k] = dot(coord, rstride); } @@ -272,7 +272,7 @@ size_t ReduceWorkspaceSize(Stream *s, const TShape& small, const OpReqType } template -MSHADOW_XINLINE void seq_reduce_assign(const int idx, const int M, const bool addto, +MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const bool addto, const DType* __restrict big, const DType* __restrict lhs, const DType* __restrict rhs, DType *small, const Shape& big_shape, const Shape& lhs_shape0, @@ -282,20 +282,20 @@ MSHADOW_XINLINE void seq_reduce_assign(const int idx, const int M, const bool ad const Shape& rstride, const Shape& lhs_stride, const Shape& rhs_stride) { Shape coord = unravel(idx, small_shape); - const int idx_big0 = ravel(coord, big_shape); - const int idx_lhs0 = ravel(coord, lhs_shape0); - const int idx_rhs0 = ravel(coord, rhs_shape0); + const index_t idx_big0 = ravel(coord, big_shape); + const index_t idx_lhs0 = ravel(coord, 
lhs_shape0); + const index_t idx_rhs0 = ravel(coord, rhs_shape0); DType val, residual; Reducer::SetInitValue(val, residual); - for (int k = 0; k < M; ++k) { + for (index_t k = 0; k < M; ++k) { Shape coord_big = unravel(k, rshape); - int idx_big = idx_big0 + dot(coord_big, rstride); + index_t idx_big = idx_big0 + dot(coord_big, rstride); Shape coord_lhs = unravel(k, lhs_shape); - int idx_lhs = idx_lhs0 + dot(coord_lhs, lhs_stride); + index_t idx_lhs = idx_lhs0 + dot(coord_lhs, lhs_stride); Shape coord_rhs = unravel(k, rhs_shape); - int idx_rhs = idx_rhs0 + dot(coord_rhs, rhs_stride); + index_t idx_rhs = idx_rhs0 + dot(coord_rhs, rhs_stride); Reducer::Reduce(val, OP1::Map(big[idx_big], OP2::Map(lhs[idx_lhs], rhs[idx_rhs])), residual); } @@ -304,7 +304,7 @@ MSHADOW_XINLINE void seq_reduce_assign(const int idx, const int M, const bool ad } template -void seq_reduce_compute(const int N, const int M, const bool addto, +void seq_reduce_compute(const size_t N, const size_t M, const bool addto, const DType *big, const DType *lhs, const DType *rhs, DType *small, const Shape big_shape, const Shape small_shape, const Shape rshape, const Shape rstride, @@ -312,7 +312,7 @@ void seq_reduce_compute(const int N, const int M, const bool addto, const Shape rhs_shape, const Shape rhs_stride, const Shape& lhs_shape0, const Shape& rhs_shape0) { #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (int idx = 0; idx < N; ++idx) { + for (index_t idx = 0; idx < N; ++idx) { seq_reduce_assign(idx, M, addto, big, lhs, rhs, small, big_shape, lhs_shape0, rhs_shape0, small_shape, rshape, lhs_shape, rhs_shape, rstride, lhs_stride, rhs_stride); @@ -326,8 +326,8 @@ void Reduce(Stream *s, const TBlob& small, const OpReqType req, if (req == kNullOp) return; Shape rshape, rstride; diff(small.shape_.get(), big.shape_.get(), &rshape, &rstride); - int N = small.shape_.Size(); - int M = rshape.Size(); + size_t N = small.shape_.Size(); + size_t M = rshape.Size(); Shape lhs_shape, lhs_stride; diff(small.shape_.get(), lhs.shape_.get(), &lhs_shape, &lhs_stride); From e48b2744ae75fc09b1dcf31804547842063d3e34 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 7 Nov 2018 11:43:33 -0800 Subject: [PATCH 03/43] Fix large array issue in slice operation --- src/operator/tensor/broadcast_reduce-inl.h | 14 +- src/operator/tensor/matrix_op-inl.h | 202 ++++++++++----------- 2 files changed, 108 insertions(+), 108 deletions(-) diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index 352949c86452..2db1b4441124 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -162,7 +162,7 @@ MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const index_t j = ravel(coord, bshape); DType val, residual; Reducer::SetInitValue(val, residual); - for (index_t k = 0; k < M; ++k) { + for (size_t k = 0; k < M; ++k) { coord = unravel(k, rshape); Reducer::Reduce(val, OP::Map(big[j + dot(coord, rstride)]), residual); } @@ -179,7 +179,7 @@ template void binary_broadcast_compute(const size_t N, const bool addto, const DType *lhs, const DType *rhs, DType *out, const Shape lshape, const Shape rshape, const Shape oshape) { - for (index_t idx = 0; idx < N; ++idx) { + for (size_t idx = 0; idx < N; ++idx) { binary_broadcast_assign(idx, addto, lhs, rhs, out, lshape, rshape, oshape); } } @@ -215,12 +215,12 @@ void seq_reduce_compute_extra_mem(const size_t N, const size_t M, const bool add const Shape rstride, 
const index_t* ws_dptr) { #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (index_t idx = 0; idx < N; ++idx) { + for (size_t idx = 0; idx < N; ++idx) { Shape coord = unravel(idx, sshape); index_t j = ravel(coord, bshape); DType val, residual; Reducer::SetInitValue(val, residual); - for (index_t k = 0; k < M; ++k) { + for (size_t k = 0; k < M; ++k) { Reducer::Reduce(val, OP::Map(big[j + ws_dptr[k]]), residual); } assign(&small[idx], addto, val); @@ -249,7 +249,7 @@ void ReduceWithExtraMem(Stream* s, const TBlob& small, const OpReqType req, index_t* ws_dptr = reinterpret_cast(workspace.dptr_); size_t N = small.shape_.Size(), M = rshape.Size(); #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (index_t k = 0; k < M; k++) { + for (size_t k = 0; k < M; k++) { Shape coord = unravel(k, rshape); ws_dptr[k] = dot(coord, rstride); } @@ -287,7 +287,7 @@ MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const const index_t idx_rhs0 = ravel(coord, rhs_shape0); DType val, residual; Reducer::SetInitValue(val, residual); - for (index_t k = 0; k < M; ++k) { + for (size_t k = 0; k < M; ++k) { Shape coord_big = unravel(k, rshape); index_t idx_big = idx_big0 + dot(coord_big, rstride); @@ -312,7 +312,7 @@ void seq_reduce_compute(const size_t N, const size_t M, const bool addto, const Shape rhs_shape, const Shape rhs_stride, const Shape& lhs_shape0, const Shape& rhs_shape0) { #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (index_t idx = 0; idx < N; ++idx) { + for (size_t idx = 0; idx < N; ++idx) { seq_reduce_assign(idx, M, addto, big, lhs, rhs, small, big_shape, lhs_shape0, rhs_shape0, small_shape, rshape, lhs_shape, rhs_shape, rstride, lhs_stride, rhs_stride); diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index e375d640536a..2eeb8213be92 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -626,9 +626,9 @@ inline void GetIndexRange(const TShape& dshape, const nnvm::Tuple>& param_begin, const nnvm::Tuple>& param_end, const nnvm::Tuple>& param_step, - common::StaticArray* begin, - common::StaticArray* end, - common::StaticArray* step) { + common::StaticArray* begin, + common::StaticArray* end, + common::StaticArray* step) { CHECK_NE(dshape.ndim(), 0U); CHECK_LE(param_begin.ndim(), dshape.ndim()) << "Slicing axis exceeds data dimensions"; @@ -724,7 +724,7 @@ inline bool SliceOpShape(const nnvm::NodeAttrs& attrs, TShape oshape = dshape; MXNET_NDIM_SWITCH(dshape.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(dshape, param.begin, param.end, param.step, &begin, &end, &step); for (index_t i = 0; i < param.begin.ndim(); ++i) { const int b = begin[i], e = end[i], s = step[i]; @@ -743,19 +743,19 @@ template struct slice_forward { // i is the i-th row after flattening out into 2D tensor template - MSHADOW_XINLINE static void Map(int i, DType* out, const DType* data, + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* data, const mshadow::Shape dshape, const mshadow::Shape oshape, - const common::StaticArray begin, - const common::StaticArray step) { - const int data_last_dim_size = dshape[ndim-1]; - const int out_last_dim_size = oshape[ndim-1]; - const int step_last_dim = step[ndim-1]; - const int begin_last_dim = begin[ndim-1]; - const int j = i % out_last_dim_size; - int irow = 0; // row id 
of flattend 2D data - int stride = 1; - int idx = i / out_last_dim_size; + const common::StaticArray begin, + const common::StaticArray step) { + const index_t data_last_dim_size = dshape[ndim-1]; + const index_t out_last_dim_size = oshape[ndim-1]; + const index_t step_last_dim = step[ndim-1]; + const index_t begin_last_dim = begin[ndim-1]; + const index_t j = i % out_last_dim_size; + index_t irow = 0; // row id of flattend 2D data + index_t stride = 1; + index_t idx = i / out_last_dim_size; #pragma unroll for (int k = ndim - 2; k >= 0; --k) { irow += stride * ((idx % oshape[k]) * step[k] + begin[k]); @@ -771,20 +771,20 @@ template struct slice_forward { // i is the i-th row after flattening out into 2D tensor template - MSHADOW_XINLINE static void Map(int i, DType* out, const DType* data, + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* data, const mshadow::Shape dshape, const mshadow::Shape oshape, - const common::StaticArray begin, - const common::StaticArray step) { - const int data_last_dim_size = dshape[ndim-1]; - const int out_last_dim_size = oshape[ndim-1]; - const int step_last_dim = step[ndim-1]; - const int begin_last_dim = begin[ndim-1]; - int out_offset = i * out_last_dim_size; - for (int j = 0; j < out_last_dim_size; ++j) { - int irow = 0; // row id of flattend 2D data - int stride = 1; - int idx = i; + const common::StaticArray begin, + const common::StaticArray step) { + const index_t data_last_dim_size = dshape[ndim-1]; + const index_t out_last_dim_size = oshape[ndim-1]; + const index_t step_last_dim = step[ndim-1]; + const index_t begin_last_dim = begin[ndim-1]; + index_t out_offset = i * out_last_dim_size; + for (index_t j = 0; j < out_last_dim_size; ++j) { + index_t irow = 0; // row id of flattend 2D data + index_t stride = 1; + index_t idx = i; #pragma unroll for (int k = ndim - 2; k >= 0; --k) { irow += stride * ((idx % oshape[k]) * step[k] + begin[k]); @@ -813,11 +813,11 @@ void SliceOpForward(const nnvm::NodeAttrs& attrs, const TBlob& out = outputs[0]; const SliceParam& param = nnvm::get(attrs.parsed); MXNET_NDIM_SWITCH(data.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(data.shape_, param.begin, param.end, param.step, &begin, &end, &step); MSHADOW_TYPE_SWITCH(out.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - int num_threads = out.shape_.FlatTo2D()[0]; + size_t num_threads = out.shape_.FlatTo2D()[0]; if (std::is_same::value) { num_threads *= out.shape_.get()[ndim - 1]; } @@ -836,20 +836,20 @@ template struct slice_assign { // i is the i-th row after flattening out into 2D tensor template - MSHADOW_XINLINE static void Map(int i, DType* out, const DType* val, + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* val, const mshadow::Shape oshape, const mshadow::Shape vshape, - const common::StaticArray begin, - const common::StaticArray step) { - const int data_last_dim_size = oshape[ndim-1]; - const int out_last_dim_size = vshape[ndim-1]; - const int step_last_dim = step[ndim-1]; - const int begin_last_dim = begin[ndim-1]; - int offset = i * out_last_dim_size; - for (int j = 0; j < out_last_dim_size; ++j) { - int irow = 0; // row id of flattend 2D out - int stride = 1; - int idx = i; + const common::StaticArray begin, + const common::StaticArray step) { + const index_t data_last_dim_size = oshape[ndim-1]; + const index_t out_last_dim_size = vshape[ndim-1]; + const index_t step_last_dim = step[ndim-1]; + const index_t begin_last_dim = begin[ndim-1]; + 
index_t offset = i * out_last_dim_size; + for (index_t j = 0; j < out_last_dim_size; ++j) { + index_t irow = 0; // row id of flattend 2D out + index_t stride = 1; + index_t idx = i; #pragma unroll for (int k = ndim - 2; k >= 0; --k) { irow += stride * ((idx % vshape[k]) * step[k] + begin[k]); @@ -866,19 +866,19 @@ template struct slice_assign { // i is the i-th row after flattening out into 2D tensor template - MSHADOW_XINLINE static void Map(int i, DType* out, const DType* val, + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* val, const mshadow::Shape oshape, const mshadow::Shape vshape, - const common::StaticArray begin, - const common::StaticArray step) { - const int data_last_dim_size = oshape[ndim-1]; - const int out_last_dim_size = vshape[ndim-1]; - const int step_last_dim = step[ndim-1]; - const int begin_last_dim = begin[ndim-1]; - const int j = i % out_last_dim_size; - int irow = 0; // row id of flattend 2D out - int stride = 1; - int idx = i / out_last_dim_size; + const common::StaticArray begin, + const common::StaticArray step) { + const index_t data_last_dim_size = oshape[ndim-1]; + const index_t out_last_dim_size = vshape[ndim-1]; + const index_t step_last_dim = step[ndim-1]; + const index_t begin_last_dim = begin[ndim-1]; + const index_t j = i % out_last_dim_size; + index_t irow = 0; // row id of flattend 2D out + index_t stride = 1; + index_t idx = i / out_last_dim_size; #pragma unroll for (int k = ndim - 2; k >= 0; --k) { irow += stride * ((idx % vshape[k]) * step[k] + begin[k]); @@ -911,7 +911,7 @@ void SliceOpBackward(const nnvm::NodeAttrs& attrs, LOG(FATAL) << "_slice_backward does not support kWriteInplace"; } MXNET_NDIM_SWITCH(ograd.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(igrad.shape_, param.begin, param.end, param.step, &begin, &end, &step); MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { @@ -937,7 +937,7 @@ inline bool SliceAssignOpShape(const nnvm::NodeAttrs& attrs, TShape vshape = dshape; // vshape is the value shape on the right hand side const SliceParam& param = nnvm::get(attrs.parsed); MXNET_NDIM_SWITCH(dshape.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(dshape, param.begin, param.end, param.step, &begin, &end, &step); for (index_t i = 0; i < param.begin.ndim(); ++i) { const int b = begin[i], e = end[i], s = step[i]; @@ -975,7 +975,7 @@ void SliceAssignOpForward(const nnvm::NodeAttrs& attrs, const SliceParam& param = nnvm::get(attrs.parsed); MXNET_NDIM_SWITCH(data.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(data.shape_, param.begin, param.end, param.step, &begin, &end, &step); MSHADOW_TYPE_SWITCH(out.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { @@ -1024,20 +1024,20 @@ template struct slice_assign_scalar { // i is the i-th row after flattening out into 2D tensor template - MSHADOW_XINLINE static void Map(int i, DType* out, const DType val, + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType val, const OpReqType req, const mshadow::Shape oshape, const mshadow::Shape vshape, - const common::StaticArray begin, - const common::StaticArray step) { - const int data_last_dim_size = oshape[ndim-1]; - const int out_last_dim_size = vshape[ndim-1]; - const int step_last_dim = step[ndim-1]; - const int begin_last_dim = begin[ndim-1]; - for (int j = 0; j < out_last_dim_size; ++j) { - int 
irow = 0; // row id of flattend 2D out - int stride = 1; - int idx = i; + const common::StaticArray begin, + const common::StaticArray step) { + const index_t data_last_dim_size = oshape[ndim-1]; + const index_t out_last_dim_size = vshape[ndim-1]; + const index_t step_last_dim = step[ndim-1]; + const index_t begin_last_dim = begin[ndim-1]; + for (index_t j = 0; j < out_last_dim_size; ++j) { + index_t irow = 0; // row id of flattend 2D out + index_t stride = 1; + index_t idx = i; #pragma unroll for (int k = ndim - 2; k >= 0; --k) { irow += stride * ((idx % vshape[k]) * step[k] + begin[k]); @@ -1076,7 +1076,7 @@ void SliceAssignScalarOpForward(const nnvm::NodeAttrs& attrs, TShape vshape = data.shape_; const SliceAssignScalarParam& param = nnvm::get(attrs.parsed); MXNET_NDIM_SWITCH(data.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(data.shape_, param.begin, param.end, param.step, &begin, &end, &step); for (index_t i = 0; i < param.begin.ndim(); ++i) { const int b = begin[i], e = end[i], s = step[i]; @@ -1354,7 +1354,7 @@ void SliceLikeForward(const nnvm::NodeAttrs& attrs, SliceLikeInferRanges(ishape, from_shape, param.axes, ¶m_begin, ¶m_end, ¶m_step); MXNET_NDIM_SWITCH(data.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(data.shape_, param_begin, param_end, param_step, &begin, &end, &step); MSHADOW_TYPE_SWITCH(out.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { @@ -1400,7 +1400,7 @@ void SliceLikeBackward(const nnvm::NodeAttrs& attrs, SliceLikeInferRanges(ishape, from_shape, param.axes, ¶m_begin, ¶m_end, ¶m_step); MXNET_NDIM_SWITCH(ograd.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(ograd.shape_, param_begin, param_end, param_step, &begin, &end, &step); MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { @@ -1429,7 +1429,7 @@ struct ClipParam : public dmlc::Parameter { struct clip { template - MSHADOW_XINLINE static void Map(int i, DType* out, const DType* datas, + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* datas, DType a_min, DType a_max) { DType data = datas[i]; if (data > a_max) { @@ -1445,7 +1445,7 @@ struct clip { struct clip_grad { template - MSHADOW_XINLINE static void Map(int i, DType* out, const DType* grad, const DType* datas, + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* grad, const DType* datas, DType a_min, DType a_max) { DType data = datas[i]; if (data > a_max) { @@ -1934,7 +1934,7 @@ struct reverse { } #ifdef __CUDACC__ template - __device__ static void Map(int index, index_t nreversedim, const DType *src, DType *dst, + __device__ static void Map(index_t index, index_t nreversedim, const DType *src, DType *dst, const index_t * stride_, const index_t * trailing_) { __shared__ index_t stride_share[REVERSE_MAX_DIM]; @@ -1949,7 +1949,7 @@ struct reverse { } #else template - MSHADOW_XINLINE static void Map(int index, index_t nreversedim, const DType *src, DType *dst, + MSHADOW_XINLINE static void Map(index_t index, index_t nreversedim, const DType *src, DType *dst, const index_t * stride_, const index_t * trailing_) { index_t new_idx = ReverseIndex(index, nreversedim, stride_, trailing_); @@ -2141,10 +2141,10 @@ struct SqueezeParam : public dmlc::Parameter { // move all the zeros to the last of the shape array // and keep the relative order of the non-zero values. 
// Returns the new shape size after moving all zeros to the end. -inline uint32_t SqueezeShapeHelper(TShape* shape) { +inline size_t SqueezeShapeHelper(TShape* shape) { CHECK(shape != nullptr); - uint32_t count = 0; - for (uint32_t i = 0; i < shape->ndim(); ++i) { + size_t count = 0; + for (size_t i = 0; i < shape->ndim(); ++i) { if ((*shape)[i] == 0) { ++count; } else { @@ -2167,7 +2167,7 @@ inline bool SqueezeShape(const nnvm::NodeAttrs& attrs, if (param.axis.has_value()) { // preprocess axis TShape axes = param.axis.value(); - for (uint32_t i = 0; i < axes.ndim(); ++i) { + for (size_t i = 0; i < axes.ndim(); ++i) { if (axes[i] < 0) { axes[i] += dndim; CHECK_GE(axes[i], 0) @@ -2182,11 +2182,11 @@ inline bool SqueezeShape(const nnvm::NodeAttrs& attrs, oshape[axes[i]] = 0; } } else { - for (uint32_t i = 0; i < oshape.ndim(); ++i) { + for (size_t i = 0; i < oshape.ndim(); ++i) { if (oshape[i] == 1) oshape[i] = 0; } } - uint32_t oshape_size = SqueezeShapeHelper(&oshape); + size_t oshape_size = SqueezeShapeHelper(&oshape); if (oshape_size == 0) { // corner case when dshape is (1, 1, 1, 1) oshape[0] = 1; oshape_size = 1; @@ -2229,7 +2229,7 @@ inline bool DepthToSpaceOpShape(const nnvm::NodeAttrs& attrs, expected_out[0] = in_shape[0]; expected_out[1] = in_shape[1] / (block * block); - uint32_t i = 2; + size_t i = 2; while (i < expected_out.ndim()) { expected_out[i] = in_shape[i] * block; ++i; @@ -2259,9 +2259,9 @@ inline bool DepthToSpaceOpType(const nnvm::NodeAttrs& attrs, * \param inp_index index within input tensor from where value is retrieved * \param offset_arr array containing the linear offset of input tensor */ -MSHADOW_XINLINE void update_index(int index_position, int dim_size, int *idx, - int *inp_index, const int* offset_arr) { - int next_idx_val = *idx / dim_size; +MSHADOW_XINLINE void update_index(index_t index_position, index_t dim_size, index_t *idx, + index_t *inp_index, const index_t* offset_arr) { + index_t next_idx_val = *idx / dim_size; *inp_index += (*idx - next_idx_val * dim_size) * offset_arr[index_position]; *idx = next_idx_val; } @@ -2280,9 +2280,9 @@ MSHADOW_XINLINE void update_index(int index_position, int dim_size, int *idx, template struct depth_to_space_forward { template - MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data, - const int block, const int* size, const int* offset_arr) { - int inp_index = 0, idx = i, dim_size; + MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* in_data, + const int block, const index_t* size, const index_t* offset_arr) { + index_t inp_index = 0, idx = i, dim_size; dim_size = block; update_index(2, dim_size, &idx, &inp_index, offset_arr); dim_size = size[3]; @@ -2315,9 +2315,9 @@ struct depth_to_space_forward { template struct compute_offset_for_depth_to_space { template - MSHADOW_XINLINE static void Map(int i, DType* offset_arr, DType* size, const int block, - const int32_t size0, const int32_t size1, const int32_t size2, - const int32_t size3) { + MSHADOW_XINLINE static void Map(index_t i, DType* offset_arr, DType* size, const int block, + const index_t size0, const index_t size1, const index_t size2, + const index_t size3) { size[0] = size0; size[1] = size1; size[2] = size2; @@ -2349,10 +2349,10 @@ void DepthToSpaceOpForward(const nnvm::NodeAttrs& attrs, int block = param.block_size; mshadow::Tensor workspace = - ctx.requested[0].get_space_typed(mshadow::Shape1(sizeof(int32_t) * 10), s); + ctx.requested[0].get_space_typed(mshadow::Shape1(sizeof(index_t) * 10), s); char* 
workspace_curr_ptr = workspace.dptr_; - int32_t* offset_arr = reinterpret_cast(workspace_curr_ptr); - int32_t* size = reinterpret_cast(workspace_curr_ptr + sizeof(int32_t) * 6); + index_t* offset_arr = reinterpret_cast(workspace_curr_ptr); + index_t* size = reinterpret_cast(workspace_curr_ptr + sizeof(index_t) * 6); MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { @@ -2431,9 +2431,9 @@ inline bool SpaceToDepthOpType(const nnvm::NodeAttrs& attrs, template struct space_to_depth_forward { template - MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data, const int block, - const int* size, const int* offset_arr) { - int inp_index = 0, idx = i, dim_size; + MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* in_data, const int block, + const index_t* size, const index_t* offset_arr) { + index_t inp_index = 0, idx = i, dim_size; dim_size = size[3] / block; update_index(4, dim_size, &idx, &inp_index, offset_arr); dim_size = size[2] / block; @@ -2466,9 +2466,9 @@ struct space_to_depth_forward { template struct compute_offset_for_space_to_depth { template - MSHADOW_XINLINE static void Map(int i, DType* offset_arr, DType* size, const int block, - const int32_t size0, const int32_t size1, - const int32_t size2, const int32_t size3) { + MSHADOW_XINLINE static void Map(index_t i, DType* offset_arr, DType* size, const int block, + const index_t size0, const index_t size1, + const index_t size2, const index_t size3) { size[0] = size0; size[1] = size1; size[2] = size2; @@ -2500,10 +2500,10 @@ void SpaceToDepthOpForward(const nnvm::NodeAttrs& attrs, int block = param.block_size; mshadow::Tensor workspace = - ctx.requested[0].get_space_typed(mshadow::Shape1(sizeof(int32_t) * 10), s); + ctx.requested[0].get_space_typed(mshadow::Shape1(sizeof(index_t) * 10), s); char* workspace_curr_ptr = workspace.dptr_; - int32_t* offset_arr = reinterpret_cast(workspace_curr_ptr); - int32_t* size = reinterpret_cast(workspace_curr_ptr + sizeof(int32_t) * 6); + index_t* offset_arr = reinterpret_cast(workspace_curr_ptr); + index_t* size = reinterpret_cast(workspace_curr_ptr + sizeof(index_t) * 6); MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { From b183c3fce0cd1018d152f164cb924f36d267cba6 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 8 Nov 2018 00:50:29 -0800 Subject: [PATCH 04/43] fix bug in shape --- include/mxnet/c_api.h | 26 +++++++++++++------------- include/mxnet/ndarray.h | 2 +- python/mxnet/base.py | 1 + python/mxnet/ndarray/ndarray.py | 6 +++--- src/c_api/c_api.cc | 14 +++++++------- src/c_api/c_api_common.h | 10 +++++----- src/c_api/c_api_symbolic.cc | 16 ++++++++-------- src/ndarray/ndarray.cc | 2 +- src/operator/elemwise_op_common.h | 8 ++++---- 9 files changed, 43 insertions(+), 42 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index e9f1e2d6cccc..d4fb7d0f8c9f 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -487,7 +487,7 @@ MXNET_DLL int MXNDArrayCreateNone(NDArrayHandle *out); * \param out the returning handle * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayCreate(const mx_uint *shape, +MXNET_DLL int MXNDArrayCreate(const dim_t *shape, mx_uint ndim, int dev_type, int dev_id, @@ -506,7 +506,7 @@ MXNET_DLL int MXNDArrayCreate(const mx_uint *shape, * \param out the returning handle * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXNDArrayCreateEx(const mx_uint *shape, +MXNET_DLL int 
MXNDArrayCreateEx(const dim_t *shape, mx_uint ndim, int dev_type, int dev_id, @@ -533,7 +533,7 @@ MXNET_DLL int MXNDArrayCreateEx(const mx_uint *shape, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXNDArrayCreateSparseEx(int storage_type, - const mx_uint *shape, + const dim_t *shape, mx_uint ndim, int dev_type, int dev_id, @@ -748,8 +748,8 @@ MXNET_DLL int MXNDArrayReshape64(NDArrayHandle handle, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXNDArrayGetShape(NDArrayHandle handle, - mx_uint *out_dim, - const mx_uint **out_pdata); + dim_t *out_dim, + const dim_t **out_pdata); /*! * \brief get the content of the data in NDArray * \param handle the handle to the ndarray @@ -1466,16 +1466,16 @@ MXNET_DLL int MXSymbolInferShape(SymbolHandle sym, mx_uint num_args, const char** keys, const mx_uint *arg_ind_ptr, - const mx_uint *arg_shape_data, + const dim_t *arg_shape_data, mx_uint *in_shape_size, const mx_uint **in_shape_ndim, - const mx_uint ***in_shape_data, + const dim_t ***in_shape_data, mx_uint *out_shape_size, const mx_uint **out_shape_ndim, - const mx_uint ***out_shape_data, + const dim_t ***out_shape_data, mx_uint *aux_shape_size, const mx_uint **aux_shape_ndim, - const mx_uint ***aux_shape_data, + const dim_t ***aux_shape_data, int *complete); /*! * \brief partially infer shape of unknown input shapes given the known one. @@ -1505,16 +1505,16 @@ MXNET_DLL int MXSymbolInferShapePartial(SymbolHandle sym, mx_uint num_args, const char** keys, const mx_uint *arg_ind_ptr, - const mx_uint *arg_shape_data, + const dim_t *arg_shape_data, mx_uint *in_shape_size, const mx_uint **in_shape_ndim, - const mx_uint ***in_shape_data, + const dim_t ***in_shape_data, mx_uint *out_shape_size, const mx_uint **out_shape_ndim, - const mx_uint ***out_shape_data, + const dim_t ***out_shape_data, mx_uint *aux_shape_size, const mx_uint **aux_shape_ndim, - const mx_uint ***aux_shape_data, + const dim_t ***aux_shape_data, int *complete); /*! diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index e877d35dbb5b..f811000d216f 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -464,7 +464,7 @@ class NDArray { /*! * \brief Copy from src.data()/aux_data(i) to this->data()/aux_data(j) */ - void SyncCopyFromNDArray(const NDArray &src, int i = -1, int j = -1); + void SyncCopyFromNDArray(const NDArray &src, index_t i = -1, index_t j = -1); /*! * \brief Do a synchronize copy to a continugous CPU memory region. diff --git a/python/mxnet/base.py b/python/mxnet/base.py index feb4d70b6533..3e06f47a8ea3 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -215,6 +215,7 @@ def _load_lib(): # type definitions mx_uint = ctypes.c_uint mx_float = ctypes.c_float +mx_long = ctypes.c_longlong mx_float_p = ctypes.POINTER(mx_float) mx_real_t = np.float32 NDArrayHandle = ctypes.c_void_p diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index bf1140d2071b..e93dd13eb6b8 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -35,7 +35,7 @@ import numpy as np from ..base import _LIB, numeric_types, integer_types from ..base import c_str, c_array, c_array_buf, c_handle_array, mx_real_t -from ..base import mx_uint, NDArrayHandle, check_call, DLPackHandle +from ..base import mx_uint, mx_long, NDArrayHandle, check_call, DLPackHandle from ..base import ctypes2buffer from ..context import Context, current_context from . 
import _internal @@ -131,7 +131,7 @@ def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t): """ hdl = NDArrayHandle() check_call(_LIB.MXNDArrayCreateEx( - c_array_buf(mx_uint, native_array('I', shape)), + c_array_buf(mx_long, native_array('I', shape)), mx_uint(len(shape)), ctypes.c_int(ctx.device_typeid), ctypes.c_int(ctx.device_id), @@ -1834,7 +1834,7 @@ def shape(self): (2L, 3L, 4L) """ ndim = mx_uint() - pdata = ctypes.POINTER(mx_uint)() + pdata = ctypes.POINTER(mx_long)() check_call(_LIB.MXNDArrayGetShape( self.handle, ctypes.byref(ndim), ctypes.byref(pdata))) return tuple(pdata[:ndim.value]) # pylint: disable=invalid-slice-index diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 80bd60538ff5..1df397e89a84 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -151,7 +151,7 @@ int MXNDArrayCreateNone(NDArrayHandle *out) { API_END(); } -int MXNDArrayCreate(const mx_uint *shape, +int MXNDArrayCreate(const dim_t *shape, mx_uint ndim, int dev_type, int dev_id, @@ -165,7 +165,7 @@ int MXNDArrayCreate(const mx_uint *shape, API_END(); } -int MXNDArrayCreateEx(const mx_uint *shape, +int MXNDArrayCreateEx(const dim_t *shape, mx_uint ndim, int dev_type, int dev_id, @@ -182,7 +182,7 @@ int MXNDArrayCreateEx(const mx_uint *shape, } int MXNDArrayCreateSparseEx(int storage_type, - const mx_uint *shape, + const dim_t *shape, mx_uint ndim, int dev_type, int dev_id, @@ -266,7 +266,7 @@ int MXNDArraySyncCopyToCPU(NDArrayHandle handle, */ int MXNDArraySyncCopyFromNDArray(NDArrayHandle handle_dst, const NDArrayHandle handle_src, - const int i) { + const dim_t i) { API_BEGIN(); NDArray* dst = static_cast(handle_dst); NDArray* src = static_cast(handle_src); @@ -481,15 +481,15 @@ int MXNDArrayGetStorageType(NDArrayHandle handle, } int MXNDArrayGetShape(NDArrayHandle handle, - mx_uint *out_dim, - const mx_uint **out_pdata) { + dim_t *out_dim, + const dim_t **out_pdata) { MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); API_BEGIN(); NDArray *arr = static_cast(handle); if (!arr->is_none()) { const TShape &s = arr->shape(); *out_dim = s.ndim(); - std::vector& buffer = ret->arg_shape_buffer; + std::vector& buffer = ret->arg_shape_buffer; buffer.resize(s.ndim()); nnvm::ShapeTypeCast(s.begin(), s.end(), buffer.data()); *out_pdata = buffer.data(); diff --git a/src/c_api/c_api_common.h b/src/c_api/c_api_common.h index 079b587e9965..12e823cf3183 100644 --- a/src/c_api/c_api_common.h +++ b/src/c_api/c_api_common.h @@ -84,23 +84,23 @@ struct MXAPIThreadLocalEntry { /*! \brief result holder for returning shape dimensions */ std::vector arg_shape_ndim, out_shape_ndim, aux_shape_ndim; /*! \brief result holder for returning shape pointer */ - std::vector arg_shape_data, out_shape_data, aux_shape_data; + std::vector arg_shape_data, out_shape_data, aux_shape_data; /*! \brief uint32_t buffer for returning shape pointer */ - std::vector arg_shape_buffer, out_shape_buffer, aux_shape_buffer; + std::vector arg_shape_buffer, out_shape_buffer, aux_shape_buffer; /*! 
\brief bool buffer */ std::vector save_inputs, save_outputs; // helper function to setup return value of shape array inline static void SetupShapeArrayReturnWithBuffer( const std::vector &shapes, std::vector *ndim, - std::vector *data, - std::vector *buffer) { + std::vector *data, + std::vector *buffer) { ndim->resize(shapes.size()); data->resize(shapes.size()); size_t size = 0; for (const auto& s : shapes) size += s.ndim(); buffer->resize(size); - uint32_t *ptr = buffer->data(); + dim_t *ptr = buffer->data(); for (size_t i = 0; i < shapes.size(); ++i) { ndim->at(i) = shapes[i].ndim(); data->at(i) = ptr; diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index d4625de80110..b6924f95067b 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -505,16 +505,16 @@ int MXSymbolInferShape(SymbolHandle sym, mx_uint num_args, const char** keys, const mx_uint *arg_ind_ptr, - const mx_uint *arg_shape_data, + const dim_t *arg_shape_data, mx_uint *in_shape_size, const mx_uint **in_shape_ndim, - const mx_uint ***in_shape_data, + const dim_t ***in_shape_data, mx_uint *out_shape_size, const mx_uint **out_shape_ndim, - const mx_uint ***out_shape_data, + const dim_t ***out_shape_data, mx_uint *aux_shape_size, const mx_uint **aux_shape_ndim, - const mx_uint ***aux_shape_data, + const dim_t ***aux_shape_data, int *complete) { nnvm::Symbol *s = static_cast(sym); MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); @@ -572,16 +572,16 @@ int MXSymbolInferShapePartial(SymbolHandle sym, mx_uint num_args, const char** keys, const mx_uint *arg_ind_ptr, - const mx_uint *arg_shape_data, + const dim_t *arg_shape_data, mx_uint *in_shape_size, const mx_uint **in_shape_ndim, - const mx_uint ***in_shape_data, + const dim_t ***in_shape_data, mx_uint *out_shape_size, const mx_uint **out_shape_ndim, - const mx_uint ***out_shape_data, + const dim_t ***out_shape_data, mx_uint *aux_shape_size, const mx_uint **aux_shape_ndim, - const mx_uint ***aux_shape_data, + const dim_t ***aux_shape_data, int *complete) { int succ; *complete = 1; diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 5d8e39dea225..9820b809aa50 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -1849,7 +1849,7 @@ void NDArray::SyncCopyFromCPU(const void *data, size_t size) const { /*! * \brief Copy src.data()/aux_data(i) to dst->data()/aux_data(j). 
*/ -void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) { +void NDArray::SyncCopyFromNDArray(const NDArray& src, index_t i, index_t j) { if (i >= 0) { CHECK_NE(src.storage_type(), kDefaultStorage); } else { diff --git a/src/operator/elemwise_op_common.h b/src/operator/elemwise_op_common.h index cf44da699156..4b8663bba6ea 100644 --- a/src/operator/elemwise_op_common.h +++ b/src/operator/elemwise_op_common.h @@ -100,7 +100,7 @@ inline bool ElemwiseStorageAttr(const nnvm::NodeAttrs& attrs, * \tparam rsp whether row sparse stype is supported * \tparam rsp whether csr stype is supported */ -template +template inline bool ElemwiseStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask, DispatchMode* dispatch_mode, @@ -115,7 +115,7 @@ inline bool ElemwiseStorageType(const nnvm::NodeAttrs& attrs, template + index_t n_in = -1, index_t n_out = -1> inline bool ElemwiseAttr(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs, @@ -154,7 +154,7 @@ inline bool ElemwiseAttr(const nnvm::NodeAttrs& attrs, return true; } -template +template inline bool ElemwiseShape(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { @@ -168,7 +168,7 @@ inline bool ElemwiseShape(const nnvm::NodeAttrs& attrs, attrs, in_attrs, out_attrs, TShape()); } -template +template inline bool ElemwiseType(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { From fcebf5a5bb9c939c868b9fb65656fd43f6b2d65f Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 8 Nov 2018 10:51:09 -0800 Subject: [PATCH 05/43] fix getitem with large index --- include/mxnet/c_api.h | 2 +- python/mxnet/ndarray/ndarray.py | 4 ++-- src/c_api/c_api.cc | 2 +- src/operator/tensor/init_op.h | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index d4fb7d0f8c9f..50fad3d79409 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -705,7 +705,7 @@ MXNET_DLL int MXNDArraySlice(NDArrayHandle handle, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXNDArrayAt(NDArrayHandle handle, - mx_uint idx, + dim_t idx, NDArrayHandle *out); /*! 
diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index e93dd13eb6b8..c35d3f7e3932 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -902,7 +902,7 @@ def _slice(self, start, stop): start, stop, _ = _get_index_range(start, stop, self.shape[0]) check_call(_LIB.MXNDArraySlice( - self.handle, mx_uint(start), mx_uint(stop), ctypes.byref(handle))) + self.handle, mx_long(start), mx_long(stop), ctypes.byref(handle))) return NDArray(handle=handle, writable=self.writable) def _at(self, idx): @@ -936,7 +936,7 @@ def _at(self, idx): raise IndexError('index %d is out of bounds for axis 0 with size %d' % (idx-length, length)) check_call(_LIB.MXNDArrayAt( - self.handle, mx_uint(idx), ctypes.byref(handle))) + self.handle, mx_long(idx), ctypes.byref(handle))) return NDArray(handle=handle, writable=self.writable) def reshape(self, *shape, **kwargs): diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 1df397e89a84..7dc7d30bddad 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -406,7 +406,7 @@ int MXNDArraySlice(NDArrayHandle handle, } int MXNDArrayAt(NDArrayHandle handle, - mx_uint idx, + dim_t idx, NDArrayHandle *out) { NDArray *ptr = new NDArray(); API_BEGIN(); diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h index 1de623c47df1..e9e67cb1a4c5 100644 --- a/src/operator/tensor/init_op.h +++ b/src/operator/tensor/init_op.h @@ -471,8 +471,8 @@ void RangeCompute(const nnvm::NodeAttrs& attrs, MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { // Force unsigned params to take two's complement form on ARM to ensure consistency with x86 // results. Casting negative floats to unsigned types is undefined in the CPP standard. - auto step = std::is_signed() ? param.step : static_cast(param.step); - auto start = std::is_signed() ? param.start : static_cast(param.start); + auto step = std::is_signed() ? param.step : static_cast(param.step); + auto start = std::is_signed() ? param.start : static_cast(param.start); Kernel::Launch(s, outputs[0].Size(), static_cast(param.repeat), From 3c7557bfebf782d7b48440d55246440d267f1f9d Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 8 Nov 2018 14:45:36 -0800 Subject: [PATCH 06/43] fix bug in slice operator --- include/mxnet/c_api.h | 4 ++-- src/c_api/c_api.cc | 4 ++-- src/operator/tensor/matrix_op-inl.h | 17 ++++++++++------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 50fad3d79409..c93767731d67 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -693,8 +693,8 @@ MXNET_DLL int MXNDArrayFree(NDArrayHandle handle); * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXNDArraySlice(NDArrayHandle handle, - mx_uint slice_begin, - mx_uint slice_end, + dim_t slice_begin, + dim_t slice_end, NDArrayHandle *out); /*! 
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 7dc7d30bddad..d503bf4f19db 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -394,8 +394,8 @@ int MXNDArrayFree(NDArrayHandle handle) { } int MXNDArraySlice(NDArrayHandle handle, - mx_uint slice_begin, - mx_uint slice_end, + dim_t slice_begin, + dim_t slice_end, NDArrayHandle *out) { NDArray *ptr = new NDArray(); API_BEGIN(); diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index 2eeb8213be92..cb903f36e626 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -646,8 +646,8 @@ inline void GetIndexRange(const TShape& dshape, } for (index_t i = 0; i < param_begin.ndim(); ++i) { - int b = 0, e = dshape[i], s = 1; - const int len = dshape[i]; + index_t b = 0, e = dshape[i], s = 1; + const index_t len = dshape[i]; if (param_step.ndim() != 0U) { const auto& opt_step_val = param_step[i]; if (opt_step_val.has_value()) { @@ -1107,7 +1107,7 @@ struct SliceAxisParam : public dmlc::Parameter { }; inline void GetSliceAxisParams(const SliceAxisParam& param, const TShape& ishape, - int* axis, int* begin, int* end) { + int* axis, index_t* begin, index_t* end) { *axis = param.axis; if (*axis < 0) { *axis += static_cast(ishape.ndim()); @@ -1115,7 +1115,7 @@ inline void GetSliceAxisParams(const SliceAxisParam& param, const TShape& ishape CHECK(*axis < static_cast(ishape.ndim()) && *axis >= 0) << "Transformed axis must be smaller than the source ndim and larger than zero! Recieved axis=" << param.axis << ", src_ndim=" << ishape.ndim() << ", transformed axis=" << *axis; - int axis_size = static_cast(ishape[*axis]); + index_t axis_size = static_cast(ishape[*axis]); *begin = param.begin; *end = -1; if (*begin < 0) { @@ -1149,7 +1149,8 @@ inline bool SliceAxisShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); TShape& ishape = (*in_attrs)[0]; - int axis, begin, end; + int axis; + index_t begin, end; GetSliceAxisParams(param, ishape, &axis, &begin, &end); TShape shape(ishape.ndim()); for (index_t i = 0; i < ishape.ndim(); ++i) { @@ -1173,7 +1174,8 @@ void SliceAxis(const nnvm::NodeAttrs& attrs, using namespace mshadow::expr; const SliceAxisParam& param = nnvm::get(attrs.parsed); mshadow::Stream *s = ctx.get_stream(); - int axis, begin, end; + int axis; + index_t begin, end; GetSliceAxisParams(param, inputs[0].shape_, &axis, &begin, &end); int ndim = static_cast(outputs[0].ndim()); @@ -1207,7 +1209,8 @@ void SliceAxisGrad_(const nnvm::NodeAttrs& attrs, using namespace mshadow::op; using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); - int axis, begin, end; + int axis; + index_t begin, end; GetSliceAxisParams(param, outputs[0].shape_, &axis, &begin, &end); int ndim = static_cast(outputs[0].shape_.ndim()); From 904f09baf5039646049581553000b05b9317365c Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 8 Nov 2018 15:38:35 -0800 Subject: [PATCH 07/43] fix bug in random uniform op --- src/operator/random/sampler.h | 43 ++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/src/operator/random/sampler.h b/src/operator/random/sampler.h index 44f80ab56254..de84a58323c6 100644 --- a/src/operator/random/sampler.h +++ b/src/operator/random/sampler.h @@ -43,32 +43,33 @@ namespace op { template inline static void LaunchRNG(mshadow::Stream *s, common::random::RandGenerator *gen, - const int N, Args... args) { + const index_t N, Args... 
args) { // minimal check to avoid division by zero, below. // if `N` is zero the map operation is a no-op in any case. if (N <= 0) { return; } - const int nloop = (N + RandGenerator::kMinNumRandomPerThread - 1) / + const index_t nloop = (N + RandGenerator::kMinNumRandomPerThread - 1) / RandGenerator::kMinNumRandomPerThread; - const int nthread = std::min(nloop, RandGenerator::kNumRandomStates); - const int step = (N + nthread - 1) / nthread; + const index_t nthread = std::min(nloop, + static_cast(RandGenerator::kNumRandomStates)); + const index_t step = (N + nthread - 1) / nthread; Kernel::Launch(s, nthread, *gen, N, step, args...); } #define RNG_KERNEL_LOOP(xpu, GType, thread_id, gen, N, step, ...) \ - const int start = thread_id * step; \ - const int end = start + step; \ + const index_t start = thread_id * step; \ + const index_t end = start + step; \ typename RandGenerator::Impl genImpl(&gen, thread_id); \ - for (int i = start; i < end && i < N; ++i) { \ + for (index_t i = start; i < end && i < N; ++i) { \ {__VA_ARGS__} \ } template struct SampleUniformKernel { template - MSHADOW_XINLINE static void Map(int id, RandGenerator gen, - const int N, const int step, + MSHADOW_XINLINE static void Map(index_t id, RandGenerator gen, + const index_t N, const index_t step, index_t nParm, index_t nSample, const IType *lower, const IType *upper, OType *out) { RNG_KERNEL_LOOP(xpu, OType, id, gen, N, step, { @@ -95,8 +96,8 @@ struct UniformSampler { template struct SampleNormalKernel { template - MSHADOW_XINLINE static void Map(int id, RandGenerator gen, - const int N, const int step, + MSHADOW_XINLINE static void Map(index_t id, RandGenerator gen, + const index_t N, const index_t step, index_t nParm, index_t nSample, const IType *mean, const IType *std, OType *out) { RNG_KERNEL_LOOP(xpu, OType, id, gen, N, step, { @@ -122,8 +123,8 @@ struct NormalSampler { template struct SampleExponentialKernel { template - MSHADOW_XINLINE static void Map(int id, RandGenerator gen, - const int N, const int step, + MSHADOW_XINLINE static void Map(index_t id, RandGenerator gen, + const index_t N, const index_t step, index_t nParm, index_t nSample, const IType *lambda, OType *out) { RNG_KERNEL_LOOP(xpu, OType, id, gen, N, step, { @@ -170,8 +171,8 @@ MSHADOW_XINLINE OType SampleGamma(IType a, IType b, typename RandGenerator struct SampleGammaKernel { template - MSHADOW_XINLINE static void Map(int id, RandGenerator gen, - const int N, const int step, + MSHADOW_XINLINE static void Map(index_t id, RandGenerator gen, + const index_t N, const index_t step, index_t nParm, index_t nSample, const IType *alpha, const IType *beta, OType *out) { RNG_KERNEL_LOOP(xpu, FType, id, gen, N, step, { @@ -232,8 +233,8 @@ MSHADOW_XINLINE int SamplePoisson(float lambda, typename RandGenerator struct SamplePoissonKernel { template - MSHADOW_XINLINE static void Map(int id, RandGenerator gen, - const int N, const int step, + MSHADOW_XINLINE static void Map(index_t id, RandGenerator gen, + const index_t N, const index_t step, index_t nParm, index_t nSample, const IType *lambda, OType *out) { RNG_KERNEL_LOOP(xpu, float, id, gen, N, step, { @@ -259,8 +260,8 @@ struct PoissonSampler { template struct SampleNegativeBinomialKernel { template - MSHADOW_XINLINE static void Map(int id, RandGenerator gen, - const int N, const int step, + MSHADOW_XINLINE static void Map(index_t id, RandGenerator gen, + const index_t N, const index_t step, index_t nParm, index_t nSample, const IType *k, const IType *p, OType *out) { RNG_KERNEL_LOOP(xpu, float, id, gen, N, 
step, { @@ -291,8 +292,8 @@ struct NegativeBinomialSampler { template struct SampleGeneralizedNegativeBinomialKernel { template - MSHADOW_XINLINE static void Map(int id, RandGenerator gen, - const int N, const int step, + MSHADOW_XINLINE static void Map(index_t id, RandGenerator gen, + const index_t N, const index_t step, index_t nParm, index_t nSample, const IType *mu, const IType *alpha, OType *out) { RNG_KERNEL_LOOP(xpu, float, id, gen, N, step, { From 08bd8abfdb50927ea19e5d5fa39fbf684e49f03e Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 8 Nov 2018 15:52:13 -0800 Subject: [PATCH 08/43] add nightly test --- tests/nightly/test_large_array.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index 121acc174b51..a9edf4497890 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py @@ -29,6 +29,25 @@ def test_ndarray2numpy(self): x.shape test = x.asnumpy() assert (x.shape == test.shape) + + def test_ndarray_ones(self): + arr = nd.ones(shape=(100000000, 50)) + assert arr[-1][0] == 1 + assert nd.sum(arr).asnumpy() == 5000000000 + + def test_ndarray_zeros(self): + arr = nd.zeros(shape=(5000000000)) + assert arr.shape == (5000000000,) + assert arr.size == 5000000000 + + def test_ndarray_arrange(self): + arr = mx.nd.arange(0, 5000000000, dtype='int64') + assert arr[-1] == 4999999999 + assert mx.nd.slice(arr, begin=-2, end=-1) == 4999999998 + + def test_ndarray_random_uniform(self): + arr = mx.nd.random.uniform(shape=(100000000, 50)) + assert arr[-1][0] != 0 if __name__ == '__main__': unittest.main() From 244f3865e4381374b5551bc5ade642467b8cbe01 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 8 Nov 2018 16:05:38 -0800 Subject: [PATCH 09/43] fix lint error --- src/operator/tensor/indexing_op.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index 03981574fbf5..fba331e25705 100644 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -314,7 +314,8 @@ struct Take { * \param axis axis id */ template - MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* in_data, const IType* idx, + MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* in_data, + const IType* idx, const mshadow::Shape<10> in_stride, const mshadow::Shape<10> out_stride, const int in_ndims, const int out_ndims, const int idx_ndims, From 3ecd257c66391ed5961c6f8d7d81508ac166ec21 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 8 Nov 2018 17:14:28 -0800 Subject: [PATCH 10/43] fix compilation error on gpu --- src/operator/mxnet_op.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index e1cbd47bf242..16ba9aefaa7a 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -512,7 +512,7 @@ struct Kernel { } } else { #pragma omp parallel for num_threads(omp_threads) - for (size_t i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { OP::Map(i, args...); } } @@ -546,7 +546,7 @@ struct Kernel { } } else { #pragma omp parallel for num_threads(omp_threads) - for (size_t i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { OP::Map(i, args...); } } From c70afe8a3d3bedfad43460d9bcd1dfa6f293e342 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 8 Nov 2018 22:23:43 -0800 Subject: [PATCH 11/43] fix gpu compilation --- src/operator/tensor/indexing_op.cu | 10 +++++----- 1 file 
changed, 5 insertions(+), 5 deletions(-) diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index 0d72b1815fde..bad3e5a1a6c5 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -439,22 +439,22 @@ inline void SparseEmbeddingOpBackwardRspImpl(const bool deterministic, struct backward_gather_nd_gpu { template - MSHADOW_XINLINE static void Map(int i, int N, int M, int K, + MSHADOW_XINLINE static void Map(index_t i, index_t N, index_t M, index_t K, const mshadow::Shape<10> strides, DType* out, const DType* data, const IType* indices) { - int offset = 0; - for (int j = 0; j < M; ++j) { + index_t offset = 0; + for (index_t j = 0; j < M; ++j) { offset += strides[j] * static_cast(indices[j*N + i]); } - for (int j = 0; j < K; ++j) { + for (index_t j = 0; j < K; ++j) { atomicAdd(out + (offset + j), data[i * K + j]); } } }; template -inline void GatherNDBackwardImpl(int N, int M, int K, +inline void GatherNDBackwardImpl(index_t N, index_t M, index_t K, const mshadow::Shape<10> strides, DType* out, const DType* data, From ffcd17548c6037f9430ac684569b7689d657351e Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 9 Nov 2018 00:03:38 -0800 Subject: [PATCH 12/43] fix build issue --- src/operator/mxnet_op.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index 16ba9aefaa7a..49f3b717bc63 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -512,7 +512,7 @@ struct Kernel { } } else { #pragma omp parallel for num_threads(omp_threads) - for (index_t i = 0; i < N; ++i) { + for (index_t i = 0; i < static_cast(N); ++i) { OP::Map(i, args...); } } @@ -546,7 +546,7 @@ struct Kernel { } } else { #pragma omp parallel for num_threads(omp_threads) - for (index_t i = 0; i < N; ++i) { + for (index_t i = 0; i < static_cast(N); ++i) { OP::Map(i, args...); } } From dbe0e6c347f92960420d8638f78e8ac5b3b4593e Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 9 Nov 2018 05:48:17 -0800 Subject: [PATCH 13/43] fix windows build error --- src/operator/mxnet_op.h | 2 +- src/operator/tensor/broadcast_reduce-inl.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index 49f3b717bc63..5445fdd8be53 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -573,7 +573,7 @@ struct Kernel { } else { const auto length = (N + omp_threads - 1) / omp_threads; #pragma omp parallel for num_threads(omp_threads) - for (auto i = 0; i < N; i += length) { + for (index_t i = 0; i < static_cast(N); i += length) { OP::Map(i, i + length > N ? 
N - i : length, args...); } } diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index 2db1b4441124..0c770bff74b6 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -200,7 +200,7 @@ void seq_reduce_compute(const size_t N, const size_t M, const bool addto, const Shape sshape, const Shape rshape, const Shape rstride) { #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (size_t idx = 0; idx < N; ++idx) { + for (index_t idx = 0; idx < static_cast(N); ++idx) { seq_reduce_assign(idx, M, addto, big, small, bshape, sshape, rshape, rstride); } @@ -312,7 +312,7 @@ void seq_reduce_compute(const size_t N, const size_t M, const bool addto, const Shape rhs_shape, const Shape rhs_stride, const Shape& lhs_shape0, const Shape& rhs_shape0) { #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (size_t idx = 0; idx < N; ++idx) { + for (index_t idx = 0; idx < static_cast(N); ++idx) { seq_reduce_assign(idx, M, addto, big, lhs, rhs, small, big_shape, lhs_shape0, rhs_shape0, small_shape, rshape, lhs_shape, rhs_shape, rstride, lhs_stride, rhs_stride); From 068018444312235617dca5abb52ec03b28a03a93 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 9 Nov 2018 13:30:50 -0800 Subject: [PATCH 14/43] fix build issue in windows --- cpp-package/include/mxnet-cpp/base.h | 2 +- cpp-package/include/mxnet-cpp/ndarray.hpp | 4 ++-- src/operator/tensor/broadcast_reduce-inl.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp-package/include/mxnet-cpp/base.h b/cpp-package/include/mxnet-cpp/base.h index d0f1bea15f00..e98f805d66ab 100644 --- a/cpp-package/include/mxnet-cpp/base.h +++ b/cpp-package/include/mxnet-cpp/base.h @@ -34,7 +34,7 @@ namespace mxnet { namespace cpp { -typedef unsigned index_t; +typedef int64_t index_t; enum OpReqType { /*! 
\brief no operation, do not write anything */ diff --git a/cpp-package/include/mxnet-cpp/ndarray.hpp b/cpp-package/include/mxnet-cpp/ndarray.hpp index 966cf75c9122..b29fa28e2996 100644 --- a/cpp-package/include/mxnet-cpp/ndarray.hpp +++ b/cpp-package/include/mxnet-cpp/ndarray.hpp @@ -76,7 +76,7 @@ inline NDArray::NDArray(const mx_float *data, const Shape &shape, MXNDArraySyncCopyFromCPU(handle, data, shape.Size()); blob_ptr_ = std::make_shared(handle); } -inline NDArray::NDArray(const std::vector &data, const Shape &shape, +inline NDArray::NDArray(const std::vector &data, const Shape &shape, const Context &context) { NDArrayHandle handle; CHECK_EQ(MXNDArrayCreate(shape.data(), shape.ndim(), context.GetDeviceType(), @@ -397,7 +397,7 @@ inline size_t NDArray::Size() const { } inline std::vector NDArray::GetShape() const { - const mx_uint *out_pdata; + const index_t *out_pdata; mx_uint out_dim; MXNDArrayGetShape(blob_ptr_->handle_, &out_dim, &out_pdata); std::vector ret; diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index 0c770bff74b6..68dcfad79ebb 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -249,7 +249,7 @@ void ReduceWithExtraMem(Stream* s, const TBlob& small, const OpReqType req, index_t* ws_dptr = reinterpret_cast(workspace.dptr_); size_t N = small.shape_.Size(), M = rshape.Size(); #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (size_t k = 0; k < M; k++) { + for (index_t k = 0; k < static_cast(M); k++) { Shape coord = unravel(k, rshape); ws_dptr[k] = dot(coord, rstride); } From 8fda02aa4f618566b3835f0f38222d91c213b72b Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 9 Nov 2018 13:55:03 -0800 Subject: [PATCH 15/43] fix omp build issue --- src/operator/tensor/broadcast_reduce-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index 68dcfad79ebb..141d2fb83d0d 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -215,7 +215,7 @@ void seq_reduce_compute_extra_mem(const size_t N, const size_t M, const bool add const Shape rstride, const index_t* ws_dptr) { #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (size_t idx = 0; idx < N; ++idx) { + for (index_t idx = 0; idx < static_cast(N); ++idx) { Shape coord = unravel(idx, sshape); index_t j = ravel(coord, bshape); DType val, residual; From 87cd1445e0bd34fef015c7d59a0618e2799697be Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 9 Nov 2018 14:09:50 -0800 Subject: [PATCH 16/43] fix cpp-package build error --- cpp-package/include/mxnet-cpp/ndarray.h | 4 +-- cpp-package/include/mxnet-cpp/ndarray.hpp | 12 ++++----- cpp-package/include/mxnet-cpp/symbol.h | 8 +++--- cpp-package/include/mxnet-cpp/symbol.hpp | 30 +++++++++++------------ 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/cpp-package/include/mxnet-cpp/ndarray.h b/cpp-package/include/mxnet-cpp/ndarray.h index 6f37d91aa68e..ce1095f1cb49 100644 --- a/cpp-package/include/mxnet-cpp/ndarray.h +++ b/cpp-package/include/mxnet-cpp/ndarray.h @@ -134,7 +134,7 @@ class NDArray { * \param constext context of NDArray * \param delay_alloc whether delay the allocation */ - NDArray(const std::vector &shape, const Context &context, + NDArray(const std::vector &shape, const Context &context, bool delay_alloc = true); 
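With the C++ package's index_t widened to int64_t, shape vectors and the element counts derived from them stay 64-bit end to end. A minimal standalone sketch (not MXNet code; the typedef is assumed from the base.h change above) of why the count must also be accumulated in a 64-bit type:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    using index_t = int64_t;  // mirrors the cpp-package typedef introduced above

    int main() {
      std::vector<index_t> shape{100000000, 50};  // 5e9 elements in total
      index_t size = 1;
      for (index_t d : shape) size *= d;          // a 32-bit int would overflow here
      std::cout << size << std::endl;             // prints 5000000000
      return 0;
    }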
/*! * \brief construct a new dynamic NDArray @@ -444,7 +444,7 @@ class NDArray { /*! * \return the shape of current NDArray, in the form of mx_uint vector */ - std::vector GetShape() const; + std::vector GetShape() const; /*! * \return the data type of current NDArray */ diff --git a/cpp-package/include/mxnet-cpp/ndarray.hpp b/cpp-package/include/mxnet-cpp/ndarray.hpp index b29fa28e2996..90ce3604facc 100644 --- a/cpp-package/include/mxnet-cpp/ndarray.hpp +++ b/cpp-package/include/mxnet-cpp/ndarray.hpp @@ -46,7 +46,7 @@ inline NDArray::NDArray() { inline NDArray::NDArray(const NDArrayHandle &handle) { blob_ptr_ = std::make_shared(handle); } -inline NDArray::NDArray(const std::vector &shape, const Context &context, +inline NDArray::NDArray(const std::vector &shape, const Context &context, bool delay_alloc) { NDArrayHandle handle; CHECK_EQ(MXNDArrayCreate(shape.data(), shape.size(), context.GetDeviceType(), @@ -76,7 +76,7 @@ inline NDArray::NDArray(const mx_float *data, const Shape &shape, MXNDArraySyncCopyFromCPU(handle, data, shape.Size()); blob_ptr_ = std::make_shared(handle); } -inline NDArray::NDArray(const std::vector &data, const Shape &shape, +inline NDArray::NDArray(const std::vector &data, const Shape &shape, const Context &context) { NDArrayHandle handle; CHECK_EQ(MXNDArrayCreate(shape.data(), shape.ndim(), context.GetDeviceType(), @@ -396,12 +396,12 @@ inline size_t NDArray::Size() const { return ret; } -inline std::vector NDArray::GetShape() const { +inline std::vector NDArray::GetShape() const { const index_t *out_pdata; - mx_uint out_dim; + index_t out_dim; MXNDArrayGetShape(blob_ptr_->handle_, &out_dim, &out_pdata); - std::vector ret; - for (mx_uint i = 0; i < out_dim; ++i) { + std::vector ret; + for (index_t i = 0; i < out_dim; ++i) { ret.push_back(out_pdata[i]); } return ret; diff --git a/cpp-package/include/mxnet-cpp/symbol.h b/cpp-package/include/mxnet-cpp/symbol.h index a25824cad602..80eec6376b48 100644 --- a/cpp-package/include/mxnet-cpp/symbol.h +++ b/cpp-package/include/mxnet-cpp/symbol.h @@ -161,10 +161,10 @@ class Symbol { * \param aux_shapes use to store the infered shapes of auxiliary states */ void InferShape( - const std::map > &arg_shapes, - std::vector > *in_shape, - std::vector > *aux_shape, - std::vector > *out_shape) const; + const std::map > &arg_shapes, + std::vector > *in_shape, + std::vector > *aux_shape, + std::vector > *out_shape) const; /*! * \brief List the arguments names. 
* diff --git a/cpp-package/include/mxnet-cpp/symbol.hpp b/cpp-package/include/mxnet-cpp/symbol.hpp index b82e060ca8da..938d61883190 100644 --- a/cpp-package/include/mxnet-cpp/symbol.hpp +++ b/cpp-package/include/mxnet-cpp/symbol.hpp @@ -181,14 +181,14 @@ inline std::string Symbol::GetName() const { } inline void Symbol::InferShape( - const std::map > &arg_shapes, - std::vector > *in_shape, - std::vector > *aux_shape, - std::vector > *out_shape) const { + const std::map > &arg_shapes, + std::vector > *in_shape, + std::vector > *aux_shape, + std::vector > *out_shape) const { std::vector keys; std::vector arg_ind_ptr; - std::vector arg_shape_data; + std::vector arg_shape_data; for (const auto &arg : arg_shapes) { keys.push_back(arg.first.c_str()); @@ -201,13 +201,13 @@ inline void Symbol::InferShape( mx_uint in_shape_size; const mx_uint *in_shape_ndim; - const mx_uint **in_shape_data; + const index_t **in_shape_data; mx_uint out_shape_size; const mx_uint *out_shape_ndim; - const mx_uint **out_shape_data; + const index_t **out_shape_data; mx_uint aux_shape_size; const mx_uint *aux_shape_ndim; - const mx_uint **aux_shape_data; + const index_t **aux_shape_data; int complete; CHECK_EQ(MXSymbolInferShape(GetHandle(), keys.size(), keys.data(), @@ -220,19 +220,19 @@ inline void Symbol::InferShape( if (complete) { for (mx_uint i = 0; i < in_shape_size; ++i) { - in_shape->push_back(std::vector()); + in_shape->push_back(std::vector()); for (mx_uint j = 0; j < in_shape_ndim[i]; ++j) { (*in_shape)[i].push_back(in_shape_data[i][j]); } } for (mx_uint i = 0; i < aux_shape_size; ++i) { - aux_shape->push_back(std::vector()); + aux_shape->push_back(std::vector()); for (mx_uint j = 0; j < aux_shape_ndim[i]; ++j) { (*aux_shape)[i].push_back(aux_shape_data[i][j]); } } for (mx_uint i = 0; i < out_shape_size; ++i) { - out_shape->push_back(std::vector()); + out_shape->push_back(std::vector()); for (mx_uint j = 0; j < out_shape_ndim[i]; ++j) { (*out_shape)[i].push_back(out_shape_data[i][j]); } @@ -250,8 +250,8 @@ inline void Symbol::InferExecutorArrays( const std::map &aux_map) const { const auto arg_name_list = ListArguments(); - std::vector > in_shapes, aux_shapes, out_shapes; - std::map > arg_shapes; + std::vector > in_shapes, aux_shapes, out_shapes; + std::map > arg_shapes; for (const auto &arg_name : arg_name_list) { auto iter = args_map.find(arg_name); @@ -307,8 +307,8 @@ inline void Symbol::InferArgsMap( const std::map &known_args) const { const auto arg_name_list = ListArguments(); - std::vector > in_shapes, aux_shapes, out_shapes; - std::map > arg_shapes; + std::vector > in_shapes, aux_shapes, out_shapes; + std::map > arg_shapes; for (const auto &arg_name : arg_name_list) { auto iter = known_args.find(arg_name); From 7afc7a8cb5c7b559555f4e46890b1c00f0a3b17f Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 9 Nov 2018 15:05:58 -0800 Subject: [PATCH 17/43] fix mkldnn build --- cpp-package/include/mxnet-cpp/initializer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp-package/include/mxnet-cpp/initializer.h b/cpp-package/include/mxnet-cpp/initializer.h index 021808b38e34..df34928e6387 100644 --- a/cpp-package/include/mxnet-cpp/initializer.h +++ b/cpp-package/include/mxnet-cpp/initializer.h @@ -167,7 +167,7 @@ class Xavier : public Initializer { Shape shape(arr->GetShape()); float hw_scale = 1.0f; if (shape.ndim() > 2) { - for (size_t i = 2; i < shape.ndim(); ++i) { + for (index_t i = 2; i < shape.ndim(); ++i) { hw_scale *= shape[i]; } } From 862be243f9d7f58a91856a542f604a7102b44253 Mon 
Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 9 Nov 2018 20:58:36 -0800 Subject: [PATCH 18/43] fix an array size bound --- python/mxnet/ndarray/ndarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index c35d3f7e3932..a842a4507f99 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -131,7 +131,7 @@ def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t): """ hdl = NDArrayHandle() check_call(_LIB.MXNDArrayCreateEx( - c_array_buf(mx_long, native_array('I', shape)), + c_array_buf(mx_long, native_array('q', shape)), mx_uint(len(shape)), ctypes.c_int(ctx.device_typeid), ctypes.c_int(ctx.device_id), From 22213fae9e6e229ea2086419f4723659ea583737 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Sat, 10 Nov 2018 06:15:42 -0800 Subject: [PATCH 19/43] add constants in tests --- tests/nightly/test_large_array.py | 58 ++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index a9edf4497890..3f2cf35daae1 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py @@ -17,37 +17,53 @@ import unittest import mxnet as mx +import numpy as np from mxnet import gluon, nd +# dimension constants +MEDIUM_X = 10000 +LARGE_X = MEDIUM_X * MEDIUM_X +SMALL_Y = 50 +LARGE_SIZE = LARGE_X * SMALL_Y class TestLargeArray(unittest.TestCase): - def test_ndarray2numpy(self): - m = gluon.nn.Embedding(14000, 128) + def test_gluon_embedding(self): + m = gluon.nn.Embedding(SMALL_Y, MEDIUM_X) m.initialize() - ind = nd.zeros((700000, 128)) - x = m(ind) - x.shape - test = x.asnumpy() - assert (x.shape == test.shape) - - def test_ndarray_ones(self): - arr = nd.ones(shape=(100000000, 50)) - assert arr[-1][0] == 1 - assert nd.sum(arr).asnumpy() == 5000000000 + a = nd.zeros((MEDIUM_X, SMALL_Y)) + b = m(a) + assert b.shape == (MEDIUM_X, SMALL_Y, MEDIUM_X) + assert b.asnumpy().size == LARGE_SIZE def test_ndarray_zeros(self): - arr = nd.zeros(shape=(5000000000)) - assert arr.shape == (5000000000,) - assert arr.size == 5000000000 + a = nd.zeros(shape=(LARGE_X, SMALL_Y)) + assert a[-1][0] == 0 + assert a.shape == (LARGE_X, SMALL_Y) + assert a.size == LARGE_SIZE + + def test_ndarray_ones(self): + a = nd.ones(shape=(LARGE_X, SMALL_Y)) + assert a[-1][0] == 1 + assert nd.sum(a).asnumpy() == LARGE_SIZE - def test_ndarray_arrange(self): - arr = mx.nd.arange(0, 5000000000, dtype='int64') - assert arr[-1] == 4999999999 - assert mx.nd.slice(arr, begin=-2, end=-1) == 4999999998 + def test_ndarray_zeros2(self): + a = nd.zeros(shape=(LARGE_SIZE)) + assert a[LARGE_SIZE-1] == 0 + assert a.shape == (LARGE_SIZE,) + + def test_ndarray_arange(self): + a = nd.arange(0, LARGE_SIZE, dtype='int64') + assert a[-1] == LARGE_SIZE - 1 + assert nd.slice(a, begin=-2, end=-1) == (LARGE_SIZE - 2) def test_ndarray_random_uniform(self): - arr = mx.nd.random.uniform(shape=(100000000, 50)) - assert arr[-1][0] != 0 + a = nd.random.uniform(shape=(LARGE_X, SMALL_Y)) + assert a[-1][0] != 0 + + def test_ndarray_empty(self): + a = np.empty((LARGE_SIZE,)) + b = nd.array(a) + assert b.shape == (LARGE_SIZE,) if __name__ == '__main__': unittest.main() From cbaa553473528a493299a8c9499777f65e067fa4 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 13 Nov 2018 13:50:24 -0800 Subject: [PATCH 20/43] fix sparse array --- include/mxnet/c_api.h | 2 +- python/mxnet/ndarray/sparse.py | 6 +++--- src/c_api/c_api.cc | 2 +- 3 files 
changed, 5 insertions(+), 5 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index c93767731d67..7084424d7221 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -542,7 +542,7 @@ MXNET_DLL int MXNDArrayCreateSparseEx(int storage_type, mx_uint num_aux, int *aux_type, mx_uint *aux_ndims, - const mx_uint *aux_shape, + const dim_t *aux_shape, NDArrayHandle *out); /*! diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py index 7b4cc90648c2..51b3793f2415 100644 --- a/python/mxnet/ndarray/sparse.py +++ b/python/mxnet/ndarray/sparse.py @@ -41,7 +41,7 @@ from ..base import NotSupportedForSparseNDArray from ..base import _LIB, numeric_types from ..base import c_array_buf, mx_real_t, integer_types -from ..base import mx_uint, NDArrayHandle, check_call +from ..base import mx_uint, mx_long, NDArrayHandle, check_call from ..context import Context, current_context from . import _internal from . import op @@ -90,7 +90,7 @@ def _new_alloc_handle(stype, shape, ctx, delay_alloc, dtype, aux_types, aux_shap num_aux = mx_uint(len(aux_types)) check_call(_LIB.MXNDArrayCreateSparseEx( ctypes.c_int(int(_STORAGE_TYPE_STR_TO_ID[stype])), - c_array_buf(mx_uint, native_array('I', shape)), + c_array_buf(mx_long, native_array('q', shape)), mx_uint(len(shape)), ctypes.c_int(ctx.device_typeid), ctypes.c_int(ctx.device_id), @@ -99,7 +99,7 @@ def _new_alloc_handle(stype, shape, ctx, delay_alloc, dtype, aux_types, aux_shap num_aux, c_array_buf(ctypes.c_int, native_array('i', aux_type_ids)), c_array_buf(mx_uint, native_array('I', aux_shape_lens)), - c_array_buf(mx_uint, native_array('I', aux_shapes)), + c_array_buf(mx_long, native_array('q', aux_shapes)), ctypes.byref(hdl))) return hdl diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index d503bf4f19db..1ed997e18a64 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -191,7 +191,7 @@ int MXNDArrayCreateSparseEx(int storage_type, mx_uint num_aux, int *aux_type, mx_uint *aux_ndims, - const mx_uint *aux_shape, + const dim_t *aux_shape, NDArrayHandle *out) { API_BEGIN(); std::vector aux_types; From 7eca035642c2dd3f2340c77faf201461d3a1b52d Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 13 Nov 2018 14:57:46 -0800 Subject: [PATCH 21/43] fix unit test --- python/mxnet/symbol/symbol.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index c6575072cc70..76b8a49e2a4f 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -34,7 +34,7 @@ from ..attribute import AttrScope from ..base import _LIB, numeric_types, c_array, c_array_buf, c_str, c_str_array, c_handle_array -from ..base import mx_uint, py_str, string_types, integer_types +from ..base import mx_uint, mx_long, py_str, string_types, integer_types from ..base import NDArrayHandle, ExecutorHandle, SymbolHandle from ..base import check_call, MXNetError, NotImplementedForSymbol from ..context import Context, current_context @@ -1096,13 +1096,13 @@ def _infer_shape_impl(self, partial, *args, **kwargs): keys = c_str_array(str_keys) arg_shape_size = mx_uint() arg_shape_ndim = ctypes.POINTER(mx_uint)() - arg_shape_data = ctypes.POINTER(ctypes.POINTER(mx_uint))() + arg_shape_data = ctypes.POINTER(ctypes.POINTER(mx_long))() out_shape_size = mx_uint() out_shape_ndim = ctypes.POINTER(mx_uint)() - out_shape_data = ctypes.POINTER(ctypes.POINTER(mx_uint))() + out_shape_data = ctypes.POINTER(ctypes.POINTER(mx_long))() aux_shape_size = mx_uint() aux_shape_ndim 
= ctypes.POINTER(mx_uint)() - aux_shape_data = ctypes.POINTER(ctypes.POINTER(mx_uint))() + aux_shape_data = ctypes.POINTER(ctypes.POINTER(mx_long))() complete = ctypes.c_int() if partial: infer_func = _LIB.MXSymbolInferShapePartial @@ -1113,7 +1113,7 @@ def _infer_shape_impl(self, partial, *args, **kwargs): mx_uint(len(indptr) - 1), keys, c_array_buf(mx_uint, array('I', indptr)), - c_array_buf(mx_uint, array('I', sdata)), + c_array_buf(mx_long, array('q', sdata)), ctypes.byref(arg_shape_size), ctypes.byref(arg_shape_ndim), ctypes.byref(arg_shape_data), From 1b48d4a2158a14985950fc8a4465e1c9108f56f8 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 13 Nov 2018 16:31:17 -0800 Subject: [PATCH 22/43] fix unit test --- include/mxnet/c_api.h | 2 +- python/mxnet/ndarray/sparse.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 7084424d7221..2dcc7992aade 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -650,7 +650,7 @@ MXNET_DLL int MXNDArraySyncCopyToCPU(NDArrayHandle handle, */ MXNET_DLL int MXNDArraySyncCopyFromNDArray(NDArrayHandle handle_dst, const NDArrayHandle handle_src, - const int i); + const dim_t i); /*! * \brief check whether the NDArray format is valid diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py index 51b3793f2415..13f8ab6d0e5b 100644 --- a/python/mxnet/ndarray/sparse.py +++ b/python/mxnet/ndarray/sparse.py @@ -1009,9 +1009,9 @@ def _csr_matrix_from_definition(data, indices, indptr, shape=None, ctx=None, raise ValueError('invalid shape') result = CSRNDArray(_new_alloc_handle(storage_type, shape, ctx, False, dtype, [indptr_type, indices_type], aux_shapes)) - check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, data.handle, ctypes.c_int(-1))) - check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, indptr.handle, ctypes.c_int(0))) - check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, indices.handle, ctypes.c_int(1))) + check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, data.handle, ctypes.c_longlong(-1))) + check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, indptr.handle, ctypes.c_longlong(0))) + check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, indices.handle, ctypes.c_longlong(1))) return result # pylint: enable= no-member, protected-access From cb2ee1ef9b2c95e460bf9b831f627a2c9f5258e5 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 13 Nov 2018 17:16:28 -0800 Subject: [PATCH 23/43] fix R and scala package build --- R-package/src/base.h | 4 ++-- R-package/src/ndarray.cc | 6 +++--- R-package/src/symbol.cc | 6 +++--- .../src/main/native/org_apache_mxnet_native_c_api.cc | 12 ++++++------ 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/R-package/src/base.h b/R-package/src/base.h index 8645d8576b0e..6d15b8b1a0ce 100644 --- a/R-package/src/base.h +++ b/R-package/src/base.h @@ -354,8 +354,8 @@ inline std::vector SafeGetListNames(const Rcpp::List& src) { * \param rshape The dimension in R * \return A internal vector representation of shapes in mxnet. 
*/ -inline std::vector Dim2InternalShape(const Rcpp::Dimension &rshape) { - std::vector shape(rshape.size()); +inline std::vector Dim2InternalShape(const Rcpp::Dimension &rshape) { + std::vector shape(rshape.size()); for (size_t i = 0; i < rshape.size(); ++i) { shape[rshape.size() - i - 1] = rshape[i]; } diff --git a/R-package/src/ndarray.cc b/R-package/src/ndarray.cc index 94d24f3fb46b..fdcac7e5079f 100644 --- a/R-package/src/ndarray.cc +++ b/R-package/src/ndarray.cc @@ -180,7 +180,7 @@ Rcpp::RObject NDArrayPacker::CreateNDArrayPacker() { Rcpp::Dimension NDArray::dim() const { mx_uint ndim; - const mx_uint *pshape; + const dim_t *pshape; MX_CALL(MXNDArrayGetShape( ptr_->handle, &ndim, &pshape)); Rcpp::IntegerVector dat(pshape, pshape + ndim); @@ -190,7 +190,7 @@ Rcpp::Dimension NDArray::dim() const { } NDArray NDArray::Clone() const { - std::vector shape = Dim2InternalShape(this->dim()); + std::vector shape = Dim2InternalShape(this->dim()); Context ctx = this->ctx(); NDArrayHandle handle; MX_CALL(MXNDArrayCreate(dmlc::BeginPtr(shape), @@ -276,7 +276,7 @@ Rcpp::List NDArray::Load(const std::string& filename) { NDArray::RObjectType NDArray::Empty( const Rcpp::Dimension& rshape, const Context::RObjectType& rctx) { - std::vector shape = Dim2InternalShape(rshape); + std::vector shape = Dim2InternalShape(rshape); Context ctx(rctx); NDArrayHandle handle; MX_CALL(MXNDArrayCreate(dmlc::BeginPtr(shape), diff --git a/R-package/src/symbol.cc b/R-package/src/symbol.cc index 031c9a254019..a6de8da77ed1 100644 --- a/R-package/src/symbol.cc +++ b/R-package/src/symbol.cc @@ -198,13 +198,13 @@ SEXP Symbol::InferShape(const Rcpp::List& kwargs) const { mx_uint in_shape_size; const mx_uint *in_shape_ndim; - const mx_uint **in_shape_data; + const dim_t **in_shape_data; mx_uint out_shape_size; const mx_uint *out_shape_ndim; - const mx_uint **out_shape_data; + const dim_t **out_shape_data; mx_uint aux_shape_size; const mx_uint *aux_shape_ndim; - const mx_uint **aux_shape_data; + const dim_t **aux_shape_data; int complete; MX_CALL(MXSymbolInferShape( diff --git a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc index 17d166eac345..60add9f218a1 100644 --- a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc +++ b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc @@ -85,7 +85,7 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayCreateEx jint devId, jint delayAlloc, jint dtype, jobject ndArrayHandle) { jint *shapeArr = env->GetIntArrayElements(shape, NULL); NDArrayHandle out; - int ret = MXNDArrayCreateEx(reinterpret_cast(shapeArr), static_cast(ndim), + int ret = MXNDArrayCreateEx(reinterpret_cast(shapeArr), static_cast(ndim), devType, devId, delayAlloc, dtype, &out); env->ReleaseIntArrayElements(shape, shapeArr, 0); SetLongField(env, ndArrayHandle, reinterpret_cast(out)); @@ -354,7 +354,7 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayLoadFromRawBytes JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayGetShape (JNIEnv *env, jobject obj, jlong ndArrayPtr, jobject ndimRef, jobject dataBuf) { mx_uint ndim; - const mx_uint *pdata; + const dim_t *pdata; int ret = MXNDArrayGetShape(reinterpret_cast(ndArrayPtr), &ndim, &pdata); // fill dataBuf @@ -1549,15 +1549,15 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape mx_uint inShapeSize; const mx_uint *inShapeNdim; - const mx_uint **inShapeData; + const dim_t **inShapeData; 
mx_uint outShapeSize; const mx_uint *outShapeNdim; - const mx_uint **outShapeData; + const dim_t **outShapeData; mx_uint auxShapeSize; const mx_uint *auxShapeNdim; - const mx_uint **auxShapeData; + const dim_t **auxShapeData; int complete; @@ -1567,7 +1567,7 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape static_cast(jnumArgs), keys, reinterpret_cast(argIndPtr), - reinterpret_cast(argShapeData), + reinterpret_cast(argShapeData), &inShapeSize, &inShapeNdim, &inShapeData, From 08471f2950c7ed183fba68e23255c3120ff8c8e3 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 13 Nov 2018 22:03:04 -0800 Subject: [PATCH 24/43] Fix build error in scala, julia, perl --- cpp-package/include/mxnet-cpp/ndarray.hpp | 4 +-- include/mxnet/c_api.h | 2 +- julia/src/base.jl | 1 + julia/src/ndarray.jl | 8 +++--- perl-package/AI-MXNetCAPI/mxnet.i | 32 +++++++++++------------ src/c_api/c_api.cc | 2 +- 6 files changed, 25 insertions(+), 24 deletions(-) diff --git a/cpp-package/include/mxnet-cpp/ndarray.hpp b/cpp-package/include/mxnet-cpp/ndarray.hpp index 90ce3604facc..75ca89a40485 100644 --- a/cpp-package/include/mxnet-cpp/ndarray.hpp +++ b/cpp-package/include/mxnet-cpp/ndarray.hpp @@ -398,10 +398,10 @@ inline size_t NDArray::Size() const { inline std::vector NDArray::GetShape() const { const index_t *out_pdata; - index_t out_dim; + mx_uint out_dim; MXNDArrayGetShape(blob_ptr_->handle_, &out_dim, &out_pdata); std::vector ret; - for (index_t i = 0; i < out_dim; ++i) { + for (mx_uint i = 0; i < out_dim; ++i) { ret.push_back(out_pdata[i]); } return ret; diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 2dcc7992aade..ff8fc0ecd8d3 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -748,7 +748,7 @@ MXNET_DLL int MXNDArrayReshape64(NDArrayHandle handle, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXNDArrayGetShape(NDArrayHandle handle, - dim_t *out_dim, + uint *out_dim, const dim_t **out_pdata); /*! * \brief get the content of the data in NDArray diff --git a/julia/src/base.jl b/julia/src/base.jl index ce1c183eafb5..0524d1ebde1e 100644 --- a/julia/src/base.jl +++ b/julia/src/base.jl @@ -26,6 +26,7 @@ Base.show(io::IO, e::MXError) = print(io, e.msg) # Common types used in MXNet API ################################################################################ const MX_uint = Cuint +const MX_long = Clonglong const MX_float = Cfloat const MX_handle = Ptr{Void} diff --git a/julia/src/ndarray.jl b/julia/src/ndarray.jl index 9e47150a1a00..d79d4c1b08c3 100644 --- a/julia/src/ndarray.jl +++ b/julia/src/ndarray.jl @@ -245,8 +245,8 @@ See also the notes on NDArray shapes [`NDArray`](@ref). """ function size(x::NDArray) ref_ndim = Ref{MX_uint}(0) - ref_shape = Ref{Ptr{MX_uint}}(0) - @mxcall(:MXNDArrayGetShape, (MX_handle, Ref{MX_uint}, Ref{Ptr{MX_uint}}), + ref_shape = Ref{Ptr{MX_long}}(0) + @mxcall(:MXNDArrayGetShape, (MX_handle, Ref{MX_uint}, Ref{Ptr{MX_long}}), x, ref_ndim, ref_shape) tuple(map(Int, flipdim(unsafe_wrap(Array, ref_shape[], ref_ndim[]),1))...) 
end @@ -278,8 +278,8 @@ ndims(x::NDArray) = ndims(x.handle) function ndims(x::MX_NDArrayHandle)::Int ref_ndim = Ref{MX_uint}(0) - ref_shape = Ref{Ptr{MX_uint}}(0) - @mxcall(:MXNDArrayGetShape, (MX_handle, Ref{MX_uint}, Ref{Ptr{MX_uint}}), + ref_shape = Ref{Ptr{MX_long}}(0) + @mxcall(:MXNDArrayGetShape, (MX_handle, Ref{MX_uint}, Ref{Ptr{MX_long}}), x, ref_ndim, ref_shape) ref_ndim[] end diff --git a/perl-package/AI-MXNetCAPI/mxnet.i b/perl-package/AI-MXNetCAPI/mxnet.i index b1907f5cd7ec..ba60fb30e8ed 100644 --- a/perl-package/AI-MXNetCAPI/mxnet.i +++ b/perl-package/AI-MXNetCAPI/mxnet.i @@ -384,7 +384,7 @@ int MXNDArrayCreateNone(NDArrayHandle *out); * \param out the returning handle * \return 0 when success, -1 when failure happens */ -int MXNDArrayCreate(const mx_uint *in, +int MXNDArrayCreate(const dim_t *in, mx_uint ndim, int dev_type, int dev_id, @@ -403,7 +403,7 @@ int MXNDArrayCreate(const mx_uint *in, * \param out the returning handle * \return 0 when success, -1 when failure happens */ -int MXNDArrayCreateEx(const mx_uint *in, +int MXNDArrayCreateEx(const dim_t *in, mx_uint ndim, int dev_type, int dev_id, @@ -428,7 +428,7 @@ int MXNDArrayCreateEx(const mx_uint *in, * \return 0 when success, -1 when failure happens */ int MXNDArrayCreateSparseEx(int storage_type, - const mx_uint *in, + const dim_t *in, mx_uint ndim, int dev_type, int dev_id, @@ -437,7 +437,7 @@ int MXNDArrayCreateSparseEx(int storage_type, mx_uint num_aux, int *in, mx_uint *in, - const mx_uint *in, + const dim_t *in, NDArrayHandle *out); @@ -589,8 +589,8 @@ int MXNDArrayFree(NDArrayHandle handle); * \return 0 when success, -1 when failure happens */ int MXNDArraySlice(NDArrayHandle handle, - mx_uint slice_begin, - mx_uint slice_end, + dim_t slice_begin, + dim_t slice_end, NDArrayHandle *out); /*! * \brief Index the NDArray along axis 0. @@ -600,7 +600,7 @@ int MXNDArraySlice(NDArrayHandle handle, * \return 0 when success, -1 when failure happens */ int MXNDArrayAt(NDArrayHandle handle, - mx_uint idx, + dim_t idx, NDArrayHandle *out); /*! * \brief get the storage type of the array @@ -642,7 +642,7 @@ int MXNDArrayReshape64(NDArrayHandle handle, */ int MXNDArrayGetShape(NDArrayHandle handle, mx_uint *out_dim, - const mx_uint **out_pdata); + const dim_t **out_pdata); /*! * \brief get the content of the data in NDArray * \param handle the handle to the ndarray @@ -1293,16 +1293,16 @@ int MXSymbolInferShape(SymbolHandle sym, mx_uint num_args, const char** in, const mx_uint *in, - const mx_uint *in, + const dim_t *in, mx_uint *in_shape_size, const mx_uint **in_shape_ndim, - const mx_uint ***in_shape_data, + const dim_t ***in_shape_data, mx_uint *out_shape_size, const mx_uint **out_shape_ndim, - const mx_uint ***out_shape_data, + const dim_t ***out_shape_data, mx_uint *aux_shape_size, const mx_uint **aux_shape_ndim, - const mx_uint ***aux_shape_data, + const dim_t ***aux_shape_data, int *out); /*! * \brief partially infer shape of unknown input shapes given the known one. 
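Each binding touched by this commit follows the same calling pattern against the widened C API: ask for ndim plus a borrowed pointer to dim_t dimensions, then copy them into the language's own container. A hedged C++ sketch of that pattern (the extern declaration is assumed to match the changed header, with mx_uint taken as unsigned int; error handling elided; the buffer is owned by MXNet's thread-local store and must be copied, not freed):

    #include <cstdint>
    #include <vector>

    typedef int64_t dim_t;   // assumption: mirrors dim_t in include/mxnet/c_api.h
    typedef void *NDArrayHandle;

    // Assumed to match the changed declaration in c_api.h.
    extern "C" int MXNDArrayGetShape(NDArrayHandle handle,
                                     unsigned *out_dim,
                                     const dim_t **out_pdata);

    std::vector<dim_t> GetShape(NDArrayHandle h) {
      unsigned ndim = 0;
      const dim_t *pdata = nullptr;
      MXNDArrayGetShape(h, &ndim, &pdata);             // pdata points into a thread-local buffer
      return std::vector<dim_t>(pdata, pdata + ndim);  // copy it out, do not free it
    }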
@@ -1365,13 +1365,13 @@ int MXSymbolInferShapePartial(SymbolHandle sym, int MXSymbolInferType(SymbolHandle sym, mx_uint num_args, const char** in, - const int *in, + const dim_t *in, mx_uint *in_type_size, - const int **in_type_data, + const dim_t **in_type_data, mx_uint *out_type_size, - const int **out_type_data, + const dim_t **out_type_data, mx_uint *aux_type_size, - const int **aux_type_data, + const dim_t **aux_type_data, int *out); //-------------------------------------------- // Part 4: Executor interface diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 1ed997e18a64..5ed6d085bb13 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -481,7 +481,7 @@ int MXNDArrayGetStorageType(NDArrayHandle handle, } int MXNDArrayGetShape(NDArrayHandle handle, - dim_t *out_dim, + mx_uint *out_dim, const dim_t **out_pdata) { MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); API_BEGIN(); From e5a3b32ffff3b40900bb087f609067fa98ae677a Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 13 Nov 2018 22:04:48 -0800 Subject: [PATCH 25/43] fix a typo --- include/mxnet/c_api.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index ff8fc0ecd8d3..93bf27ad5d70 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -748,7 +748,7 @@ MXNET_DLL int MXNDArrayReshape64(NDArrayHandle handle, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXNDArrayGetShape(NDArrayHandle handle, - uint *out_dim, + mx_uint *out_dim, const dim_t **out_pdata); /*! * \brief get the content of the data in NDArray From 629a7c5698449aa0a5e8aec3301e81a73fda1b5a Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 13 Nov 2018 22:43:58 -0800 Subject: [PATCH 26/43] fix R-package scala-package compiation error --- R-package/src/symbol.cc | 6 +++--- .../native/src/main/native/org_apache_mxnet_native_c_api.cc | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R-package/src/symbol.cc b/R-package/src/symbol.cc index a6de8da77ed1..c70bf794f39b 100644 --- a/R-package/src/symbol.cc +++ b/R-package/src/symbol.cc @@ -168,7 +168,7 @@ Symbol::RObjectType Symbol::GetOutput(mx_uint index) const { // helper function to convert shape into Rcpp vector inline Rcpp::List BuildShapeData(mx_uint shape_size, const mx_uint *shape_ndim, - const mx_uint **shape_data, + const dim_t **shape_data, const std::vector &names) { Rcpp::List ret(shape_size); for (mx_uint i = 0; i < shape_size; ++i) { @@ -185,12 +185,12 @@ SEXP Symbol::InferShape(const Rcpp::List& kwargs) const { << "Need to pass parameters in key=value style.\n"; std::vector keys = kwargs.names(); std::vector arg_ind_ptr(1, 0); - std::vector arg_shape_data; + std::vector arg_shape_data; for (size_t i = 0; i < kwargs.size(); ++i) { RCHECK(keys[i].length() != 0) << "Need to pass parameters in key=value style.\n"; - std::vector dim = Dim2InternalShape(kwargs[i]); + std::vector dim = Dim2InternalShape(kwargs[i]); arg_shape_data.insert(arg_shape_data.end(), dim.begin(), dim.end()); arg_ind_ptr.push_back(static_cast(arg_shape_data.size())); } diff --git a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc index 60add9f218a1..feaaa84d1a70 100644 --- a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc +++ b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc @@ -1519,7 +1519,7 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolCreateFromFile int 
FillSymbolInferShape (JNIEnv *env, jmethodID listAppend, jobject joutData, - mx_uint shapeSize, const mx_uint *shapeNdim, const mx_uint **shapeData) { + mx_uint shapeSize, const mx_uint *shapeNdim, const dim_t **shapeData) { for (size_t i = 0; i < shapeSize; ++i) { jintArray jshape = env->NewIntArray(shapeNdim[i]); if (jshape == NULL) { From 2dd990a379946ed2104c3cde58b1ab76fc64e4c6 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 14 Nov 2018 10:36:09 -0800 Subject: [PATCH 27/43] fix scala unit test --- .../src/main/native/org_apache_mxnet_native_c_api.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc index feaaa84d1a70..af6a42f200ef 100644 --- a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc +++ b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc @@ -81,13 +81,13 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayCreateNone } JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayCreateEx - (JNIEnv *env, jobject obj, jintArray shape, jint ndim, jint devType, + (JNIEnv *env, jobject obj, jlongArray shape, jint ndim, jint devType, jint devId, jint delayAlloc, jint dtype, jobject ndArrayHandle) { - jint *shapeArr = env->GetIntArrayElements(shape, NULL); + jlong *shapeArr = env->GetLongArrayElements(shape, NULL); NDArrayHandle out; int ret = MXNDArrayCreateEx(reinterpret_cast(shapeArr), static_cast(ndim), devType, devId, delayAlloc, dtype, &out); - env->ReleaseIntArrayElements(shape, shapeArr, 0); + env->ReleaseLongArrayElements(shape, shapeArr, 0); SetLongField(env, ndArrayHandle, reinterpret_cast(out)); return ret; } @@ -1526,7 +1526,7 @@ int FillSymbolInferShape // TODO(Yizhi): out of memory error thrown, return a specific error code ? 
return -1; } - env->SetIntArrayRegion(jshape, 0, shapeNdim[i], reinterpret_cast(shapeData[i])); + env->SetLongArrayRegion(jshape, 0, shapeNdim[i], reinterpret_cast(shapeData[i])); env->CallObjectMethod(joutData, listAppend, jshape); env->DeleteLocalRef(jshape); } @@ -1562,7 +1562,7 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape int complete; jint *argIndPtr = env->GetIntArrayElements(jargIndPtr, NULL); - jint *argShapeData = env->GetIntArrayElements(jargShapeData, NULL); + jlong *argShapeData = env->GetLongArrayElements(jargShapeData, NULL); int ret = MXSymbolInferShape(reinterpret_cast(symbolPtr), static_cast(jnumArgs), keys, @@ -1578,7 +1578,7 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape &auxShapeNdim, &auxShapeData, &complete); - env->ReleaseIntArrayElements(jargShapeData, argShapeData, 0); + env->ReleaseLongArrayElements(jargShapeData, argShapeData, 0); env->ReleaseIntArrayElements(jargIndPtr, argIndPtr, 0); if (ret == 0) { From 5b0cd3aa5f576198ab91cbe0ec08b16c94d4f737 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 14 Nov 2018 15:18:14 -0800 Subject: [PATCH 28/43] fix python2 unit test --- python/mxnet/ndarray/ndarray.py | 2 +- python/mxnet/ndarray/sparse.py | 4 ++-- python/mxnet/symbol/symbol.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index a842a4507f99..e8ddab30412b 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -131,7 +131,7 @@ def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t): """ hdl = NDArrayHandle() check_call(_LIB.MXNDArrayCreateEx( - c_array_buf(mx_long, native_array('q', shape)), + c_array_buf(mx_long, native_array('l', shape)), mx_uint(len(shape)), ctypes.c_int(ctx.device_typeid), ctypes.c_int(ctx.device_id), diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py index 13f8ab6d0e5b..502a2e1bff4e 100644 --- a/python/mxnet/ndarray/sparse.py +++ b/python/mxnet/ndarray/sparse.py @@ -90,7 +90,7 @@ def _new_alloc_handle(stype, shape, ctx, delay_alloc, dtype, aux_types, aux_shap num_aux = mx_uint(len(aux_types)) check_call(_LIB.MXNDArrayCreateSparseEx( ctypes.c_int(int(_STORAGE_TYPE_STR_TO_ID[stype])), - c_array_buf(mx_long, native_array('q', shape)), + c_array_buf(mx_long, native_array('l', shape)), mx_uint(len(shape)), ctypes.c_int(ctx.device_typeid), ctypes.c_int(ctx.device_id), @@ -99,7 +99,7 @@ def _new_alloc_handle(stype, shape, ctx, delay_alloc, dtype, aux_types, aux_shap num_aux, c_array_buf(ctypes.c_int, native_array('i', aux_type_ids)), c_array_buf(mx_uint, native_array('I', aux_shape_lens)), - c_array_buf(mx_long, native_array('q', aux_shapes)), + c_array_buf(mx_long, native_array('l', aux_shapes)), ctypes.byref(hdl))) return hdl diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index 76b8a49e2a4f..8381cf1bf37f 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -1113,7 +1113,7 @@ def _infer_shape_impl(self, partial, *args, **kwargs): mx_uint(len(indptr) - 1), keys, c_array_buf(mx_uint, array('I', indptr)), - c_array_buf(mx_long, array('q', sdata)), + c_array_buf(mx_long, array('l', sdata)), ctypes.byref(arg_shape_size), ctypes.byref(arg_shape_ndim), ctypes.byref(arg_shape_data), From 43ba3aa919dabc34732b71ffac049a4ced5b5462 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 14 Nov 2018 21:30:16 -0800 Subject: [PATCH 29/43] fix scala unit test --- 
.../native/src/main/native/org_apache_mxnet_native_c_api.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc index af6a42f200ef..1cfd847c8245 100644 --- a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc +++ b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc @@ -1521,7 +1521,7 @@ int FillSymbolInferShape (JNIEnv *env, jmethodID listAppend, jobject joutData, mx_uint shapeSize, const mx_uint *shapeNdim, const dim_t **shapeData) { for (size_t i = 0; i < shapeSize; ++i) { - jintArray jshape = env->NewIntArray(shapeNdim[i]); + jlongArray jshape = env->NewLongArray(shapeNdim[i]); if (jshape == NULL) { // TODO(Yizhi): out of memory error thrown, return a specific error code ? return -1; From 5286f6328a2537e9dae95ffc4e6cc3660fb36264 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 14 Nov 2018 22:24:41 -0800 Subject: [PATCH 30/43] fix scala unit test --- .../native/src/main/native/org_apache_mxnet_native_c_api.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc index 1cfd847c8245..8d24e4ef1b6d 100644 --- a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc +++ b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc @@ -1534,7 +1534,7 @@ int FillSymbolInferShape } JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape (JNIEnv *env, jobject obj, jlong symbolPtr, jint jnumArgs, jobjectArray jkeys, - jintArray jargIndPtr, jintArray jargShapeData, + jintArray jargIndPtr, jlongArray jargShapeData, jobject jinShapeData, jobject joutShapeData, jobject jauxShapeData, jobject jcomplete) { const char **keys = NULL; if (jkeys != NULL) { From 7247e6bd5266852591041161ed849ea563447e96 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 16 Nov 2018 09:48:54 -0800 Subject: [PATCH 31/43] fix scala build --- .../core/src/main/scala/org/apache/mxnet/LibInfo.scala | 6 +++--- .../core/src/main/scala/org/apache/mxnet/NDArray.scala | 4 ++-- .../core/src/main/scala/org/apache/mxnet/Symbol.scala | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/LibInfo.scala b/scala-package/core/src/main/scala/org/apache/mxnet/LibInfo.scala index 0a5683aa7ab3..75f6883b6131 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/LibInfo.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/LibInfo.scala @@ -40,7 +40,7 @@ private[mxnet] class LibInfo { paramVals: Array[String]): Int @native def mxNDArrayFree(handle: NDArrayHandle): Int @native def mxNDArrayCreateNone(out: NDArrayHandleRef): Int - @native def mxNDArrayCreateEx(shape: Array[Int], + @native def mxNDArrayCreateEx(shape: Array[Long], ndim: Int, devType: Int, devId: Int, @@ -75,7 +75,7 @@ private[mxnet] class LibInfo { paramVals: Array[Array[Byte]]): Int @native def mxNDArrayGetShape(handle: NDArrayHandle, ndim: MXUintRef, - data: ArrayBuffer[Int]): Int + data: ArrayBuffer[Long]): Int @native def mxNDArraySyncCopyToCPU(handle: NDArrayHandle, data: Array[Byte], size: Int): Int @@ -236,7 +236,7 @@ private[mxnet] class LibInfo { numArgs: MXUint, keys: Array[String], argIndPtr: Array[MXUint], - argShapeData: Array[MXUint], + argShapeData: Array[Long], inShapeData: ListBuffer[Array[Int]], 
outShapeData: ListBuffer[Array[Int]], auxShapeData: ListBuffer[Array[Int]], diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala index f9f2dbe42a90..8e59edf7e6ab 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala @@ -135,7 +135,7 @@ object NDArray extends NDArrayBase { dtype: DType = DType.Float32): NDArrayHandle = { val hdl = new NDArrayHandleRef checkCall(_LIB.mxNDArrayCreateEx( - shape.toArray, + shape.toArray.map(_.toLong), shape.length, ctx.deviceTypeid, ctx.deviceId, @@ -1012,7 +1012,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle, def shape: Shape = { val ndim = new MXUintRef val data = ArrayBuffer[Int]() - checkCall(_LIB.mxNDArrayGetShape(handle, ndim, data)) + checkCall(_LIB.mxNDArrayGetShape(handle, ndim, data.map(_.toLong))) require(ndim.value == data.length, s"ndim=$ndim, while len(data)=${data.length}") Shape(data) } diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala b/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala index 4472a8426f9f..73f8bc5e9b2b 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala @@ -264,7 +264,7 @@ class Symbol private(private[mxnet] val handle: SymbolHandle) extends NativeReso val auxShapeData = ListBuffer.empty[Array[Int]] val complete = new RefInt - checkCall(_LIB.mxSymbolInferShape(handle, indPtr.length - 1, keys, indPtr, values, + checkCall(_LIB.mxSymbolInferShape(handle, indPtr.length - 1, keys, indPtr, values.map(_.toLong), argShapeData, outShapeData, auxShapeData, complete)) if (complete.value != 0) { (argShapeData.map(s => Shape(s)).toIndexedSeq, From f8839b376f01fb73adf849ffefc24626c00a67a1 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 16 Nov 2018 12:04:50 -0800 Subject: [PATCH 32/43] fix python unit test --- python/mxnet/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 3e06f47a8ea3..a99179fc5c41 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -215,7 +215,7 @@ def _load_lib(): # type definitions mx_uint = ctypes.c_uint mx_float = ctypes.c_float -mx_long = ctypes.c_longlong +mx_long = ctypes.c_long mx_float_p = ctypes.POINTER(mx_float) mx_real_t = np.float32 NDArrayHandle = ctypes.c_void_p From e1cd1cd005486719b77a69748df98f03ec5eabad Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 16 Nov 2018 17:08:44 -0800 Subject: [PATCH 33/43] update scala-package to fix unittest --- .../main/scala/org/apache/mxnet/LibInfo.scala | 6 +++--- .../main/scala/org/apache/mxnet/NDArray.scala | 4 ++-- .../main/scala/org/apache/mxnet/Symbol.scala | 2 +- .../native/org_apache_mxnet_native_c_api.cc | 18 +++++++++--------- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/LibInfo.scala b/scala-package/core/src/main/scala/org/apache/mxnet/LibInfo.scala index 75f6883b6131..0a5683aa7ab3 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/LibInfo.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/LibInfo.scala @@ -40,7 +40,7 @@ private[mxnet] class LibInfo { paramVals: Array[String]): Int @native def mxNDArrayFree(handle: NDArrayHandle): Int @native def mxNDArrayCreateNone(out: NDArrayHandleRef): Int - @native def mxNDArrayCreateEx(shape: 
Array[Long], + @native def mxNDArrayCreateEx(shape: Array[Int], ndim: Int, devType: Int, devId: Int, @@ -75,7 +75,7 @@ private[mxnet] class LibInfo { paramVals: Array[Array[Byte]]): Int @native def mxNDArrayGetShape(handle: NDArrayHandle, ndim: MXUintRef, - data: ArrayBuffer[Long]): Int + data: ArrayBuffer[Int]): Int @native def mxNDArraySyncCopyToCPU(handle: NDArrayHandle, data: Array[Byte], size: Int): Int @@ -236,7 +236,7 @@ private[mxnet] class LibInfo { numArgs: MXUint, keys: Array[String], argIndPtr: Array[MXUint], - argShapeData: Array[Long], + argShapeData: Array[MXUint], inShapeData: ListBuffer[Array[Int]], outShapeData: ListBuffer[Array[Int]], auxShapeData: ListBuffer[Array[Int]], diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala index 8e59edf7e6ab..f9f2dbe42a90 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala @@ -135,7 +135,7 @@ object NDArray extends NDArrayBase { dtype: DType = DType.Float32): NDArrayHandle = { val hdl = new NDArrayHandleRef checkCall(_LIB.mxNDArrayCreateEx( - shape.toArray.map(_.toLong), + shape.toArray, shape.length, ctx.deviceTypeid, ctx.deviceId, @@ -1012,7 +1012,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle, def shape: Shape = { val ndim = new MXUintRef val data = ArrayBuffer[Int]() - checkCall(_LIB.mxNDArrayGetShape(handle, ndim, data.map(_.toLong))) + checkCall(_LIB.mxNDArrayGetShape(handle, ndim, data)) require(ndim.value == data.length, s"ndim=$ndim, while len(data)=${data.length}") Shape(data) } diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala b/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala index 73f8bc5e9b2b..4472a8426f9f 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala @@ -264,7 +264,7 @@ class Symbol private(private[mxnet] val handle: SymbolHandle) extends NativeReso val auxShapeData = ListBuffer.empty[Array[Int]] val complete = new RefInt - checkCall(_LIB.mxSymbolInferShape(handle, indPtr.length - 1, keys, indPtr, values.map(_.toLong), + checkCall(_LIB.mxSymbolInferShape(handle, indPtr.length - 1, keys, indPtr, values, argShapeData, outShapeData, auxShapeData, complete)) if (complete.value != 0) { (argShapeData.map(s => Shape(s)).toIndexedSeq, diff --git a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc index 8d24e4ef1b6d..e996b22e6d2f 100644 --- a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc +++ b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc @@ -81,13 +81,13 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayCreateNone } JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayCreateEx - (JNIEnv *env, jobject obj, jlongArray shape, jint ndim, jint devType, + (JNIEnv *env, jobject obj, jintArray shape, jint ndim, jint devType, jint devId, jint delayAlloc, jint dtype, jobject ndArrayHandle) { - jlong *shapeArr = env->GetLongArrayElements(shape, NULL); + jint *shapeArr = env->GetIntArrayElements(shape, NULL); NDArrayHandle out; int ret = MXNDArrayCreateEx(reinterpret_cast(shapeArr), static_cast(ndim), devType, devId, delayAlloc, dtype, &out); - env->ReleaseLongArrayElements(shape, shapeArr, 0); + 
env->ReleaseIntArrayElements(shape, shapeArr, 0); SetLongField(env, ndArrayHandle, reinterpret_cast(out)); return ret; } @@ -365,7 +365,7 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayGetShape jmethodID arrayAppend = env->GetMethodID(arrayClass, "$plus$eq", "(Ljava/lang/Object;)Lscala/collection/mutable/ArrayBuffer;"); for (size_t i = 0; i < ndim; ++i) { - jobject data = env->NewObject(integerClass, newInteger, pdata[i]); + jobject data = env->NewObject(integerClass, newInteger, static_cast(pdata[i])); env->CallObjectMethod(dataBuf, arrayAppend, data); env->DeleteLocalRef(data); } @@ -1521,12 +1521,12 @@ int FillSymbolInferShape (JNIEnv *env, jmethodID listAppend, jobject joutData, mx_uint shapeSize, const mx_uint *shapeNdim, const dim_t **shapeData) { for (size_t i = 0; i < shapeSize; ++i) { - jlongArray jshape = env->NewLongArray(shapeNdim[i]); + jintArray jshape = env->NewIntArray(shapeNdim[i]); if (jshape == NULL) { // TODO(Yizhi): out of memory error thrown, return a specific error code ? return -1; } - env->SetLongArrayRegion(jshape, 0, shapeNdim[i], reinterpret_cast(shapeData[i])); + env->SetIntArrayRegion(jshape, 0, shapeNdim[i], reinterpret_cast(shapeData[i])); env->CallObjectMethod(joutData, listAppend, jshape); env->DeleteLocalRef(jshape); } @@ -1534,7 +1534,7 @@ int FillSymbolInferShape } JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape (JNIEnv *env, jobject obj, jlong symbolPtr, jint jnumArgs, jobjectArray jkeys, - jintArray jargIndPtr, jlongArray jargShapeData, + jintArray jargIndPtr, jintArray jargShapeData, jobject jinShapeData, jobject joutShapeData, jobject jauxShapeData, jobject jcomplete) { const char **keys = NULL; if (jkeys != NULL) { @@ -1562,7 +1562,7 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape int complete; jint *argIndPtr = env->GetIntArrayElements(jargIndPtr, NULL); - jlong *argShapeData = env->GetLongArrayElements(jargShapeData, NULL); + jint *argShapeData = env->GetIntArrayElements(jargShapeData, NULL); int ret = MXSymbolInferShape(reinterpret_cast(symbolPtr), static_cast(jnumArgs), keys, @@ -1578,7 +1578,7 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape &auxShapeNdim, &auxShapeData, &complete); - env->ReleaseLongArrayElements(jargShapeData, argShapeData, 0); + env->ReleaseIntArrayElements(jargShapeData, argShapeData, 0); env->ReleaseIntArrayElements(jargIndPtr, argIndPtr, 0); if (ret == 0) { From e0fe05ce89f554ed5f1544bfbaa7a0a068ec2aaa Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 19 Nov 2018 15:15:24 -0800 Subject: [PATCH 34/43] fix scala unit test --- .../native/org_apache_mxnet_native_c_api.cc | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc index e996b22e6d2f..71fcf540b5e2 100644 --- a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc +++ b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc @@ -83,10 +83,18 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayCreateNone JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayCreateEx (JNIEnv *env, jobject obj, jintArray shape, jint ndim, jint devType, jint devId, jint delayAlloc, jint dtype, jobject ndArrayHandle) { + // TODO: this is a workaround to get scala unit test pass + // need to update scala APIs to support large array + const size_t length = 
env->GetArrayLength(shape); jint *shapeArr = env->GetIntArrayElements(shape, NULL); + jlong *tmpShapeArr = new jlong[length]; + for (size_t i = 0; i < length; ++i) { + tmpShapeArr[i] = shapeArr[i]; + } NDArrayHandle out; - int ret = MXNDArrayCreateEx(reinterpret_cast(shapeArr), static_cast(ndim), + int ret = MXNDArrayCreateEx(reinterpret_cast(tmpShapeArr), static_cast(ndim), devType, devId, delayAlloc, dtype, &out); + delete[] tmpShapeArr; env->ReleaseIntArrayElements(shape, shapeArr, 0); SetLongField(env, ndArrayHandle, reinterpret_cast(out)); return ret; @@ -381,8 +389,10 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayGetShape JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArraySyncCopyToCPU (JNIEnv *env, jobject obj, jlong ndArrayPtr, jbyteArray data, jint size) { jbyte *pdata = env->GetByteArrayElements(data, NULL); + // TODO: this is a workaround to get scala unit test pass + // need to update scala APIs to support large array int ret = MXNDArraySyncCopyToCPU(reinterpret_cast(ndArrayPtr), - reinterpret_cast(pdata), size); + reinterpret_cast(pdata), static_cast(size)); env->ReleaseByteArrayElements(data, pdata, 0); // copy back to java array automatically return ret; } @@ -417,8 +427,11 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayReshape JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArraySyncCopyFromCPU (JNIEnv *env, jobject obj, jlong arrayPtr, jfloatArray sourceArr, jint arrSize) { jfloat *sourcePtr = env->GetFloatArrayElements(sourceArr, NULL); + // TODO: this is a workaround to get scala unit test pass + // need to update scala APIs to support large array int ret = MXNDArraySyncCopyFromCPU(reinterpret_cast(arrayPtr), - static_cast(sourcePtr), arrSize); + static_cast(sourcePtr), + static_cast(arrSize)); env->ReleaseFloatArrayElements(sourceArr, sourcePtr, 0); return ret; } @@ -1563,11 +1576,18 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape jint *argIndPtr = env->GetIntArrayElements(jargIndPtr, NULL); jint *argShapeData = env->GetIntArrayElements(jargShapeData, NULL); + // TODO: this is a workaround to get scala unit test pass + // need to update scala APIs to support large array + const size_t argShapeLength = env->GetArrayLength(jargShapeData); + jlong *argShapeDataTmp = new jlong[argShapeLength]; + for (size_t i = 0; i < argShapeLength; ++i) { + argShapeDataTmp[i] = argShapeData[i]; + } int ret = MXSymbolInferShape(reinterpret_cast(symbolPtr), static_cast(jnumArgs), keys, reinterpret_cast(argIndPtr), - reinterpret_cast(argShapeData), + reinterpret_cast(argShapeDataTmp), &inShapeSize, &inShapeNdim, &inShapeData, @@ -1578,6 +1598,7 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape &auxShapeNdim, &auxShapeData, &complete); + delete[] argShapeDataTmp; env->ReleaseIntArrayElements(jargShapeData, argShapeData, 0); env->ReleaseIntArrayElements(jargIndPtr, argIndPtr, 0); From 024a0ce7efe6d410c47f1f7139970fbf43b16fc1 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 19 Nov 2018 15:34:23 -0800 Subject: [PATCH 35/43] fix array typecode for python 2 and python 3 --- python/mxnet/base.py | 2 +- python/mxnet/ndarray/ndarray.py | 3 ++- python/mxnet/ndarray/sparse.py | 5 +++-- python/mxnet/ndarray/utils.py | 6 ++++++ python/mxnet/symbol/symbol.py | 4 +++- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/python/mxnet/base.py b/python/mxnet/base.py index a99179fc5c41..3e06f47a8ea3 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -215,7 +215,7 @@ def 
_load_lib(): # type definitions mx_uint = ctypes.c_uint mx_float = ctypes.c_float -mx_long = ctypes.c_long +mx_long = ctypes.c_longlong mx_float_p = ctypes.POINTER(mx_float) mx_real_t = np.float32 NDArrayHandle = ctypes.c_void_p diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 368748762c9f..5cf69f12aead 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -41,6 +41,7 @@ from . import _internal from . import op from ._internal import NDArrayBase +from .utils import get_array_typecode __all__ = ["NDArray", "concatenate", "_DTYPE_NP_TO_MX", "_DTYPE_MX_TO_NP", "_GRAD_REQ_MAP", "ones", "add", "arange", "eye", "divide", "equal", "full", "greater", "greater_equal", @@ -131,7 +132,7 @@ def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t): """ hdl = NDArrayHandle() check_call(_LIB.MXNDArrayCreateEx( - c_array_buf(mx_long, native_array('l', shape)), + c_array_buf(mx_long, native_array(get_array_typecode(), shape)), mx_uint(len(shape)), ctypes.c_int(ctx.device_typeid), ctypes.c_int(ctx.device_id), diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py index dcb6a8908501..318c7cf16301 100644 --- a/python/mxnet/ndarray/sparse.py +++ b/python/mxnet/ndarray/sparse.py @@ -56,6 +56,7 @@ from .ndarray import zeros as _zeros_ndarray from .ndarray import array as _array from .ndarray import _ufunc_helper +from .utils import get_array_typecode try: @@ -90,7 +91,7 @@ def _new_alloc_handle(stype, shape, ctx, delay_alloc, dtype, aux_types, aux_shap num_aux = mx_uint(len(aux_types)) check_call(_LIB.MXNDArrayCreateSparseEx( ctypes.c_int(int(_STORAGE_TYPE_STR_TO_ID[stype])), - c_array_buf(mx_long, native_array('l', shape)), + c_array_buf(mx_long, native_array(get_array_typecode(), shape)), mx_uint(len(shape)), ctypes.c_int(ctx.device_typeid), ctypes.c_int(ctx.device_id), @@ -99,7 +100,7 @@ def _new_alloc_handle(stype, shape, ctx, delay_alloc, dtype, aux_types, aux_shap num_aux, c_array_buf(ctypes.c_int, native_array('i', aux_type_ids)), c_array_buf(mx_uint, native_array('I', aux_shape_lens)), - c_array_buf(mx_long, native_array('l', aux_shapes)), + c_array_buf(mx_long, native_array(get_array_typecode(), aux_shapes)), ctypes.byref(hdl))) return hdl diff --git a/python/mxnet/ndarray/utils.py b/python/mxnet/ndarray/utils.py index ff93d0be6d73..a1a6fca9b9eb 100644 --- a/python/mxnet/ndarray/utils.py +++ b/python/mxnet/ndarray/utils.py @@ -271,3 +271,9 @@ def save(fname, data): mx_uint(len(handles)), handles, keys)) + +def get_array_typecode(): + if sys.version_info.major > 2: + return 'q' + else: + return 'l' diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index 9a02eebdf3cc..d7f8f9f19099 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -45,6 +45,8 @@ from . import _internal from . 
import op from ._internal import SymbolBase, _set_symbol_class +from ..ndarray.utils import get_array_typecode + __all__ = ["Symbol", "var", "Variable", "Group", "load", "load_json", "pow", "maximum", "minimum", "hypot", "eye", "zeros", "ones", "full", "arange", @@ -1113,7 +1115,7 @@ def _infer_shape_impl(self, partial, *args, **kwargs): mx_uint(len(indptr) - 1), keys, c_array_buf(mx_uint, array('I', indptr)), - c_array_buf(mx_long, array('l', sdata)), + c_array_buf(mx_long, array(get_array_typecode(), sdata)), ctypes.byref(arg_shape_size), ctypes.byref(arg_shape_ndim), ctypes.byref(arg_shape_data), From 1f3361b351a06002c400261145dc419df21b1df1 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 19 Nov 2018 15:53:59 -0800 Subject: [PATCH 36/43] lint it --- .../src/main/native/org_apache_mxnet_native_c_api.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc index 71fcf540b5e2..d77c21224fd5 100644 --- a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc +++ b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc @@ -83,7 +83,7 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayCreateNone JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayCreateEx (JNIEnv *env, jobject obj, jintArray shape, jint ndim, jint devType, jint devId, jint delayAlloc, jint dtype, jobject ndArrayHandle) { - // TODO: this is a workaround to get scala unit test pass + // TODO(andrewfayres): this is a workaround to get scala unit test pass // need to update scala APIs to support large array const size_t length = env->GetArrayLength(shape); jint *shapeArr = env->GetIntArrayElements(shape, NULL); @@ -389,7 +389,7 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayGetShape JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArraySyncCopyToCPU (JNIEnv *env, jobject obj, jlong ndArrayPtr, jbyteArray data, jint size) { jbyte *pdata = env->GetByteArrayElements(data, NULL); - // TODO: this is a workaround to get scala unit test pass + // TODO(andrewfayres): this is a workaround to get scala unit test pass // need to update scala APIs to support large array int ret = MXNDArraySyncCopyToCPU(reinterpret_cast(ndArrayPtr), reinterpret_cast(pdata), static_cast(size)); @@ -427,7 +427,7 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayReshape JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArraySyncCopyFromCPU (JNIEnv *env, jobject obj, jlong arrayPtr, jfloatArray sourceArr, jint arrSize) { jfloat *sourcePtr = env->GetFloatArrayElements(sourceArr, NULL); - // TODO: this is a workaround to get scala unit test pass + // TODO(andrewfayres): this is a workaround to get scala unit test pass // need to update scala APIs to support large array int ret = MXNDArraySyncCopyFromCPU(reinterpret_cast(arrayPtr), static_cast(sourcePtr), @@ -1576,7 +1576,7 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape jint *argIndPtr = env->GetIntArrayElements(jargIndPtr, NULL); jint *argShapeData = env->GetIntArrayElements(jargShapeData, NULL); - // TODO: this is a workaround to get scala unit test pass + // TODO(andrewfayres): this is a workaround to get scala unit test pass // need to update scala APIs to support large array const size_t argShapeLength = env->GetArrayLength(jargShapeData); jlong *argShapeDataTmp = new jlong[argShapeLength]; From 
1cd9b88c1230257168cfe8da7dde7e19d1d765c3 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 19 Nov 2018 16:13:42 -0800 Subject: [PATCH 37/43] lint it again --- .../native/src/main/native/org_apache_mxnet_native_c_api.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc index d77c21224fd5..bbd161d6aa5b 100644 --- a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc +++ b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc @@ -430,7 +430,7 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArraySyncCopyFromCPU // TODO(andrewfayres): this is a workaround to get scala unit test pass // need to update scala APIs to support large array int ret = MXNDArraySyncCopyFromCPU(reinterpret_cast(arrayPtr), - static_cast(sourcePtr), + static_cast(sourcePtr), static_cast(arrSize)); env->ReleaseFloatArrayElements(sourceArr, sourcePtr, 0); return ret; From 335e896c277197ddb20464f85d362a2652276230 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 19 Nov 2018 16:49:27 -0800 Subject: [PATCH 38/43] fix python include error --- python/mxnet/ndarray/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/mxnet/ndarray/utils.py b/python/mxnet/ndarray/utils.py index a1a6fca9b9eb..ce5079a45ceb 100644 --- a/python/mxnet/ndarray/utils.py +++ b/python/mxnet/ndarray/utils.py @@ -18,6 +18,7 @@ # coding: utf-8 """Utility functions for NDArray and BaseSparseNDArray.""" import ctypes +import sys from ..base import _LIB, check_call, py_str, c_str, string_types, mx_uint, NDArrayHandle from ..base import c_array, c_handle_array, c_str_array From a08c79e03e07a676368eac1f13539da108b6dd04 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 19 Nov 2018 22:54:40 -0800 Subject: [PATCH 39/43] fix unit test --- python/mxnet/ndarray/ndarray.py | 11 +++++++++-- python/mxnet/ndarray/sparse.py | 2 +- python/mxnet/ndarray/utils.py | 9 +-------- python/mxnet/symbol/symbol.py | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 5cf69f12aead..28c1a352dda7 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -29,6 +29,7 @@ from array import array as native_array import ctypes +import sys import warnings import operator from functools import reduce # pylint: disable=redefined-builtin @@ -41,14 +42,13 @@ from . import _internal from . 
import op from ._internal import NDArrayBase -from .utils import get_array_typecode __all__ = ["NDArray", "concatenate", "_DTYPE_NP_TO_MX", "_DTYPE_MX_TO_NP", "_GRAD_REQ_MAP", "ones", "add", "arange", "eye", "divide", "equal", "full", "greater", "greater_equal", "imdecode", "lesser", "lesser_equal", "logical_and", "logical_or", "logical_xor", "maximum", "minimum", "moveaxis", "modulo", "multiply", "not_equal", "onehot_encode", "power", "subtract", "true_divide", "waitall", "_new_empty_handle", "histogram", - "to_dlpack_for_read", "to_dlpack_for_write", "from_dlpack"] + "to_dlpack_for_read", "to_dlpack_for_write", "from_dlpack", "get_array_typecode"] _STORAGE_TYPE_UNDEFINED = -1 _STORAGE_TYPE_DEFAULT = 0 @@ -4034,3 +4034,10 @@ def from_dlpack(dlpack): # delete the deleter of the old dlpack ctypes.pythonapi.PyCapsule_SetDestructor(dlpack, None) return NDArray(handle=handle) + + +def get_array_typecode(): + if sys.version_info.major > 2: + return 'q' + else: + return 'l' \ No newline at end of file diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py index 4cc3fece9fdc..5c7d4545fe34 100644 --- a/python/mxnet/ndarray/sparse.py +++ b/python/mxnet/ndarray/sparse.py @@ -56,7 +56,7 @@ from .ndarray import zeros as _zeros_ndarray from .ndarray import array as _array from .ndarray import _ufunc_helper -from .utils import get_array_typecode +from .ndarray import get_array_typecode try: diff --git a/python/mxnet/ndarray/utils.py b/python/mxnet/ndarray/utils.py index ce5079a45ceb..14e332fc117e 100644 --- a/python/mxnet/ndarray/utils.py +++ b/python/mxnet/ndarray/utils.py @@ -18,7 +18,6 @@ # coding: utf-8 """Utility functions for NDArray and BaseSparseNDArray.""" import ctypes -import sys from ..base import _LIB, check_call, py_str, c_str, string_types, mx_uint, NDArrayHandle from ..base import c_array, c_handle_array, c_str_array @@ -271,10 +270,4 @@ def save(fname, data): check_call(_LIB.MXNDArraySave(c_str(fname), mx_uint(len(handles)), handles, - keys)) - -def get_array_typecode(): - if sys.version_info.major > 2: - return 'q' - else: - return 'l' + keys)) \ No newline at end of file diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index d7f8f9f19099..abe5ae389d71 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -45,7 +45,7 @@ from . import _internal from . 
import op from ._internal import SymbolBase, _set_symbol_class -from ..ndarray.utils import get_array_typecode +from ..ndarray.ndarray import get_array_typecode __all__ = ["Symbol", "var", "Variable", "Group", "load", "load_json", From 69703fc37829ad44a052b82ddc7feb698fafcaf6 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 20 Nov 2018 09:12:13 -0800 Subject: [PATCH 40/43] lint me in --- python/mxnet/ndarray/ndarray.py | 2 +- python/mxnet/ndarray/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 28c1a352dda7..3f536c99eea4 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -4040,4 +4040,4 @@ def get_array_typecode(): if sys.version_info.major > 2: return 'q' else: - return 'l' \ No newline at end of file + return 'l' diff --git a/python/mxnet/ndarray/utils.py b/python/mxnet/ndarray/utils.py index 14e332fc117e..ff93d0be6d73 100644 --- a/python/mxnet/ndarray/utils.py +++ b/python/mxnet/ndarray/utils.py @@ -270,4 +270,4 @@ def save(fname, data): check_call(_LIB.MXNDArraySave(c_str(fname), mx_uint(len(handles)), handles, - keys)) \ No newline at end of file + keys)) From 01952c56cfc5fdfee272e6fda8fc78146c7fbf88 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 20 Nov 2018 14:02:20 -0800 Subject: [PATCH 41/43] fix python unit test in python2 windows --- python/mxnet/base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 3e06f47a8ea3..3582adc62002 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -215,7 +215,10 @@ def _load_lib(): # type definitions mx_uint = ctypes.c_uint mx_float = ctypes.c_float -mx_long = ctypes.c_longlong +if sys.version_info.major > 2: + mx_long = ctypes.c_longlong +else: + mx_long = ctypes.c_long mx_float_p = ctypes.POINTER(mx_float) mx_real_t = np.float32 NDArrayHandle = ctypes.c_void_p From a68bd971a31387e2d6b6c88f508768eae5f32aa3 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 20 Nov 2018 14:10:08 -0800 Subject: [PATCH 42/43] fix perl-package unit test --- perl-package/AI-MXNetCAPI/mxnet.i | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/perl-package/AI-MXNetCAPI/mxnet.i b/perl-package/AI-MXNetCAPI/mxnet.i index ba60fb30e8ed..7571f14e420a 100644 --- a/perl-package/AI-MXNetCAPI/mxnet.i +++ b/perl-package/AI-MXNetCAPI/mxnet.i @@ -1332,16 +1332,16 @@ int MXSymbolInferShapePartial(SymbolHandle sym, mx_uint num_args, const char** in, const mx_uint *in, - const mx_uint *in, + const dim_t *in, mx_uint *in_shape_size, const mx_uint **in_shape_ndim, - const mx_uint ***in_shape_data, + const dim_t ***in_shape_data, mx_uint *out_shape_size, const mx_uint **out_shape_ndim, - const mx_uint ***out_shape_data, + const dim_t ***out_shape_data, mx_uint *aux_shape_size, const mx_uint **aux_shape_ndim, - const mx_uint ***aux_shape_data, + const dim_t ***aux_shape_data, int *out); /*! 
@@ -1365,13 +1365,13 @@ int MXSymbolInferShapePartial(SymbolHandle sym, int MXSymbolInferType(SymbolHandle sym, mx_uint num_args, const char** in, - const dim_t *in, + const int *in, mx_uint *in_type_size, const dim_t **in_type_data, mx_uint *out_type_size, - const dim_t **out_type_data, + const int **out_type_data, mx_uint *aux_type_size, - const dim_t **aux_type_data, + const int **aux_type_data, int *out); //-------------------------------------------- // Part 4: Executor interface From c1b14d1837653b1e722ddde1df43d915156d8db6 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 20 Nov 2018 14:32:56 -0800 Subject: [PATCH 43/43] fix perl package --- perl-package/AI-MXNetCAPI/mxnet.i | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perl-package/AI-MXNetCAPI/mxnet.i b/perl-package/AI-MXNetCAPI/mxnet.i index 7571f14e420a..c900598700f7 100644 --- a/perl-package/AI-MXNetCAPI/mxnet.i +++ b/perl-package/AI-MXNetCAPI/mxnet.i @@ -1367,7 +1367,7 @@ int MXSymbolInferType(SymbolHandle sym, const char** in, const int *in, mx_uint *in_type_size, - const dim_t **in_type_data, + const int **in_type_data, mx_uint *out_type_size, const int **out_type_data, mx_uint *aux_type_size,
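
A closing note on the 32-/64-bit juggling that runs through the Python, Scala and Perl fixes above: C's long is not a portable 64-bit type. On LLP64 platforms (64-bit Windows) long stays 32 bits, while long long is at least 64 bits everywhere, which is why the ctypes definition of mx_long and the array typecode have to branch. A minimal standalone check of the underlying widths, independent of MXNet:

    // Print the type widths that the ctypes/typecode choices above depend on.
    #include <climits>
    #include <iostream>

    int main() {
      std::cout << "sizeof(long)      = " << sizeof(long) << "\n";       // 4 on 64-bit Windows (LLP64), 8 on LP64 Linux/macOS
      std::cout << "sizeof(long long) = " << sizeof(long long) << "\n";  // 8 on mainstream platforms
      static_assert(sizeof(long long) * CHAR_BIT >= 64,
                    "the standard guarantees long long is at least 64 bits");
      return 0;
    }

On an LP64 Linux or macOS build both lines print 8; on 64-bit Windows the first prints 4, which appears to be the mismatch the Python-2-specific branching above is accommodating.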