From 88c555a777db5035c8de4c6dfbdcfa27fd2e32ba Mon Sep 17 00:00:00 2001
From: Rohit Kumar Srivastava
Date: Sat, 27 Jun 2020 05:35:31 +0000
Subject: [PATCH 1/8] enable Large Tensor Support on master as default for
 Unix CPU/GPU and Windows CPU/GPU

---
 CMakeLists.txt | 14 ++++++++++++--
 ci/docker/runtime_functions.sh | 2 --
 config/darwin.cmake | 2 +-
 config/linux.cmake | 2 +-
 config/linux_gpu.cmake | 2 +-
 5 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 30839b45c339..cac025c26722 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,6 +36,18 @@ endif()
 
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Utils.cmake)
 
+if(DEFINED USE_INT64_TENSOR_SIZE AND NOT USE_INT64_TENSOR_SIZE OR CMAKE_SIZEOF_VOID_P EQUAL 4)
+  message(STATUS "Large Tensor disabled !!")
+  set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
+else()
+  message(STATUS "Large Tensor enabled !!")
+  set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
+  if(USE_BLAS STREQUAL "MKL")
+    # Enable MKL ILP64 support when Large Tensor enabled
+    set(MKL_USE_ILP64 ON CACHE BOOL "enable MKL ILP64 interface.")
+  endif()
+endif()
+
 include(CMakeDependentOption)
 #Some things have order. This must be put in front alone
 option(MXNET_BUILD_SHARED_LIBS "Build shared libraries instead of static libraries" ON)
@@ -88,8 +100,6 @@ option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." ON)
 option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF)
 option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF)
 cmake_dependent_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF "NOT MSVC" OFF)
-option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" OFF)
-option(BUILD_EXTENSION_PATH "Path to extension to build" "")
 option(BUILD_CYTHON_MODULES "Build cython modules." OFF)
 option(LOG_FATAL_THROW "Log exceptions but do not abort" ON)
 cmake_dependent_option(USE_SPLIT_ARCH_DLL "Build a separate DLL for each Cuda arch (Windows only)." ON "MSVC" OFF)
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index f5ee9ba6f0ab..4be3b7a0b4ef 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -694,7 +694,6 @@ build_ubuntu_cpu_large_tensor() {
         -DUSE_CUDA=OFF \
         -DUSE_CUDNN=OFF \
         -DUSE_MKLDNN=ON \
-        -DUSE_INT64_TENSOR_SIZE=ON \
         -G Ninja \
         /work/mxnet
 
@@ -714,7 +713,6 @@ build_ubuntu_gpu_large_tensor() {
         -DUSE_DIST_KVSTORE=ON \
         -DCMAKE_BUILD_TYPE=Release \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-        -DUSE_INT64_TENSOR_SIZE=ON \
         -G Ninja \
         /work/mxnet
 
diff --git a/config/darwin.cmake b/config/darwin.cmake
index 65e93efb7373..5a7899e018e7 100644
--- a/config/darwin.cmake
+++ b/config/darwin.cmake
@@ -122,7 +122,7 @@ set(USE_CPP_PACKAGE OFF CACHE BOOL "Build C++ Package")
 # This will cause performance degradation reported in issue #14496
 # Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647
 # Note: the size of each dimension is still bounded by INT32_MAX
-set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
+set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
 
 # Other GPU features
 set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
diff --git a/config/linux.cmake b/config/linux.cmake
index 8881402ede8e..55c1d0810d81 100644
--- a/config/linux.cmake
+++ b/config/linux.cmake
@@ -121,7 +121,7 @@ set(USE_CPP_PACKAGE OFF CACHE BOOL "Build C++ Package")
 # This will cause performance degradation reported in issue #14496
 # Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647
 # Note: the size of each dimension is still bounded by INT32_MAX
-set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
+set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
 
 # Other GPU features
 set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
diff --git a/config/linux_gpu.cmake b/config/linux_gpu.cmake
index 933857ce6739..cedcbac9c5f9 100644
--- a/config/linux_gpu.cmake
+++ b/config/linux_gpu.cmake
@@ -125,7 +125,7 @@ set(USE_CPP_PACKAGE OFF CACHE BOOL "Build C++ Package")
 # This will cause performance degradation reported in issue #14496
 # Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647
 # Note: the size of each dimension is still bounded by INT32_MAX
-set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
+set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
 
 # Other GPU features
 set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)

From 485dc645f2607d8c740e78762d4edc8d7b98e33d Mon Sep 17 00:00:00 2001
From: Rohit Kumar Srivastava
Date: Tue, 30 Jun 2020 07:27:35 +0000
Subject: [PATCH 2/8] fixing windows CI for Large Tensor

---
 src/operator/numpy/np_polynomial_op.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/operator/numpy/np_polynomial_op.cc b/src/operator/numpy/np_polynomial_op.cc
index 3fc94395946a..4018964263dd 100644
--- a/src/operator/numpy/np_polynomial_op.cc
+++ b/src/operator/numpy/np_polynomial_op.cc
@@ -53,7 +53,8 @@ struct polyval_backward_p {
     DType igrad_p = 0;
     index_t j = x_size - 1;
     while (j >= 0) {
-      igrad_p += pow(x_dptr[j], p_size - i - 1) * ograd_dptr[j];
+      igrad_p += pow(x_dptr[j], static_cast<DType>(p_size) -
+                     static_cast<DType>(i + 1)) * ograd_dptr[j];
       j--;
     }
     KERNEL_ASSIGN(igrad_p_dptr[i], req, igrad_p);

From 48d73286c1dab9c89143037124bf89afd43e881d Mon Sep 17 00:00:00 2001
From: Rohit Kumar Srivastava
Date: Fri, 3 Jul 2020 00:04:45 +0000
Subject: [PATCH 3/8] disable Large Tensor on miscellaneous builds. To be done
 in stage 2
---
 ci/docker/runtime_functions.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 4be3b7a0b4ef..c25738ac5b40 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -407,6 +407,7 @@ build_ubuntu_cpu_gcc8_werror() {
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_CPP_PACKAGE=ON \
         -DMXNET_USE_CPU=ON \
+        -DUSE_INT64_TENSOR_SIZE=OFF \
         -GNinja /work/mxnet
     ninja
 }
@@ -419,6 +420,7 @@ build_ubuntu_cpu_clang10_werror() {
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_CPP_PACKAGE=ON \
         -DMXNET_USE_CPU=ON \
+        -DUSE_INT64_TENSOR_SIZE=OFF \
         -GNinja /work/mxnet
     ninja
 }
@@ -438,6 +440,7 @@ build_ubuntu_gpu_clang10_werror() {
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_CPP_PACKAGE=OFF \
+        -DUSE_INT64_TENSOR_SIZE=OFF \
         -GNinja /work/mxnet
     ninja
 }

From 63690096b7d511417953f7e8e8994c11f5fb464e Mon Sep 17 00:00:00 2001
From: Rohit Kumar Srivastava
Date: Fri, 3 Jul 2020 04:14:25 +0000
Subject: [PATCH 4/8] revert disable

---
 ci/docker/runtime_functions.sh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index c25738ac5b40..4be3b7a0b4ef 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -407,7 +407,6 @@ build_ubuntu_cpu_gcc8_werror() {
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_CPP_PACKAGE=ON \
         -DMXNET_USE_CPU=ON \
-        -DUSE_INT64_TENSOR_SIZE=OFF \
         -GNinja /work/mxnet
     ninja
 }
@@ -420,7 +419,6 @@ build_ubuntu_cpu_clang10_werror() {
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_CPP_PACKAGE=ON \
         -DMXNET_USE_CPU=ON \
-        -DUSE_INT64_TENSOR_SIZE=OFF \
         -GNinja /work/mxnet
     ninja
 }
@@ -440,7 +438,6 @@ build_ubuntu_gpu_clang10_werror() {
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_CPP_PACKAGE=OFF \
-        -DUSE_INT64_TENSOR_SIZE=OFF \
         -GNinja /work/mxnet
     ninja
 }

From fcdee7172e2f35765bcd4d8038e5c7f9248fbdd3 Mon Sep 17 00:00:00 2001
From: Rohit Kumar Srivastava
Date: Fri, 3 Jul 2020 05:25:15 +0000
Subject: [PATCH 5/8] fixing lapack issues and mkl blas issues

---
 3rdparty/mshadow/mshadow/dot_engine-inl.h | 28 ++++++++++++++--------------
 CMakeLists.txt | 21 +++++++++------------
 src/operator/contrib/transformer.cc | 14 +++++++-------
 3 files changed, 30 insertions(+), 33 deletions(-)

diff --git a/3rdparty/mshadow/mshadow/dot_engine-inl.h b/3rdparty/mshadow/mshadow/dot_engine-inl.h
index 93273154b429..1adfdf600326 100644
--- a/3rdparty/mshadow/mshadow/dot_engine-inl.h
+++ b/3rdparty/mshadow/mshadow/dot_engine-inl.h
@@ -314,12 +314,12 @@ struct BLASEngine<cpu, float> {
 #if (MSHADOW_USE_MKL && INTEL_MKL_VERSION >= 20160000)
     // since same m/n/k is used for all single gemms, so we put all gemms into one group
     const int GROUP_SIZE = 1;
-    MKL_INT p_m[GROUP_SIZE] = {m};
-    MKL_INT p_n[GROUP_SIZE] = {n};
-    MKL_INT p_k[GROUP_SIZE] = {k};
-    MKL_INT p_lda[GROUP_SIZE] = {lda};
-    MKL_INT p_ldb[GROUP_SIZE] = {ldb};
-    MKL_INT p_ldc[GROUP_SIZE] = {ldc};
+    MKL_INT p_m[GROUP_SIZE] = {static_cast<MKL_INT>(m)};
+    MKL_INT p_n[GROUP_SIZE] = {static_cast<MKL_INT>(n)};
+    MKL_INT p_k[GROUP_SIZE] = {static_cast<MKL_INT>(k)};
+    MKL_INT p_lda[GROUP_SIZE] = {static_cast<MKL_INT>(lda)};
+    MKL_INT p_ldb[GROUP_SIZE] = {static_cast<MKL_INT>(ldb)};
+    MKL_INT p_ldc[GROUP_SIZE] = {static_cast<MKL_INT>(ldc)};
 
     float p_alpha[GROUP_SIZE] = {alpha};
     float p_beta[GROUP_SIZE] = {beta};
@@ -327,7 +327,7 @@ struct BLASEngine<cpu, float> {
     CBLAS_TRANSPOSE cblas_a_trans = GetT(transa);
     CBLAS_TRANSPOSE cblas_b_trans = GetT(transb);
 
-    MKL_INT p_group_sizeb[GROUP_SIZE] = {batch_count};
+    MKL_INT p_group_sizeb[GROUP_SIZE] = {static_cast<MKL_INT>(batch_count)};
     CBLAS_TRANSPOSE p_transa[GROUP_SIZE] = {cblas_a_trans};
     CBLAS_TRANSPOSE p_transb[GROUP_SIZE] = {cblas_b_trans};
@@ -423,12 +423,12 @@ struct BLASEngine<cpu, double> {
 #if (MSHADOW_USE_MKL && INTEL_MKL_VERSION >= 20160000)
     // since same m/n/k is used for all single gemms, so we put all gemms into one group
     const int GROUP_SIZE = 1;
-    MKL_INT p_m[GROUP_SIZE] = {m};
-    MKL_INT p_n[GROUP_SIZE] = {n};
-    MKL_INT p_k[GROUP_SIZE] = {k};
-    MKL_INT p_lda[GROUP_SIZE] = {lda};
-    MKL_INT p_ldb[GROUP_SIZE] = {ldb};
-    MKL_INT p_ldc[GROUP_SIZE] = {ldc};
+    MKL_INT p_m[GROUP_SIZE] = {static_cast<MKL_INT>(m)};
+    MKL_INT p_n[GROUP_SIZE] = {static_cast<MKL_INT>(n)};
+    MKL_INT p_k[GROUP_SIZE] = {static_cast<MKL_INT>(k)};
+    MKL_INT p_lda[GROUP_SIZE] = {static_cast<MKL_INT>(lda)};
+    MKL_INT p_ldb[GROUP_SIZE] = {static_cast<MKL_INT>(ldb)};
+    MKL_INT p_ldc[GROUP_SIZE] = {static_cast<MKL_INT>(ldc)};
 
     double p_alpha[GROUP_SIZE] = {alpha};
     double p_beta[GROUP_SIZE] = {beta};
@@ -436,7 +436,7 @@ struct BLASEngine<cpu, double> {
     CBLAS_TRANSPOSE cblas_a_trans = GetT(transa);
     CBLAS_TRANSPOSE cblas_b_trans = GetT(transb);
 
-    MKL_INT p_group_sizeb[GROUP_SIZE] = {batch_count};
+    MKL_INT p_group_sizeb[GROUP_SIZE] = {static_cast<MKL_INT>(batch_count)};
     CBLAS_TRANSPOSE p_transa[GROUP_SIZE] = {cblas_a_trans};
     CBLAS_TRANSPOSE p_transb[GROUP_SIZE] = {cblas_b_trans};
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cac025c26722..69858b658d2d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,18 +36,6 @@ endif()
 
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Utils.cmake)
 
-if(DEFINED USE_INT64_TENSOR_SIZE AND NOT USE_INT64_TENSOR_SIZE OR CMAKE_SIZEOF_VOID_P EQUAL 4)
-  message(STATUS "Large Tensor disabled !!")
-  set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
-else()
-  message(STATUS "Large Tensor enabled !!")
-  set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
-  if(USE_BLAS STREQUAL "MKL")
-    # Enable MKL ILP64 support when Large Tensor enabled
-    set(MKL_USE_ILP64 ON CACHE BOOL "enable MKL ILP64 interface.")
-  endif()
-endif()
-
 include(CMakeDependentOption)
 #Some things have order. This must be put in front alone
 option(MXNET_BUILD_SHARED_LIBS "Build shared libraries instead of static libraries" ON)
@@ -316,6 +304,14 @@ endif()
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
 
+if(DEFINED USE_INT64_TENSOR_SIZE AND NOT USE_INT64_TENSOR_SIZE OR CMAKE_SIZEOF_VOID_P EQUAL 4)
+  message(STATUS "Large Tensor disabled")
+  set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
+else()
+  message(STATUS "Large Tensor enabled")
+  set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
+endif()
+
 include(cmake/ChooseBlas.cmake)
 
 if(USE_ASAN)
@@ -994,3 +990,4 @@ if(BUILD_CYTHON_MODULES)
     message(FATAL_ERROR "No python interpreter found to build cython modules")
   endif()
 endif()
+
diff --git a/src/operator/contrib/transformer.cc b/src/operator/contrib/transformer.cc
index 43c322e9ca21..e85e1d22b6b0 100644
--- a/src/operator/contrib/transformer.cc
+++ b/src/operator/contrib/transformer.cc
@@ -140,12 +140,12 @@ void strided_batch_sgemm(bool transA, bool transB,
 #if (MSHADOW_USE_MKL && INTEL_MKL_VERSION >= 20160000)
 
   const int GROUP_SIZE = 1;
-  MKL_INT p_m[GROUP_SIZE] = {m};
-  MKL_INT p_n[GROUP_SIZE] = {n};
-  MKL_INT p_k[GROUP_SIZE] = {k};
-  MKL_INT p_lda[GROUP_SIZE] = {lda};
-  MKL_INT p_ldb[GROUP_SIZE] = {ldb};
-  MKL_INT p_ldc[GROUP_SIZE] = {ldc};
+  MKL_INT p_m[GROUP_SIZE] = {static_cast<MKL_INT>(m)};
+  MKL_INT p_n[GROUP_SIZE] = {static_cast<MKL_INT>(n)};
+  MKL_INT p_k[GROUP_SIZE] = {static_cast<MKL_INT>(k)};
+  MKL_INT p_lda[GROUP_SIZE] = {static_cast<MKL_INT>(lda)};
+  MKL_INT p_ldb[GROUP_SIZE] = {static_cast<MKL_INT>(ldb)};
+  MKL_INT p_ldc[GROUP_SIZE] = {static_cast<MKL_INT>(ldc)};
 
   float p_alpha[GROUP_SIZE] = {alpha};
   float p_beta[GROUP_SIZE] = {beta};
@@ -153,7 +153,7 @@ void strided_batch_sgemm(bool transA, bool transB,
   CBLAS_TRANSPOSE cblas_a_trans = transA ? CblasTrans : CblasNoTrans;
   CBLAS_TRANSPOSE cblas_b_trans = transB ? CblasTrans : CblasNoTrans;
 
-  MKL_INT p_group_sizeb[GROUP_SIZE] = {batchCount};
+  MKL_INT p_group_sizeb[GROUP_SIZE] = {static_cast<MKL_INT>(batchCount)};
   CBLAS_TRANSPOSE p_transa[GROUP_SIZE] = {cblas_a_trans};
   CBLAS_TRANSPOSE p_transb[GROUP_SIZE] = {cblas_b_trans};

From 9e0796db0c1fecd505b9266e04a237c6cff0b642 Mon Sep 17 00:00:00 2001
From: Rohit Kumar Srivastava
Date: Tue, 10 Nov 2020 21:52:55 +0000
Subject: [PATCH 6/8] make CentOS builds int32 only

---
 ci/docker/runtime_functions.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 4be3b7a0b4ef..7fdcff16be48 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -267,7 +267,7 @@ build_centos7_cpu() {
         -DUSE_MKLDNN=OFF \
         -DUSE_DIST_KVSTORE=ON \
         -DUSE_CUDA=OFF \
-        -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \
+        -DUSE_INT64_TENSOR_SIZE=OFF \
         -G Ninja /work/mxnet
     ninja
 }
@@ -282,6 +282,7 @@ build_centos7_mkldnn() {
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_MKLDNN=ON \
         -DUSE_CUDA=OFF \
+        -DUSE_INT64_TENSOR_SIZE=OFF \
         -G Ninja /work/mxnet
     ninja
 }
@@ -299,7 +300,7 @@ build_centos7_gpu() {
         -DUSE_CUDA=ON \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
         -DUSE_DIST_KVSTORE=ON\
-        -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \
+        -DUSE_INT64_TENSOR_SIZE=OFF \
         -G Ninja /work/mxnet
     ninja
 }

From 5cdf8d87c634e4e036a1bd44bffc794c22d94a15 Mon Sep 17 00:00:00 2001
From: Rohit Kumar Srivastava
Date: Mon, 16 Nov 2020 18:58:23 +0000
Subject: [PATCH 7/8] cleanup after rebase

---
 CMakeLists.txt | 9 ++-------
 ci/docker/runtime_functions.sh | 4 +++-
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 69858b658d2d..2413c5679e95 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -88,6 +88,7 @@ option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." ON)
 option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF)
 option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF)
 cmake_dependent_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF "NOT MSVC" OFF)
+option(BUILD_EXTENSION_PATH "Path to extension to build" "")
 option(BUILD_CYTHON_MODULES "Build cython modules." OFF)
 option(LOG_FATAL_THROW "Log exceptions but do not abort" ON)
 cmake_dependent_option(USE_SPLIT_ARCH_DLL "Build a separate DLL for each Cuda arch (Windows only)." ON "MSVC" OFF)
@@ -304,13 +305,7 @@ endif()
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
 
-if(DEFINED USE_INT64_TENSOR_SIZE AND NOT USE_INT64_TENSOR_SIZE OR CMAKE_SIZEOF_VOID_P EQUAL 4)
-  message(STATUS "Large Tensor disabled")
-  set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
-else()
-  message(STATUS "Large Tensor enabled")
-  set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
-endif()
+cmake_dependent_option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" ON "CMAKE_SIZEOF_VOID_P EQUAL 8" OFF)
 
 include(cmake/ChooseBlas.cmake)
 
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 7fdcff16be48..08f10dbca90d 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -267,6 +267,7 @@ build_centos7_cpu() {
         -DUSE_MKLDNN=OFF \
         -DUSE_DIST_KVSTORE=ON \
         -DUSE_CUDA=OFF \
+        -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \
         -DUSE_INT64_TENSOR_SIZE=OFF \
         -G Ninja /work/mxnet
     ninja
@@ -299,7 +300,8 @@ build_centos7_gpu() {
         -DUSE_MKLDNN=ON \
         -DUSE_CUDA=ON \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-        -DUSE_DIST_KVSTORE=ON\
+        -DUSE_DIST_KVSTORE=ON \
+        -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \
         -DUSE_INT64_TENSOR_SIZE=OFF \
         -G Ninja /work/mxnet
     ninja

From f4a025bfbc6f550c039daa82f5ac6d88b06fe1dd Mon Sep 17 00:00:00 2001
From: Rohit Kumar Srivastava
Date: Tue, 17 Nov 2020 01:57:17 +0000
Subject: [PATCH 8/8] fixing insert-op for win-GPU CI

---
 src/operator/numpy/np_insert_op_slice-inl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/operator/numpy/np_insert_op_slice-inl.h b/src/operator/numpy/np_insert_op_slice-inl.h
index 8fd8e0bfc9e5..fd32d9bc7bc1 100644
--- a/src/operator/numpy/np_insert_op_slice-inl.h
+++ b/src/operator/numpy/np_insert_op_slice-inl.h
@@ -154,7 +154,7 @@ void NumpyInsertSliceCompute(const nnvm::NodeAttrs& attrs,
       CHECK((values.shape_[i] == 1) || (values.shape_[i] == sz));
     }
     size_t temp_storage_bytes, temp_mem_size;
-    temp_storage_bytes = SortByKeyWorkspaceSize(indices_len, false, true);
+    temp_storage_bytes = SortByKeyWorkspaceSize(indices_len, false, true);
     temp_mem_size = indices_len * sizeof(int64_t) * 2 +
                     indices_len * sizeof(index_t) +
                     outshape[axis] * sizeof(index_t) * 2 +