From 88c555a777db5035c8de4c6dfbdcfa27fd2e32ba Mon Sep 17 00:00:00 2001
From: Rohit Kumar Srivastava
Date: Sat, 27 Jun 2020 05:35:31 +0000
Subject: [PATCH 1/8] enable Large Tensor Support on master as default for
 Unix CPU/GPU and Windows CPU/GPU

---
 CMakeLists.txt | 14 ++++++++++++--
 ci/docker/runtime_functions.sh | 2 --
 config/darwin.cmake | 2 +-
 config/linux.cmake | 2 +-
 config/linux_gpu.cmake | 2 +-
 5 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 30839b45c339..cac025c26722 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,6 +36,18 @@ endif()
 
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Utils.cmake)
 
+if(DEFINED USE_INT64_TENSOR_SIZE AND NOT USE_INT64_TENSOR_SIZE OR CMAKE_SIZEOF_VOID_P EQUAL 4)
+  message(STATUS "Large Tensor disabled !!")
+  set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
+else()
+  message(STATUS "Large Tensor enabled !!")
+  set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
+  if(USE_BLAS STREQUAL "MKL")
+    # Enable MKL ILP64 support when Large Tensor enabled
+    set(MKL_USE_ILP64 ON CACHE BOOL "enable MKL ILP64 interface.")
+  endif()
+endif()
+
 include(CMakeDependentOption)
 #Some things have order. This must be put in front alone
 option(MXNET_BUILD_SHARED_LIBS "Build shared libraries instead of static libraries" ON)
@@ -88,8 +100,6 @@ option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." ON)
 option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF)
 option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF)
 cmake_dependent_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF "NOT MSVC" OFF)
-option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" OFF)
-option(BUILD_EXTENSION_PATH "Path to extension to build" "")
 option(BUILD_CYTHON_MODULES "Build cython modules." OFF)
 option(LOG_FATAL_THROW "Log exceptions but do not abort" ON)
 cmake_dependent_option(USE_SPLIT_ARCH_DLL "Build a separate DLL for each Cuda arch (Windows only)." ON "MSVC" OFF)
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index f5ee9ba6f0ab..4be3b7a0b4ef 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -694,7 +694,6 @@ build_ubuntu_cpu_large_tensor() {
         -DUSE_CUDA=OFF \
         -DUSE_CUDNN=OFF \
         -DUSE_MKLDNN=ON \
-        -DUSE_INT64_TENSOR_SIZE=ON \
         -G Ninja \
         /work/mxnet
 
@@ -714,7 +713,6 @@ build_ubuntu_gpu_large_tensor() {
         -DUSE_DIST_KVSTORE=ON \
         -DCMAKE_BUILD_TYPE=Release \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-        -DUSE_INT64_TENSOR_SIZE=ON \
         -G Ninja \
         /work/mxnet
 
diff --git a/config/darwin.cmake b/config/darwin.cmake
index 65e93efb7373..5a7899e018e7 100644
--- a/config/darwin.cmake
+++ b/config/darwin.cmake
@@ -122,7 +122,7 @@ set(USE_CPP_PACKAGE OFF CACHE BOOL "Build C++ Package")
 # This will cause performance degradation reported in issue #14496
 # Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647
 # Note: the size of each dimension is still bounded by INT32_MAX
-set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
+set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
 
 # Other GPU features
 set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
diff --git a/config/linux.cmake b/config/linux.cmake
index 8881402ede8e..55c1d0810d81 100644
--- a/config/linux.cmake
+++ b/config/linux.cmake
@@ -121,7 +121,7 @@ set(USE_CPP_PACKAGE OFF CACHE BOOL "Build C++ Package")
 # This will cause performance degradation reported in issue #14496
 # Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647
 # Note: the size of each dimension is still bounded by INT32_MAX
-set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
+set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
 
 # Other GPU features
 set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
diff --git a/config/linux_gpu.cmake b/config/linux_gpu.cmake
index 933857ce6739..cedcbac9c5f9 100644
--- a/config/linux_gpu.cmake
+++ b/config/linux_gpu.cmake
@@ -125,7 +125,7 @@ set(USE_CPP_PACKAGE OFF CACHE BOOL "Build C++ Package")
 # This will cause performance degradation reported in issue #14496
 # Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647
 # Note: the size of each dimension is still bounded by INT32_MAX
-set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
+set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
 
 # Other GPU features
 set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)

From 485dc645f2607d8c740e78762d4edc8d7b98e33d Mon Sep 17 00:00:00 2001
From: Rohit Kumar Srivastava
Date: Tue, 30 Jun 2020 07:27:35 +0000
Subject: [PATCH 2/8] fixing windows CI for Large Tensor

---
 src/operator/numpy/np_polynomial_op.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/operator/numpy/np_polynomial_op.cc b/src/operator/numpy/np_polynomial_op.cc
index 3fc94395946a..4018964263dd 100644
--- a/src/operator/numpy/np_polynomial_op.cc
+++ b/src/operator/numpy/np_polynomial_op.cc
@@ -53,7 +53,8 @@ struct polyval_backward_p {
     DType igrad_p = 0;
     index_t j = x_size - 1;
     while (j >= 0) {
-      igrad_p += pow(x_dptr[j], p_size - i - 1) * ograd_dptr[j];
+      igrad_p += pow(x_dptr[j], static_cast<DType>(p_size) -
+                     static_cast<DType>(i + 1)) * ograd_dptr[j];
       j--;
     }
     KERNEL_ASSIGN(igrad_p_dptr[i], req, igrad_p);

From 48d73286c1dab9c89143037124bf89afd43e881d Mon Sep 17 00:00:00 2001
From: Rohit Kumar Srivastava
Date: Fri, 3 Jul 2020 00:04:45 +0000
Subject: [PATCH 3/8] disable Large Tensor on miscellaneous builds. To be done
 in stage 2
---
 ci/docker/runtime_functions.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 4be3b7a0b4ef..c25738ac5b40 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -407,6 +407,7 @@ build_ubuntu_cpu_gcc8_werror() {
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_CPP_PACKAGE=ON \
         -DMXNET_USE_CPU=ON \
+        -DUSE_INT64_TENSOR_SIZE=OFF \
         -GNinja /work/mxnet
     ninja
 }
@@ -419,6 +420,7 @@ build_ubuntu_cpu_clang10_werror() {
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_CPP_PACKAGE=ON \
         -DMXNET_USE_CPU=ON \
+        -DUSE_INT64_TENSOR_SIZE=OFF \
         -GNinja /work/mxnet
     ninja
 }
@@ -438,6 +440,7 @@ build_ubuntu_gpu_clang10_werror() {
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_CPP_PACKAGE=OFF \
+        -DUSE_INT64_TENSOR_SIZE=OFF \
         -GNinja /work/mxnet
     ninja
 }

From 63690096b7d511417953f7e8e8994c11f5fb464e Mon Sep 17 00:00:00 2001
From: Rohit Kumar Srivastava
Date: Fri, 3 Jul 2020 04:14:25 +0000
Subject: [PATCH 4/8] revert disable

---
 ci/docker/runtime_functions.sh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index c25738ac5b40..4be3b7a0b4ef 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -407,7 +407,6 @@ build_ubuntu_cpu_gcc8_werror() {
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_CPP_PACKAGE=ON \
         -DMXNET_USE_CPU=ON \
-        -DUSE_INT64_TENSOR_SIZE=OFF \
         -GNinja /work/mxnet
     ninja
 }
@@ -420,7 +419,6 @@ build_ubuntu_cpu_clang10_werror() {
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_CPP_PACKAGE=ON \
         -DMXNET_USE_CPU=ON \
-        -DUSE_INT64_TENSOR_SIZE=OFF \
         -GNinja /work/mxnet
     ninja
 }
@@ -440,7 +438,6 @@ build_ubuntu_gpu_clang10_werror() {
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_CPP_PACKAGE=OFF \
-        -DUSE_INT64_TENSOR_SIZE=OFF \
         -GNinja /work/mxnet
     ninja
 }

From fcdee7172e2f35765bcd4d8038e5c7f9248fbdd3 Mon Sep 17 00:00:00 2001
From: Rohit Kumar Srivastava
Date: Fri, 3 Jul 2020 05:25:15 +0000
Subject: [PATCH 5/8] fixing lapack issues and mkl blas issues

---
 3rdparty/mshadow/mshadow/dot_engine-inl.h | 28 ++++++++++++++--------------
 CMakeLists.txt | 21 +++++++++------------
 src/operator/contrib/transformer.cc | 14 +++++++-------
 3 files changed, 30 insertions(+), 33 deletions(-)

diff --git a/3rdparty/mshadow/mshadow/dot_engine-inl.h b/3rdparty/mshadow/mshadow/dot_engine-inl.h
index 93273154b429..1adfdf600326 100644
--- a/3rdparty/mshadow/mshadow/dot_engine-inl.h
+++ b/3rdparty/mshadow/mshadow/dot_engine-inl.h
@@ -314,12 +314,12 @@ struct BLASEngine<cpu, float> {
 #if (MSHADOW_USE_MKL && INTEL_MKL_VERSION >= 20160000)
     // since same m/n/k is used for all single gemms, so we put all gemms into one group
     const int GROUP_SIZE = 1;
-    MKL_INT p_m[GROUP_SIZE] = {m};
-    MKL_INT p_n[GROUP_SIZE] = {n};
-    MKL_INT p_k[GROUP_SIZE] = {k};
-    MKL_INT p_lda[GROUP_SIZE] = {lda};
-    MKL_INT p_ldb[GROUP_SIZE] = {ldb};
-    MKL_INT p_ldc[GROUP_SIZE] = {ldc};
+    MKL_INT p_m[GROUP_SIZE] = {static_cast<MKL_INT>(m)};
+    MKL_INT p_n[GROUP_SIZE] = {static_cast<MKL_INT>(n)};
+    MKL_INT p_k[GROUP_SIZE] = {static_cast<MKL_INT>(k)};
+    MKL_INT p_lda[GROUP_SIZE] = {static_cast<MKL_INT>(lda)};
+    MKL_INT p_ldb[GROUP_SIZE] = {static_cast<MKL_INT>(ldb)};
+    MKL_INT p_ldc[GROUP_SIZE] = {static_cast<MKL_INT>(ldc)};
 
     float p_alpha[GROUP_SIZE] = {alpha};
     float p_beta[GROUP_SIZE] = {beta};
@@ -327,7 +327,7 @@ struct BLASEngine<cpu, float> {
     CBLAS_TRANSPOSE cblas_a_trans = GetT(transa);
     CBLAS_TRANSPOSE cblas_b_trans = GetT(transb);
 
-    MKL_INT p_group_sizeb[GROUP_SIZE] = {batch_count};
+    MKL_INT p_group_sizeb[GROUP_SIZE] = {static_cast<MKL_INT>(batch_count)};
     CBLAS_TRANSPOSE p_transa[GROUP_SIZE] = {cblas_a_trans};
     CBLAS_TRANSPOSE p_transb[GROUP_SIZE] = {cblas_b_trans};
@@ -423,12 +423,12 @@ struct BLASEngine<cpu, double> {
 #if (MSHADOW_USE_MKL && INTEL_MKL_VERSION >= 20160000)
     // since same m/n/k is used for all single gemms, so we put all gemms into one group
     const int GROUP_SIZE = 1;
-    MKL_INT p_m[GROUP_SIZE] = {m};
-    MKL_INT p_n[GROUP_SIZE] = {n};
-    MKL_INT p_k[GROUP_SIZE] = {k};
-    MKL_INT p_lda[GROUP_SIZE] = {lda};
-    MKL_INT p_ldb[GROUP_SIZE] = {ldb};
-    MKL_INT p_ldc[GROUP_SIZE] = {ldc};
+    MKL_INT p_m[GROUP_SIZE] = {static_cast<MKL_INT>(m)};
+    MKL_INT p_n[GROUP_SIZE] = {static_cast<MKL_INT>(n)};
+    MKL_INT p_k[GROUP_SIZE] = {static_cast<MKL_INT>(k)};
+    MKL_INT p_lda[GROUP_SIZE] = {static_cast<MKL_INT>(lda)};
+    MKL_INT p_ldb[GROUP_SIZE] = {static_cast<MKL_INT>(ldb)};
+    MKL_INT p_ldc[GROUP_SIZE] = {static_cast<MKL_INT>(ldc)};
 
     double p_alpha[GROUP_SIZE] = {alpha};
     double p_beta[GROUP_SIZE] = {beta};
@@ -436,7 +436,7 @@ struct BLASEngine<cpu, double> {
     CBLAS_TRANSPOSE cblas_a_trans = GetT(transa);
     CBLAS_TRANSPOSE cblas_b_trans = GetT(transb);
 
-    MKL_INT p_group_sizeb[GROUP_SIZE] = {batch_count};
+    MKL_INT p_group_sizeb[GROUP_SIZE] = {static_cast<MKL_INT>(batch_count)};
     CBLAS_TRANSPOSE p_transa[GROUP_SIZE] = {cblas_a_trans};
     CBLAS_TRANSPOSE p_transb[GROUP_SIZE] = {cblas_b_trans};
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cac025c26722..69858b658d2d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,18 +36,6 @@ endif()
 
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Utils.cmake)
 
-if(DEFINED USE_INT64_TENSOR_SIZE AND NOT USE_INT64_TENSOR_SIZE OR CMAKE_SIZEOF_VOID_P EQUAL 4)
-  message(STATUS "Large Tensor disabled !!")
-  set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
-else()
-  message(STATUS "Large Tensor enabled !!")
-  set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
-  if(USE_BLAS STREQUAL "MKL")
-    # Enable MKL ILP64 support when Large Tensor enabled
-    set(MKL_USE_ILP64 ON CACHE BOOL "enable MKL ILP64 interface.")
-  endif()
-endif()
-
 include(CMakeDependentOption)
 #Some things have order. This must be put in front alone
 option(MXNET_BUILD_SHARED_LIBS "Build shared libraries instead of static libraries" ON)
@@ -316,6 +304,14 @@ endif()
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
 
+if(DEFINED USE_INT64_TENSOR_SIZE AND NOT USE_INT64_TENSOR_SIZE OR CMAKE_SIZEOF_VOID_P EQUAL 4)
+  message(STATUS "Large Tensor disabled")
+  set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
+else()
+  message(STATUS "Large Tensor enabled")
+  set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
+endif()
+
 include(cmake/ChooseBlas.cmake)
 
 if(USE_ASAN)
@@ -994,3 +990,4 @@ if(BUILD_CYTHON_MODULES)
     message(FATAL_ERROR "No python interpreter found to build cython modules")
   endif()
 endif()
+
diff --git a/src/operator/contrib/transformer.cc b/src/operator/contrib/transformer.cc
index 43c322e9ca21..e85e1d22b6b0 100644
--- a/src/operator/contrib/transformer.cc
+++ b/src/operator/contrib/transformer.cc
@@ -140,12 +140,12 @@ void strided_batch_sgemm(bool transA, bool transB,
 #if (MSHADOW_USE_MKL && INTEL_MKL_VERSION >= 20160000)
 
   const int GROUP_SIZE = 1;
-  MKL_INT p_m[GROUP_SIZE] = {m};
-  MKL_INT p_n[GROUP_SIZE] = {n};
-  MKL_INT p_k[GROUP_SIZE] = {k};
-  MKL_INT p_lda[GROUP_SIZE] = {lda};
-  MKL_INT p_ldb[GROUP_SIZE] = {ldb};
-  MKL_INT p_ldc[GROUP_SIZE] = {ldc};
+  MKL_INT p_m[GROUP_SIZE] = {static_cast<MKL_INT>(m)};
+  MKL_INT p_n[GROUP_SIZE] = {static_cast<MKL_INT>(n)};
+  MKL_INT p_k[GROUP_SIZE] = {static_cast<MKL_INT>(k)};
+  MKL_INT p_lda[GROUP_SIZE] = {static_cast<MKL_INT>(lda)};
+  MKL_INT p_ldb[GROUP_SIZE] = {static_cast<MKL_INT>(ldb)};
+  MKL_INT p_ldc[GROUP_SIZE] = {static_cast<MKL_INT>(ldc)};
 
   float p_alpha[GROUP_SIZE] = {alpha};
   float p_beta[GROUP_SIZE] = {beta};
@@ -153,7 +153,7 @@ void strided_batch_sgemm(bool transA, bool transB,
   CBLAS_TRANSPOSE cblas_a_trans = transA ? CblasTrans : CblasNoTrans;
   CBLAS_TRANSPOSE cblas_b_trans = transB ? CblasTrans : CblasNoTrans;
 
-  MKL_INT p_group_sizeb[GROUP_SIZE] = {batchCount};
+  MKL_INT p_group_sizeb[GROUP_SIZE] = {static_cast<MKL_INT>(batchCount)};
   CBLAS_TRANSPOSE p_transa[GROUP_SIZE] = {cblas_a_trans};
   CBLAS_TRANSPOSE p_transb[GROUP_SIZE] = {cblas_b_trans};

From 9e0796db0c1fecd505b9266e04a237c6cff0b642 Mon Sep 17 00:00:00 2001
From: Rohit Kumar Srivastava
Date: Tue, 10 Nov 2020 21:52:55 +0000
Subject: [PATCH 6/8] make CentOS builds int32 only

---
 ci/docker/runtime_functions.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 4be3b7a0b4ef..7fdcff16be48 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -267,7 +267,7 @@ build_centos7_cpu() {
         -DUSE_MKLDNN=OFF \
         -DUSE_DIST_KVSTORE=ON \
         -DUSE_CUDA=OFF \
-        -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \
+        -DUSE_INT64_TENSOR_SIZE=OFF \
         -G Ninja /work/mxnet
     ninja
 }
@@ -282,6 +282,7 @@ build_centos7_mkldnn() {
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_MKLDNN=ON \
         -DUSE_CUDA=OFF \
+        -DUSE_INT64_TENSOR_SIZE=OFF \
         -G Ninja /work/mxnet
     ninja
 }
@@ -299,7 +300,7 @@ build_centos7_gpu() {
         -DUSE_CUDA=ON \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
         -DUSE_DIST_KVSTORE=ON\
-        -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \
+        -DUSE_INT64_TENSOR_SIZE=OFF \
         -G Ninja /work/mxnet
     ninja
 }

From 5cdf8d87c634e4e036a1bd44bffc794c22d94a15 Mon Sep 17 00:00:00 2001
From: Rohit Kumar Srivastava
Date: Mon, 16 Nov 2020 18:58:23 +0000
Subject: [PATCH 7/8] cleanup after rebase

---
 CMakeLists.txt | 9 ++-------
 ci/docker/runtime_functions.sh | 4 +++-
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 69858b658d2d..2413c5679e95 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -88,6 +88,7 @@ option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." ON)
 option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF)
 option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF)
 cmake_dependent_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF "NOT MSVC" OFF)
+option(BUILD_EXTENSION_PATH "Path to extension to build" "")
 option(BUILD_CYTHON_MODULES "Build cython modules." OFF)
 option(LOG_FATAL_THROW "Log exceptions but do not abort" ON)
 cmake_dependent_option(USE_SPLIT_ARCH_DLL "Build a separate DLL for each Cuda arch (Windows only)." ON "MSVC" OFF)
@@ -304,13 +305,7 @@ endif()
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
 
-if(DEFINED USE_INT64_TENSOR_SIZE AND NOT USE_INT64_TENSOR_SIZE OR CMAKE_SIZEOF_VOID_P EQUAL 4)
-  message(STATUS "Large Tensor disabled")
-  set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
-else()
-  message(STATUS "Large Tensor enabled")
-  set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
-endif()
+cmake_dependent_option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" ON "CMAKE_SIZEOF_VOID_P EQUAL 8" OFF)
 
 include(cmake/ChooseBlas.cmake)
 
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 7fdcff16be48..08f10dbca90d 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -267,6 +267,7 @@ build_centos7_cpu() {
         -DUSE_MKLDNN=OFF \
         -DUSE_DIST_KVSTORE=ON \
         -DUSE_CUDA=OFF \
+        -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \
         -DUSE_INT64_TENSOR_SIZE=OFF \
         -G Ninja /work/mxnet
     ninja
@@ -299,7 +300,8 @@ build_centos7_gpu() {
         -DUSE_MKLDNN=ON \
         -DUSE_CUDA=ON \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-        -DUSE_DIST_KVSTORE=ON\
+        -DUSE_DIST_KVSTORE=ON \
+        -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \
         -DUSE_INT64_TENSOR_SIZE=OFF \
         -G Ninja /work/mxnet
     ninja

From f4a025bfbc6f550c039daa82f5ac6d88b06fe1dd Mon Sep 17 00:00:00 2001
From: Rohit Kumar Srivastava
Date: Tue, 17 Nov 2020 01:57:17 +0000
Subject: [PATCH 8/8] fixing insert-op for win-GPU CI

---
 src/operator/numpy/np_insert_op_slice-inl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/operator/numpy/np_insert_op_slice-inl.h b/src/operator/numpy/np_insert_op_slice-inl.h
index 8fd8e0bfc9e5..fd32d9bc7bc1 100644
--- a/src/operator/numpy/np_insert_op_slice-inl.h
+++ b/src/operator/numpy/np_insert_op_slice-inl.h
@@ -154,7 +154,7 @@ void NumpyInsertSliceCompute(const nnvm::NodeAttrs& attrs,
       CHECK((values.shape_[i] == 1) || (values.shape_[i] == sz));
     }
     size_t temp_storage_bytes, temp_mem_size;
-    temp_storage_bytes = SortByKeyWorkspaceSize(indices_len, false, true);
+    temp_storage_bytes = SortByKeyWorkspaceSize(indices_len, false, true);
     temp_mem_size = indices_len * sizeof(int64_t) * 2 +
                     indices_len * sizeof(index_t) +
                     outshape[axis] * sizeof(index_t) * 2 +