Skip to content
This repository was archived by the owner on Nov 17, 2023. It is now read-only.

Commit 29d6f27

Browse files
authored
Use RTC for elementwise and broadcast ops (#18622)
* Reapplying PR #17767 * Making RTC required * Move cuda utils to src/common/cuda and refactor RTC part * Unary ops via RTC * Support binary_scalar forward Remove elemwise_scatter_op.* Fix BinaryScalar usage in NumPy * Backward of binary scalar * Binary forward * Fix for binary_scalar * Moving all binary forward to RTC Reorganization * Backward of binary ops * Support broadcast Add RTC to NumPy ops * RTC for elementwise sum Fixes * RTC for backward usenone of broadcast * RTC for broadcast bwd usein * Remove non-RTC vectorization support * Remove template from ReduceWorkspaceSize * Fixes from rebase * Guarding RTC usage behind MXNET_USE_CUDA * More guards * C++17 for CUDA code * MixedUnaryBackwardInOut as RTC * Removing unused variable * Revert "C++17 for CUDA code" This reverts commit b09090c. * Get rid of CI tests without RTC Get rid of if constexpr as CUDA 10 does not support it * Fix lint * Change a few more elemwise functions Fix for too long value * Fix large tensor build * Another try with DBL_MAX * Fix Windows compilation * Fix the large int test * Add the printing of error code value to CUDA_DRIVER_CALL * Fix * Fix binary scalar * Get more information when cuLaunchKernel fails * Going easy on Windows compiler * Fix lint * Reorganization to split strings due to Windows compilation problems * Fix error with uninitialized value * Fix handling of different types for backward of binary scalar * Decreasing RTC overhead * Fix lint and remove rest of mentions of ENABLE_RTC * Jetson with RTC * Fix the aws s3 command * Debugging Windows failure * More debugging of Windows failure * Debug * Fix the issue on Windows (long -> long long for 8B) * libcuda.so for Jetson * Enable debug information for RTC kernels and cleaning debug ptx dump * Fix lint * Try without linking the stub of libcuda.so to different place in Jetson * Add docstring * Answering review comments * Unifying vectorization * Fix * Fixes for reduce ops * Fix M=1 case * Fixes from rebase Fixes for mixed 
type gradient functions Set the launch bounds on RTC kernels * Fix * Fix tests * Adding tutorial for RTC * Fixes after merge * Fixes from review * Change env var doc and undo the change to toctree
1 parent bbc39fa commit 29d6f27

File tree

141 files changed

+7274
-3548
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

141 files changed

+7274
-3548
lines changed

3rdparty/mshadow/mshadow/base.h

Lines changed: 0 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,6 @@ extern "C" {
272272
}
273273

274274
#include "./half.h"
275-
#include "./half2.h"
276275
#include "./bfloat.h"
277276
#define MSHADOW_HALF_BF_OPERATOR(RTYPE, OP) \
278277
MSHADOW_XINLINE RTYPE operator OP(mshadow::half::half_t a, mshadow::bfloat::bf16_t b) { \
@@ -387,11 +386,6 @@ struct DataType<half::half_t> {
387386
#endif
388387
};
389388
template<>
390-
struct DataType<half::half2_t> {
391-
static const int kFlag = kFloat16;
392-
static const int kLanes = 2;
393-
};
394-
template<>
395389
struct DataType<bfloat::bf16_t> {
396390
static const int kFlag = kBfloat16;
397391
static const int kLanes = 1;
@@ -1144,48 +1138,6 @@ struct minimum {
11441138
}
11451139
#endif
11461140

1147-
#define MSHADOW_TYPE_SWITCH_WITH_HALF2(type, DType, ...) \
1148-
switch (type) { \
1149-
case mshadow::kFloat32: \
1150-
{ \
1151-
typedef float DType; \
1152-
{__VA_ARGS__} \
1153-
} \
1154-
break; \
1155-
case mshadow::kFloat64: \
1156-
{ \
1157-
typedef double DType; \
1158-
{__VA_ARGS__} \
1159-
} \
1160-
break; \
1161-
case mshadow::kFloat16: \
1162-
{ \
1163-
typedef mshadow::half::half2_t DType; \
1164-
{__VA_ARGS__} \
1165-
} \
1166-
break; \
1167-
case mshadow::kUint8: \
1168-
{ \
1169-
typedef uint8_t DType; \
1170-
{__VA_ARGS__} \
1171-
} \
1172-
break; \
1173-
case mshadow::kInt32: \
1174-
{ \
1175-
typedef int32_t DType; \
1176-
{__VA_ARGS__} \
1177-
} \
1178-
break; \
1179-
case mshadow::kInt64: \
1180-
{ \
1181-
typedef int64_t DType; \
1182-
{__VA_ARGS__} \
1183-
} \
1184-
break; \
1185-
default: \
1186-
LOG(FATAL) << "Unknown type enum " << type; \
1187-
}
1188-
11891141
#define MSHADOW_SGL_DBL_TYPE_SWITCH(type, DType, ...) \
11901142
switch (type) { \
11911143
case mshadow::kFloat32: \

3rdparty/mshadow/mshadow/half2.h

Lines changed: 0 additions & 162 deletions
This file was deleted.

CMakeLists.txt

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,6 @@ option(USE_MXNET_LIB_NAMING "Use MXNet library naming conventions." ON)
7979
option(USE_GPROF "Compile with gprof (profiling) flag" OFF)
8080
option(USE_VTUNE "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path
8181
option(USE_TVM_OP "Enable use of TVM operator build system." OFF)
82-
option(ENABLE_CUDA_RTC "Build with CUDA runtime compilation support" ON)
8382
option(BUILD_CPP_EXAMPLES "Build cpp examples" ON)
8483
option(INSTALL_EXAMPLES "Install the example source files." OFF)
8584
option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." ON)
@@ -547,18 +546,11 @@ if(USE_CUDA)
547546

548547
string(REPLACE ";" " " CUDA_ARCH_FLAGS_SPACES "${CUDA_ARCH_FLAGS}")
549548

550-
find_package(CUDAToolkit REQUIRED cublas cufft cusolver curand
551-
OPTIONAL_COMPONENTS nvToolsExt nvrtc)
549+
find_package(CUDAToolkit REQUIRED cublas cufft cusolver curand nvrtc cuda_driver
550+
OPTIONAL_COMPONENTS nvToolsExt)
552551

553-
list(APPEND mxnet_LINKER_LIBS CUDA::cudart CUDA::cublas CUDA::cufft CUDA::cusolver CUDA::curand)
554-
if(ENABLE_CUDA_RTC)
555-
if(CUDA_nvrtc_LIBRARY)
556-
list(APPEND mxnet_LINKER_LIBS CUDA::nvrtc cuda)
557-
add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
558-
else()
559-
message(FATAL_ERROR "ENABLE_CUDA_RTC=ON, but failed to find NVRTC. CMake will exit." )
560-
endif()
561-
endif()
552+
list(APPEND mxnet_LINKER_LIBS CUDA::cudart CUDA::cublas CUDA::cufft CUDA::cusolver CUDA::curand
553+
CUDA::nvrtc CUDA::cuda_driver)
562554
list(APPEND SOURCE ${CUDA})
563555
add_definitions(-DMXNET_USE_CUDA=1)
564556

ci/build_windows.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@ class BuildFlavour(Enum):
6161
'-DCMAKE_CXX_COMPILER=cl '
6262
'-DUSE_CUDA=OFF '
6363
'-DUSE_CUDNN=OFF '
64-
'-DENABLE_CUDA_RTC=OFF '
6564
'-DUSE_OPENCV=ON '
6665
'-DUSE_OPENMP=ON '
6766
'-DUSE_BLAS=open '
@@ -76,7 +75,6 @@ class BuildFlavour(Enum):
7675
'-DCMAKE_CXX_COMPILER=cl '
7776
'-DUSE_CUDA=OFF '
7877
'-DUSE_CUDNN=OFF '
79-
'-DENABLE_CUDA_RTC=OFF '
8078
'-DUSE_OPENCV=ON '
8179
'-DUSE_OPENMP=ON '
8280
'-DUSE_BLAS=open '
@@ -91,7 +89,6 @@ class BuildFlavour(Enum):
9189
'-DCMAKE_CXX_COMPILER=cl '
9290
'-DUSE_CUDA=OFF '
9391
'-DUSE_CUDNN=OFF '
94-
'-DENABLE_CUDA_RTC=OFF '
9592
'-DUSE_OPENCV=ON '
9693
'-DUSE_OPENMP=ON '
9794
'-DUSE_BLAS=mkl '
@@ -106,7 +103,6 @@ class BuildFlavour(Enum):
106103
'-DCMAKE_CXX_COMPILER=cl '
107104
'-DUSE_CUDA=OFF '
108105
'-DUSE_CUDNN=OFF '
109-
'-DENABLE_CUDA_RTC=OFF '
110106
'-DUSE_OPENCV=ON '
111107
'-DUSE_OPENMP=ON '
112108
'-DUSE_BLAS=mkl '
@@ -121,7 +117,6 @@ class BuildFlavour(Enum):
121117
'-DCMAKE_CXX_COMPILER=cl '
122118
'-DUSE_CUDA=ON '
123119
'-DUSE_CUDNN=ON '
124-
'-DENABLE_CUDA_RTC=ON '
125120
'-DUSE_OPENCV=ON '
126121
'-DUSE_OPENMP=ON '
127122
'-DUSE_BLAS=open '
@@ -136,7 +131,6 @@ class BuildFlavour(Enum):
136131
'-DCMAKE_CXX_COMPILER=cl '
137132
'-DUSE_CUDA=ON '
138133
'-DUSE_CUDNN=ON '
139-
'-DENABLE_CUDA_RTC=ON '
140134
'-DUSE_OPENCV=ON '
141135
'-DUSE_OPENMP=ON '
142136
'-DUSE_BLAS=open '

ci/docker/runtime_functions.sh

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,6 @@ build_jetson() {
142142
-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \
143143
-DUSE_CUDA=ON \
144144
-DMXNET_CUDA_ARCH="5.2" \
145-
-DENABLE_CUDA_RTC=OFF \
146145
-DUSE_OPENCV=OFF \
147146
-DUSE_OPENMP=ON \
148147
-DUSE_LAPACK=OFF \
@@ -670,27 +669,6 @@ build_ubuntu_gpu_cmake() {
670669
ninja
671670
}
672671

673-
build_ubuntu_gpu_cmake_no_rtc() {
674-
set -ex
675-
cd /work/build
676-
CC=gcc-7 CXX=g++-7 cmake \
677-
-DUSE_SIGNAL_HANDLER=ON \
678-
-DUSE_CUDA=ON \
679-
-DUSE_CUDNN=ON \
680-
-DUSE_MKL_IF_AVAILABLE=OFF \
681-
-DUSE_MKLML_MKL=OFF \
682-
-DUSE_MKLDNN=ON \
683-
-DUSE_DIST_KVSTORE=ON \
684-
-DCMAKE_BUILD_TYPE=Release \
685-
-DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
686-
-DBUILD_CYTHON_MODULES=1 \
687-
-DENABLE_CUDA_RTC=OFF \
688-
-G Ninja \
689-
/work/mxnet
690-
691-
ninja
692-
}
693-
694672
build_ubuntu_cpu_large_tensor() {
695673
set -ex
696674
cd /work/build

ci/jenkins/Jenkins_steps.groovy

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -258,20 +258,6 @@ def compile_unix_cmake_gpu(lib_name) {
258258
}]
259259
}
260260

261-
def compile_unix_cmake_gpu_no_rtc(lib_name) {
262-
return ['GPU: CMake CUDA RTC OFF': {
263-
node(NODE_LINUX_CPU) {
264-
ws('workspace/build-cmake-gpu-no-rtc') {
265-
timeout(time: max_time, unit: 'MINUTES') {
266-
utils.init_git()
267-
utils.docker_run('ubuntu_gpu_cu101', 'build_ubuntu_gpu_cmake_no_rtc', false)
268-
utils.pack_lib(lib_name, mx_cmake_lib)
269-
}
270-
}
271-
}
272-
}]
273-
}
274-
275261
def compile_unix_tensorrt_gpu(lib_name) {
276262
return ['TensorRT': {
277263
node(NODE_LINUX_CPU) {

ci/jenkins/Jenkinsfile_unix_gpu

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ core_logic: {
4141
custom_steps.compile_unix_cmake_gpu('cmake_gpu'),
4242
custom_steps.compile_unix_tensorrt_gpu('tensorrt'),
4343
custom_steps.compile_unix_int64_gpu('gpu_int64'),
44-
custom_steps.compile_unix_cmake_gpu_no_rtc('gpu_no_rtc'),
4544
])
4645

4746
utils.parallel_stage('Tests', [

config/darwin.cmake

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,5 +126,4 @@ set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total num
126126
# Other GPU features
127127
set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
128128
set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.")
129-
set(ENABLE_CUDA_RTC ON CACHE BOOL "Build with CUDA runtime compilation support")
130129
set(USE_NVTX ON CACHE BOOL "Build with NVTX support")

config/linux.cmake

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,5 +125,4 @@ set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total num
125125
# Other GPU features
126126
set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
127127
set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.")
128-
set(ENABLE_CUDA_RTC ON CACHE BOOL "Build with CUDA runtime compilation support")
129128
set(USE_NVTX ON CACHE BOOL "Build with NVTX support")

config/linux_gpu.cmake

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,5 +125,4 @@ set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total num
125125
# Other GPU features
126126
set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
127127
set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.")
128-
set(ENABLE_CUDA_RTC ON CACHE BOOL "Build with CUDA runtime compilation support")
129128
set(USE_NVTX ON CACHE BOOL "Build with NVTX support")

0 commit comments

Comments
 (0)