From 815b60faedf7ffe961dcfd078b8c11a3861845bc Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sun, 11 Oct 2020 17:33:38 -0700 Subject: [PATCH] Create FindCUTENSOR.cmake update cmake config Update CMakeLists.txt debug Update FindCUTENSOR.cmake Update linux_gpu.cmake remove int types to support cuda 10 Update CMakeLists.txt update Update libinfo.cc Revert "Update CMakeLists.txt" This reverts commit 70afbf00cb442fd4b53637fdb1d21190ae5954a4. Revert "Update linux_gpu.cmake" This reverts commit bc5d2fc258773b8ba326f786e7732ff13cad51db. Revert "Revert "Update linux_gpu.cmake"" This reverts commit 96d5f4373eb27ac909af4defdc8a56724f072c2f. Revert "Revert "Update CMakeLists.txt"" This reverts commit ec20b33fed888d584108851a1e44805f92378104. --- CMakeLists.txt | 1 + cmake/Modules/FindCUTENSOR.cmake | 33 ++++++++++++++++++++++++++++++ config/darwin.cmake | 1 + config/linux.cmake | 1 + config/linux_gpu.cmake | 1 + include/mxnet/libinfo.h | 1 + src/libinfo.cc | 2 ++ src/operator/numpy/np_einsum_op.cu | 29 ++------------------------ 8 files changed, 42 insertions(+), 27 deletions(-) create mode 100644 cmake/Modules/FindCUTENSOR.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index af0a04904802..2d76fba2f023 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,7 @@ option(USE_OPENMP "Build with Openmp support" ON) option(USE_FATBIN_COMPRESSION "Compress nvcc fatbin output" ON) cmake_dependent_option(USE_NVML "Build with nvml support if found" ON "USE_CUDA" OFF) cmake_dependent_option(USE_CUDNN "Build with cudnn support" ON "USE_CUDA" OFF) # one could set CUDNN_ROOT for search path +cmake_dependent_option(USE_CUTENSOR "Build with cuTENSOR support" ON "USE_CUDA" OFF) # one could set CUTENSOR_ROOT for search path cmake_dependent_option(USE_NVTX "Build with nvtx support if found" ON "USE_CUDA" OFF) cmake_dependent_option(USE_SSE "Build with x86 SSE instruction support" ON "CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64 OR CMAKE_SYSTEM_PROCESSOR STREQUAL amd64" OFF) diff --git 
a/cmake/Modules/FindCUTENSOR.cmake b/cmake/Modules/FindCUTENSOR.cmake new file mode 100644 index 000000000000..b3e9bcde0225 --- /dev/null +++ b/cmake/Modules/FindCUTENSOR.cmake @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +include(FindPackageHandleStandardArgs) + +set(CUTENSOR_ROOT "/usr/local/cuda" CACHE PATH "cuTensor root folder") + +find_path(CUTENSOR_INCLUDE cutensor.h + PATHS ${CUTENSOR_ROOT} $ENV{CUTENSOR_ROOT} + DOC "Path to cuTensor include directory." 
) + +find_library(CUTENSOR_LIBRARY NAMES libcutensor.so # libcutensor_static.a + PATHS ${CUTENSOR_ROOT} $ENV{CUTENSOR_ROOT} ${CUTENSOR_INCLUDE} + PATH_SUFFIXES lib lib/x64 cuda/lib cuda/lib64 + DOC "Path to cuTensor library.") + +find_package_handle_standard_args(CUTENSOR DEFAULT_MSG CUTENSOR_LIBRARY CUTENSOR_INCLUDE) + +mark_as_advanced(CUTENSOR_ROOT CUTENSOR_INCLUDE CUTENSOR_LIBRARY) diff --git a/config/darwin.cmake b/config/darwin.cmake index 59f031e49f01..65e93efb7373 100644 --- a/config/darwin.cmake +++ b/config/darwin.cmake @@ -97,6 +97,7 @@ SET(EXTRA_OPERATORS "" CACHE PATH "EXTRA OPERATORS PATH") #--------------------------------------------- set(USE_CUDA OFF CACHE BOOL "Build with CUDA support") set(USE_CUDNN OFF CACHE BOOL "Build with cudnn support, if found") +set(USE_CUTENSOR OFF CACHE BOOL "Build with cutensor support, if found") # Target NVIDIA GPU achitecture. # Valid options are "Auto" for autodetection, "All" for all available diff --git a/config/linux.cmake b/config/linux.cmake index ff338231e277..8881402ede8e 100644 --- a/config/linux.cmake +++ b/config/linux.cmake @@ -40,6 +40,7 @@ #--------------------------------------------- set(USE_CUDA OFF CACHE BOOL "Build with CUDA support") set(USE_CUDNN OFF CACHE BOOL "Build with cudnn support, if found") +set(USE_CUTENSOR OFF CACHE BOOL "Build with cutensor support, if found") # Target NVIDIA GPU achitecture. # Valid options are "Auto" for autodetection, "All" for all available diff --git a/config/linux_gpu.cmake b/config/linux_gpu.cmake index 50932d8d41d1..933857ce6739 100644 --- a/config/linux_gpu.cmake +++ b/config/linux_gpu.cmake @@ -41,6 +41,7 @@ #--------------------------------------------- set(USE_CUDA ON CACHE BOOL "Build with CUDA support") set(USE_CUDNN ON CACHE BOOL "Build with cudnn support, if found") +set(USE_CUTENSOR ON CACHE BOOL "Build with cutensor support, if found") # Target NVIDIA GPU achitecture. 
# Valid options are: diff --git a/include/mxnet/libinfo.h b/include/mxnet/libinfo.h index d526ca1c78da..9f640d79fffb 100644 --- a/include/mxnet/libinfo.h +++ b/include/mxnet/libinfo.h @@ -143,6 +143,7 @@ enum : unsigned { CUDNN, NCCL, TENSORRT, + CUTENSOR, // CPU Features / optimizations CPU_SSE, diff --git a/src/libinfo.cc b/src/libinfo.cc index 28ddab200c49..9348b721e801 100644 --- a/src/libinfo.cc +++ b/src/libinfo.cc @@ -40,6 +40,7 @@ class FeatureSet { feature_bits.set(CUDNN, MXNET_USE_CUDNN); feature_bits.set(NCCL, MXNET_USE_NCCL); feature_bits.set(TENSORRT, MXNET_USE_TENSORRT); + feature_bits.set(CUTENSOR, MXNET_USE_CUTENSOR); // Check flags for example with gcc -msse3 -mavx2 -dM -E - < /dev/null | egrep "SSE|AVX" #if __SSE__ @@ -133,6 +134,7 @@ const std::vector EnumNames::names = { "CUDNN", "NCCL", "TENSORRT", + "CUTENSOR", "CPU_SSE", "CPU_SSE2", "CPU_SSE3", diff --git a/src/operator/numpy/np_einsum_op.cu b/src/operator/numpy/np_einsum_op.cu index 3ddb6f6a3522..9882898192f0 100644 --- a/src/operator/numpy/np_einsum_op.cu +++ b/src/operator/numpy/np_einsum_op.cu @@ -51,31 +51,6 @@ struct CuTensorTypeTraits { static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_16F; typedef float ScalarType; }; -template<> -struct CuTensorTypeTraits { - static const cudaDataType_t cudaType = CUDA_R_64I; - static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_32I; - typedef int ScalarType; -}; -template<> -struct CuTensorTypeTraits { - static const cudaDataType_t cudaType = CUDA_R_32I; - static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_32I; - typedef int ScalarType; -}; -template<> -struct CuTensorTypeTraits { - static const cudaDataType_t cudaType = CUDA_R_8I; - static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_8I; - typedef int ScalarType; -}; -template<> -struct CuTensorTypeTraits { - static const cudaDataType_t cudaType = CUDA_R_8U; - static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_8U; - 
typedef int ScalarType; -}; -using ModeType = int32_t; // Round num elements 'x' to be mem aligned according to 'multiple' and 'dtype_size' size_t RoundToMultiple(size_t x, size_t multiple, size_t dtype_size) { @@ -867,7 +842,7 @@ inline void NumpyEinsumForwardGpu(const OpStatePtr& state_ptr, for (size_t i = 0; i < in_shape.size(); i++) in_shape[i] = inputs[i].shape_; - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { EinsumOpGPU &op = GetEinsumOpGPU (state, in_shape, out_shape, req, ctx, false); @@ -902,7 +877,7 @@ inline void NumpyEinsumBackwardGpu(const OpStatePtr& state_ptr, in_shape[i] = inputs[i].shape_; for (size_t i = 0; i < out_shape.size(); i++) out_shape[i] = outputs[i].shape_; - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { EinsumOpGPU &op = GetEinsumOpGPU (state, in_shape, out_shape, req, ctx, true);