This repository was archived by the owner on Nov 17, 2023. It is now read-only.
Merged
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -54,6 +54,7 @@ option(USE_OPENMP "Build with Openmp support" ON)
option(USE_FATBIN_COMPRESSION "Compress nvcc fatbin output" ON)
cmake_dependent_option(USE_NVML "Build with nvml support if found" ON "USE_CUDA" OFF)
cmake_dependent_option(USE_CUDNN "Build with cudnn support" ON "USE_CUDA" OFF) # one could set CUDNN_ROOT for search path
+cmake_dependent_option(USE_CUTENSOR "Build with cuTENSOR support" ON "USE_CUDA" OFF) # one could set CUTENSOR_ROOT for search path
cmake_dependent_option(USE_NVTX "Build with nvtx support if found" ON "USE_CUDA" OFF)
cmake_dependent_option(USE_SSE "Build with x86 SSE instruction support" ON
"CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64 OR CMAKE_SYSTEM_PROCESSOR STREQUAL amd64" OFF)
33 changes: 33 additions & 0 deletions cmake/Modules/FindCUTENSOR.cmake
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+include(FindPackageHandleStandardArgs)
+
+set(CUTENSOR_ROOT "/usr/local/cuda" CACHE PATH "cuTensor root folder")
+
+find_path(CUTENSOR_INCLUDE cutensor.h
+  PATHS ${CUTENSOR_ROOT} $ENV{CUTENSOR_ROOT}
+  DOC "Path to cuTensor include directory." )
+
+find_library(CUTENSOR_LIBRARY NAMES libcutensor.so # libcutensor_static.a
+  PATHS ${CUTENSOR_ROOT} $ENV{CUTENSOR_ROOT} ${CUTENSOR_INCLUDE}
+  PATH_SUFFIXES lib lib/x64 cuda/lib cuda/lib64 lib/x64
+  DOC "Path to cuTensor library.")
+
+find_package_handle_standard_args(CUTENSOR DEFAULT_MSG CUTENSOR_LIBRARY CUTENSOR_INCLUDE)
+
+mark_as_advanced(CUTENSOR_ROOT CUTENSOR_INCLUDE CUTENSOR_LIBRARY)
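
For orientation, a minimal sketch (not part of this diff) of how a build could consume the new module, assuming cmake/Modules is on CMAKE_MODULE_PATH. CUTENSOR_FOUND, CUTENSOR_INCLUDE and CUTENSOR_LIBRARY come from FindCUTENSOR.cmake above, and MXNET_USE_CUTENSOR is the macro tested in src/libinfo.cc below; the mxnet_LINKER_LIBS list name is illustrative.

    # Hypothetical consumer sketch, not part of this PR.
    list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
    if(USE_CUTENSOR)
      find_package(CUTENSOR)                      # runs FindCUTENSOR.cmake
      if(CUTENSOR_FOUND)
        include_directories(${CUTENSOR_INCLUDE})  # directory holding cutensor.h
        list(APPEND mxnet_LINKER_LIBS ${CUTENSOR_LIBRARY})
        add_definitions(-DMXNET_USE_CUTENSOR=1)   # picked up by src/libinfo.cc
      endif()
    endif()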
1 change: 1 addition & 0 deletions config/darwin.cmake
@@ -97,6 +97,7 @@ SET(EXTRA_OPERATORS "" CACHE PATH "EXTRA OPERATORS PATH")
#---------------------------------------------
set(USE_CUDA OFF CACHE BOOL "Build with CUDA support")
set(USE_CUDNN OFF CACHE BOOL "Build with cudnn support, if found")
+set(USE_CUTENSOR OFF CACHE BOOL "Build with cutensor support, if found")

# Target NVIDIA GPU architecture.
# Valid options are "Auto" for autodetection, "All" for all available
1 change: 1 addition & 0 deletions config/linux.cmake
@@ -40,6 +40,7 @@
#---------------------------------------------
set(USE_CUDA OFF CACHE BOOL "Build with CUDA support")
set(USE_CUDNN OFF CACHE BOOL "Build with cudnn support, if found")
+set(USE_CUTENSOR OFF CACHE BOOL "Build with cutensor support, if found")

# Target NVIDIA GPU architecture.
# Valid options are "Auto" for autodetection, "All" for all available
1 change: 1 addition & 0 deletions config/linux_gpu.cmake
@@ -41,6 +41,7 @@
#---------------------------------------------
set(USE_CUDA ON CACHE BOOL "Build with CUDA support")
set(USE_CUDNN ON CACHE BOOL "Build with cudnn support, if found")
+set(USE_CUTENSOR ON CACHE BOOL "Build with cutensor support, if found")

# Target NVIDIA GPU architecture.
# Valid options are:
1 change: 1 addition & 0 deletions include/mxnet/libinfo.h
@@ -143,6 +143,7 @@ enum : unsigned {
CUDNN,
NCCL,
TENSORRT,
+CUTENSOR,

// CPU Features / optimizations
CPU_SSE,
2 changes: 2 additions & 0 deletions src/libinfo.cc
@@ -40,6 +40,7 @@ class FeatureSet {
feature_bits.set(CUDNN, MXNET_USE_CUDNN);
feature_bits.set(NCCL, MXNET_USE_NCCL);
feature_bits.set(TENSORRT, MXNET_USE_TENSORRT);
+feature_bits.set(CUTENSOR, MXNET_USE_CUTENSOR);

// Check flags for example with gcc -msse3 -mavx2 -dM -E - < /dev/null | egrep "SSE|AVX"
#if __SSE__
@@ -133,6 +134,7 @@ const std::vector<std::string> EnumNames::names = {
"CUDNN",
"NCCL",
"TENSORRT",
"CUTENSOR",
"CPU_SSE",
"CPU_SSE2",
"CPU_SSE3",
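A hedged sketch of how the new feature bit can be queried at runtime. It assumes the mxnet::features::is_enabled helper declared in include/mxnet/libinfo.h; the exact signature is an assumption, not shown in this diff.

    // Hypothetical check, not part of this PR: was libmxnet built with cuTENSOR?
    #include <iostream>
    #include <mxnet/libinfo.h>  // declares the CUTENSOR enum value added above

    int main() {
      // is_enabled() reads the same feature_bits that libinfo.cc sets (assumed API).
      const bool on = mxnet::features::is_enabled(mxnet::features::CUTENSOR);
      std::cout << "cuTENSOR support: " << (on ? "enabled" : "disabled") << "\n";
      return 0;
    }
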
29 changes: 2 additions & 27 deletions src/operator/numpy/np_einsum_op.cu
@@ -51,31 +51,6 @@ struct CuTensorTypeTraits<mshadow::half::half_t> {
  static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_16F;
  typedef float ScalarType;
};
-template<>
-struct CuTensorTypeTraits<int64_t> {
-  static const cudaDataType_t cudaType = CUDA_R_64I;
-  static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_32I;
-  typedef int ScalarType;
-};
-template<>
-struct CuTensorTypeTraits<int32_t> {
-  static const cudaDataType_t cudaType = CUDA_R_32I;
-  static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_32I;
-  typedef int ScalarType;
-};
-template<>
-struct CuTensorTypeTraits<int8_t> {
-  static const cudaDataType_t cudaType = CUDA_R_8I;
-  static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_8I;
-  typedef int ScalarType;
-};
-template<>
-struct CuTensorTypeTraits<uint8_t> {
-  static const cudaDataType_t cudaType = CUDA_R_8U;
-  static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_8U;
-  typedef int ScalarType;
-};
using ModeType = int32_t;

// Round num elements 'x' to be mem aligned according to 'multiple' and 'dtype_size'
size_t RoundToMultiple(size_t x, size_t multiple, size_t dtype_size) {
@@ -867,7 +842,7 @@ inline void NumpyEinsumForwardGpu(const OpStatePtr& state_ptr,
for (size_t i = 0; i < in_shape.size(); i++)
in_shape[i] = inputs[i].shape_;

-MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
EinsumOpGPU<DType> &op = GetEinsumOpGPU<DType>
(state, in_shape, out_shape,
req, ctx, false);
@@ -902,7 +877,7 @@ inline void NumpyEinsumBackwardGpu(const OpStatePtr& state_ptr,
in_shape[i] = inputs[i].shape_;
for (size_t i = 0; i < out_shape.size(); i++)
out_shape[i] = outputs[i].shape_;
-MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
EinsumOpGPU<DType> &op = GetEinsumOpGPU<DType>
(state, in_shape, out_shape,
req, ctx, true);
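
The switch from MSHADOW_TYPE_SWITCH to MSHADOW_REAL_TYPE_SWITCH in both hunks matches the deleted integer specializations of CuTensorTypeTraits: once only real types have a trait mapping, dispatch must be restricted to real types. For illustration, a sketch of the trait shape; the float specialization is assumed by analogy with the half_t one visible in this diff.

    // Illustrative only, not the file's verbatim contents: the traits template
    // maps a C++ element type to the matching CUDA/cuTENSOR type enums. Types
    // without a specialization (after this PR, all integer types) cannot reach
    // the cuTensor einsum path, hence the narrower dispatch macro.
    #include <library_types.h>  // cudaDataType_t
    #include <cutensor.h>       // cutensorComputeType_t

    template <typename T> struct CuTensorTypeTraits;
    template <>
    struct CuTensorTypeTraits<float> {  // assumed specialization
      static const cudaDataType_t cudaType = CUDA_R_32F;
      static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_32F;
      typedef float ScalarType;
    };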