From 815b60faedf7ffe961dcfd078b8c11a3861845bc Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sun, 11 Oct 2020 17:33:38 -0700 Subject: [PATCH] Create FindCUTENSOR.cmake update cmake config Update CMakeLists.txt debug Update FindCUTENSOR.cmake Update linux_gpu.cmake remove int types to support cuda 10 Update CMakeLists.txt update Update libinfo.cc Revert "Update CMakeLists.txt" This reverts commit 70afbf00cb442fd4b53637fdb1d21190ae5954a4. Revert "Update linux_gpu.cmake" This reverts commit bc5d2fc258773b8ba326f786e7732ff13cad51db. Revert "Revert "Update linux_gpu.cmake"" This reverts commit 96d5f4373eb27ac909af4defdc8a56724f072c2f. Revert "Revert "Update CMakeLists.txt"" This reverts commit ec20b33fed888d584108851a1e44805f92378104. --- CMakeLists.txt | 1 + cmake/Modules/FindCUTENSOR.cmake | 33 ++++++++++++++++++++++++++++++ config/darwin.cmake | 1 + config/linux.cmake | 1 + config/linux_gpu.cmake | 1 + include/mxnet/libinfo.h | 1 + src/libinfo.cc | 2 ++ src/operator/numpy/np_einsum_op.cu | 29 ++------------------------ 8 files changed, 42 insertions(+), 27 deletions(-) create mode 100644 cmake/Modules/FindCUTENSOR.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index af0a04904802..2d76fba2f023 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,7 @@ option(USE_OPENMP "Build with Openmp support" ON) option(USE_FATBIN_COMPRESSION "Compress nvcc fatbin output" ON) cmake_dependent_option(USE_NVML "Build with nvml support if found" ON "USE_CUDA" OFF) cmake_dependent_option(USE_CUDNN "Build with cudnn support" ON "USE_CUDA" OFF) # one could set CUDNN_ROOT for search path +cmake_dependent_option(USE_CUTENSOR "Build with cuTENSOR support" ON "USE_CUDA" OFF) # one could set CUTENSOR_ROOT for search path cmake_dependent_option(USE_NVTX "Build with nvtx support if found" ON "USE_CUDA" OFF) cmake_dependent_option(USE_SSE "Build with x86 SSE instruction support" ON "CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64 OR CMAKE_SYSTEM_PROCESSOR STREQUAL amd64" OFF) diff --git 
a/cmake/Modules/FindCUTENSOR.cmake b/cmake/Modules/FindCUTENSOR.cmake new file mode 100644 index 000000000000..b3e9bcde0225 --- /dev/null +++ b/cmake/Modules/FindCUTENSOR.cmake @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +include(FindPackageHandleStandardArgs) + +set(CUTENSOR_ROOT "/usr/local/cuda" CACHE PATH "cuTensor root folder") + +find_path(CUTENSOR_INCLUDE cutensor.h + PATHS ${CUTENSOR_ROOT} $ENV{CUTENSOR_ROOT} + DOC "Path to cuTensor include directory." 
) + +find_library(CUTENSOR_LIBRARY NAMES libcutensor.so # libcutensor_static.a + PATHS ${CUTENSOR_ROOT} $ENV{CUTENSOR_ROOT} ${CUTENSOR_INCLUDE} + PATH_SUFFIXES lib lib/x64 cuda/lib cuda/lib64 + DOC "Path to cuTensor library.") + +find_package_handle_standard_args(CUTENSOR DEFAULT_MSG CUTENSOR_LIBRARY CUTENSOR_INCLUDE) + +mark_as_advanced(CUTENSOR_ROOT CUTENSOR_INCLUDE CUTENSOR_LIBRARY) diff --git a/config/darwin.cmake b/config/darwin.cmake index 59f031e49f01..65e93efb7373 100644 --- a/config/darwin.cmake +++ b/config/darwin.cmake @@ -97,6 +97,7 @@ SET(EXTRA_OPERATORS "" CACHE PATH "EXTRA OPERATORS PATH") #--------------------------------------------- set(USE_CUDA OFF CACHE BOOL "Build with CUDA support") set(USE_CUDNN OFF CACHE BOOL "Build with cudnn support, if found") +set(USE_CUTENSOR OFF CACHE BOOL "Build with cutensor support, if found") # Target NVIDIA GPU achitecture. # Valid options are "Auto" for autodetection, "All" for all available diff --git a/config/linux.cmake b/config/linux.cmake index ff338231e277..8881402ede8e 100644 --- a/config/linux.cmake +++ b/config/linux.cmake @@ -40,6 +40,7 @@ #--------------------------------------------- set(USE_CUDA OFF CACHE BOOL "Build with CUDA support") set(USE_CUDNN OFF CACHE BOOL "Build with cudnn support, if found") +set(USE_CUTENSOR OFF CACHE BOOL "Build with cutensor support, if found") # Target NVIDIA GPU achitecture. # Valid options are "Auto" for autodetection, "All" for all available diff --git a/config/linux_gpu.cmake b/config/linux_gpu.cmake index 50932d8d41d1..933857ce6739 100644 --- a/config/linux_gpu.cmake +++ b/config/linux_gpu.cmake @@ -41,6 +41,7 @@ #--------------------------------------------- set(USE_CUDA ON CACHE BOOL "Build with CUDA support") set(USE_CUDNN ON CACHE BOOL "Build with cudnn support, if found") +set(USE_CUTENSOR ON CACHE BOOL "Build with cutensor support, if found") # Target NVIDIA GPU achitecture. 
# Valid options are: diff --git a/include/mxnet/libinfo.h b/include/mxnet/libinfo.h index d526ca1c78da..9f640d79fffb 100644 --- a/include/mxnet/libinfo.h +++ b/include/mxnet/libinfo.h @@ -143,6 +143,7 @@ enum : unsigned { CUDNN, NCCL, TENSORRT, + CUTENSOR, // CPU Features / optimizations CPU_SSE, diff --git a/src/libinfo.cc b/src/libinfo.cc index 28ddab200c49..9348b721e801 100644 --- a/src/libinfo.cc +++ b/src/libinfo.cc @@ -40,6 +40,7 @@ class FeatureSet { feature_bits.set(CUDNN, MXNET_USE_CUDNN); feature_bits.set(NCCL, MXNET_USE_NCCL); feature_bits.set(TENSORRT, MXNET_USE_TENSORRT); + feature_bits.set(CUTENSOR, MXNET_USE_CUTENSOR); // Check flags for example with gcc -msse3 -mavx2 -dM -E - < /dev/null | egrep "SSE|AVX" #if __SSE__ @@ -133,6 +134,7 @@ const std::vector EnumNames::names = { "CUDNN", "NCCL", "TENSORRT", + "CUTENSOR", "CPU_SSE", "CPU_SSE2", "CPU_SSE3", diff --git a/src/operator/numpy/np_einsum_op.cu b/src/operator/numpy/np_einsum_op.cu index 3ddb6f6a3522..9882898192f0 100644 --- a/src/operator/numpy/np_einsum_op.cu +++ b/src/operator/numpy/np_einsum_op.cu @@ -51,31 +51,6 @@ struct CuTensorTypeTraits { static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_16F; typedef float ScalarType; }; -template<> -struct CuTensorTypeTraits { - static const cudaDataType_t cudaType = CUDA_R_64I; - static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_32I; - typedef int ScalarType; -}; -template<> -struct CuTensorTypeTraits { - static const cudaDataType_t cudaType = CUDA_R_32I; - static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_32I; - typedef int ScalarType; -}; -template<> -struct CuTensorTypeTraits { - static const cudaDataType_t cudaType = CUDA_R_8I; - static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_8I; - typedef int ScalarType; -}; -template<> -struct CuTensorTypeTraits { - static const cudaDataType_t cudaType = CUDA_R_8U; - static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_8U; - 
typedef int ScalarType; -}; -using ModeType = int32_t; // Round num elements 'x' to be mem aligned according to 'multiple' and 'dtype_size' size_t RoundToMultiple(size_t x, size_t multiple, size_t dtype_size) { @@ -867,7 +842,7 @@ inline void NumpyEinsumForwardGpu(const OpStatePtr& state_ptr, for (size_t i = 0; i < in_shape.size(); i++) in_shape[i] = inputs[i].shape_; - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { EinsumOpGPU &op = GetEinsumOpGPU (state, in_shape, out_shape, req, ctx, false); @@ -902,7 +877,7 @@ inline void NumpyEinsumBackwardGpu(const OpStatePtr& state_ptr, in_shape[i] = inputs[i].shape_; for (size_t i = 0; i < out_shape.size(); i++) out_shape[i] = outputs[i].shape_; - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { EinsumOpGPU &op = GetEinsumOpGPU (state, in_shape, out_shape, req, ctx, true);