This repository was archived by the owner on Nov 17, 2023. It is now read-only.
Merged
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -54,6 +54,7 @@ option(USE_OPENMP "Build with Openmp support" ON)
option(USE_FATBIN_COMPRESSION "Compress nvcc fatbin output" ON)
cmake_dependent_option(USE_NVML "Build with nvml support if found" ON "USE_CUDA" OFF)
cmake_dependent_option(USE_CUDNN "Build with cudnn support" ON "USE_CUDA" OFF) # one could set CUDNN_ROOT for search path
+cmake_dependent_option(USE_CUTENSOR "Build with cuTENSOR support" ON "USE_CUDA" OFF) # one could set CUTENSOR_ROOT for search path
cmake_dependent_option(USE_NVTX "Build with nvtx support if found" ON "USE_CUDA" OFF)
cmake_dependent_option(USE_SSE "Build with x86 SSE instruction support" ON
"CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64 OR CMAKE_SYSTEM_PROCESSOR STREQUAL amd64" OFF)
33 changes: 33 additions & 0 deletions cmake/Modules/FindCUTENSOR.cmake
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+include(FindPackageHandleStandardArgs)
+
+set(CUTENSOR_ROOT "/usr/local/cuda" CACHE PATH "cuTensor root folder")
+
+find_path(CUTENSOR_INCLUDE cutensor.h
+  PATHS ${CUTENSOR_ROOT} $ENV{CUTENSOR_ROOT}
+  DOC "Path to cuTensor include directory." )
+
+find_library(CUTENSOR_LIBRARY NAMES libcutensor.so # libcutensor_static.a
+  PATHS ${CUTENSOR_ROOT} $ENV{CUTENSOR_ROOT} ${CUTENSOR_INCLUDE}
+  PATH_SUFFIXES lib lib/x64 cuda/lib cuda/lib64 lib/x64
+  DOC "Path to cuTensor library.")
+
+find_package_handle_standard_args(CUTENSOR DEFAULT_MSG CUTENSOR_LIBRARY CUTENSOR_INCLUDE)
+
+mark_as_advanced(CUTENSOR_ROOT CUTENSOR_INCLUDE CUTENSOR_LIBRARY)
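
For orientation, a minimal sketch (not part of this diff) of how a build could consume the new module, assuming cmake/Modules is on CMAKE_MODULE_PATH. CUTENSOR_FOUND, CUTENSOR_INCLUDE and CUTENSOR_LIBRARY come from FindCUTENSOR.cmake above, and MXNET_USE_CUTENSOR is the macro tested in src/libinfo.cc below; the mxnet_LINKER_LIBS list name is illustrative.

    # Hypothetical consumer sketch, not part of this PR.
    list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
    if(USE_CUTENSOR)
      find_package(CUTENSOR)                      # runs FindCUTENSOR.cmake
      if(CUTENSOR_FOUND)
        include_directories(${CUTENSOR_INCLUDE})  # directory holding cutensor.h
        list(APPEND mxnet_LINKER_LIBS ${CUTENSOR_LIBRARY})
        add_definitions(-DMXNET_USE_CUTENSOR=1)   # picked up by src/libinfo.cc
      endif()
    endif()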
1 change: 1 addition & 0 deletions config/darwin.cmake
@@ -97,6 +97,7 @@ SET(EXTRA_OPERATORS "" CACHE PATH "EXTRA OPERATORS PATH")
#---------------------------------------------
set(USE_CUDA OFF CACHE BOOL "Build with CUDA support")
set(USE_CUDNN OFF CACHE BOOL "Build with cudnn support, if found")
+set(USE_CUTENSOR OFF CACHE BOOL "Build with cutensor support, if found")

# Target NVIDIA GPU architecture.
# Valid options are "Auto" for autodetection, "All" for all available
1 change: 1 addition & 0 deletions config/linux.cmake
@@ -40,6 +40,7 @@
#---------------------------------------------
set(USE_CUDA OFF CACHE BOOL "Build with CUDA support")
set(USE_CUDNN OFF CACHE BOOL "Build with cudnn support, if found")
+set(USE_CUTENSOR OFF CACHE BOOL "Build with cutensor support, if found")

# Target NVIDIA GPU architecture.
# Valid options are "Auto" for autodetection, "All" for all available
1 change: 1 addition & 0 deletions config/linux_gpu.cmake
@@ -41,6 +41,7 @@
#---------------------------------------------
set(USE_CUDA ON CACHE BOOL "Build with CUDA support")
set(USE_CUDNN ON CACHE BOOL "Build with cudnn support, if found")
+set(USE_CUTENSOR ON CACHE BOOL "Build with cutensor support, if found")

# Target NVIDIA GPU architecture.
# Valid options are:
1 change: 1 addition & 0 deletions include/mxnet/libinfo.h
@@ -143,6 +143,7 @@ enum : unsigned {
CUDNN,
NCCL,
TENSORRT,
+CUTENSOR,

// CPU Features / optimizations
CPU_SSE,
2 changes: 2 additions & 0 deletions src/libinfo.cc
@@ -40,6 +40,7 @@ class FeatureSet {
feature_bits.set(CUDNN, MXNET_USE_CUDNN);
feature_bits.set(NCCL, MXNET_USE_NCCL);
feature_bits.set(TENSORRT, MXNET_USE_TENSORRT);
+feature_bits.set(CUTENSOR, MXNET_USE_CUTENSOR);

// Check flags for example with gcc -msse3 -mavx2 -dM -E - < /dev/null | egrep "SSE|AVX"
#if __SSE__
@@ -133,6 +134,7 @@ const std::vector<std::string> EnumNames::names = {
"CUDNN",
"NCCL",
"TENSORRT",
"CUTENSOR",
"CPU_SSE",
"CPU_SSE2",
"CPU_SSE3",
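A hedged sketch of how the new feature bit can be queried at runtime. It assumes the mxnet::features::is_enabled helper declared in include/mxnet/libinfo.h; the exact signature is an assumption, not shown in this diff.

    // Hypothetical check, not part of this PR: was libmxnet built with cuTENSOR?
    #include <iostream>
    #include <mxnet/libinfo.h>  // declares the CUTENSOR enum value added above

    int main() {
      // is_enabled() reads the same feature_bits that libinfo.cc sets (assumed API).
      const bool on = mxnet::features::is_enabled(mxnet::features::CUTENSOR);
      std::cout << "cuTENSOR support: " << (on ? "enabled" : "disabled") << "\n";
      return 0;
    }
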
29 changes: 2 additions & 27 deletions src/operator/numpy/np_einsum_op.cu
@@ -51,31 +51,6 @@ struct CuTensorTypeTraits<mshadow::half::half_t> {
  static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_16F;
  typedef float ScalarType;
};
-template<>
-struct CuTensorTypeTraits<int64_t> {
-  static const cudaDataType_t cudaType = CUDA_R_64I;
-  static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_32I;
-  typedef int ScalarType;
-};
-template<>
-struct CuTensorTypeTraits<int32_t> {
-  static const cudaDataType_t cudaType = CUDA_R_32I;
-  static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_32I;
-  typedef int ScalarType;
-};
-template<>
-struct CuTensorTypeTraits<int8_t> {
-  static const cudaDataType_t cudaType = CUDA_R_8I;
-  static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_8I;
-  typedef int ScalarType;
-};
-template<>
-struct CuTensorTypeTraits<uint8_t> {
-  static const cudaDataType_t cudaType = CUDA_R_8U;
-  static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_8U;
-  typedef int ScalarType;
-};
using ModeType = int32_t;

// Round num elements 'x' to be mem aligned according to 'multiple' and 'dtype_size'
size_t RoundToMultiple(size_t x, size_t multiple, size_t dtype_size) {
@@ -867,7 +842,7 @@ inline void NumpyEinsumForwardGpu(const OpStatePtr& state_ptr,
for (size_t i = 0; i < in_shape.size(); i++)
in_shape[i] = inputs[i].shape_;

-MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
EinsumOpGPU<DType> &op = GetEinsumOpGPU<DType>
(state, in_shape, out_shape,
req, ctx, false);
@@ -902,7 +877,7 @@ inline void NumpyEinsumBackwardGpu(const OpStatePtr& state_ptr,
in_shape[i] = inputs[i].shape_;
for (size_t i = 0; i < out_shape.size(); i++)
out_shape[i] = outputs[i].shape_;
-MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
EinsumOpGPU<DType> &op = GetEinsumOpGPU<DType>
(state, in_shape, out_shape,
req, ctx, true);
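
The switch from MSHADOW_TYPE_SWITCH to MSHADOW_REAL_TYPE_SWITCH in both hunks matches the deleted integer specializations of CuTensorTypeTraits: once only real types have a trait mapping, dispatch must be restricted to real types. For illustration, a sketch of the trait shape; the float specialization is assumed by analogy with the half_t one visible in this diff.

    // Illustrative only, not the file's verbatim contents: the traits template
    // maps a C++ element type to the matching CUDA/cuTENSOR type enums. Types
    // without a specialization (after this PR, all integer types) cannot reach
    // the cuTensor einsum path, hence the narrower dispatch macro.
    #include <library_types.h>  // cudaDataType_t
    #include <cutensor.h>       // cutensorComputeType_t

    template <typename T> struct CuTensorTypeTraits;
    template <>
    struct CuTensorTypeTraits<float> {  // assumed specialization
      static const cudaDataType_t cudaType = CUDA_R_32F;
      static const cutensorComputeType_t cutensorType = CUTENSOR_COMPUTE_32F;
      typedef float ScalarType;
    };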