From 41d657c2f137467eaad897deee7a2e99dd52891b Mon Sep 17 00:00:00 2001 From: Yizhi Liu Date: Wed, 13 Nov 2019 11:41:32 -0800 Subject: [PATCH 1/2] add volatile override back --- src/codegen/codegen_cuda.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc index 22e8d842e424..3a5adb0bbf5e 100644 --- a/src/codegen/codegen_cuda.cc +++ b/src/codegen/codegen_cuda.cc @@ -58,14 +58,18 @@ std::string CodeGenCUDA::Finish() { << "{\n return __hgt(__half(a), __half(b)) ? a : b;\n}\n"; decl_stream << "__device__ half min(half a, half b)\n" << "{\n return __hlt(__half(a), __half(b)) ? a : b;\n}\n"; + // FIXME(tvm-team): add "volatile" to enable cross thread reduction, + // which is needed by operations such as softmax. + // However, volatile overloading is not supported in NVRTC and CUDA < 9.2. + // We need to figure out a solution which can satisfy the whole world. decl_stream << "__device__ half operator<=" - << "(__half a, __half b)\n" + << "(const volatile __half &a, const volatile __half &b)\n" << "{\n return __hlt(a, b);\n}\n"; decl_stream << "__device__ half operator+" - << "(__half a, __half &b)\n" + << "(const volatile __half &a, const volatile __half &b)\n" <<"{\n return __hadd(a, b);\n}\n"; decl_stream << "__device__ half operator*" - << "(__half a, __half b)\n" + << "(const volatile __half &a, const volatile __half &b)\n" << "{\n return __hmul(a, b);\n}\n"; // otherwise simulate computation via float32 decl_stream << "#else\n"; From 07685b448fd4c37c9358a5e4a03ab95283cdf7c4 Mon Sep 17 00:00:00 2001 From: Yizhi Liu Date: Wed, 13 Nov 2019 16:28:11 -0800 Subject: [PATCH 2/2] [codegen] remove fp16 function override for cuda --- src/codegen/codegen_cuda.cc | 22 +++++++++++----------- src/codegen/literal/cuda_half_t.h | 3 ++- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc index 3a5adb0bbf5e..2a412823d6ef 100644 --- a/src/codegen/codegen_cuda.cc +++ b/src/codegen/codegen_cuda.cc @@ -58,19 +58,19 @@ std::string CodeGenCUDA::Finish() { << "{\n return __hgt(__half(a), __half(b)) ? a : b;\n}\n"; decl_stream << "__device__ half min(half a, half b)\n" << "{\n return __hlt(__half(a), __half(b)) ? a : b;\n}\n"; - // FIXME(tvm-team): add "volatile" to enable cross thread reduction, + // FIXME(tvm-team): "volatile" is used to enable cross thread reduction, // which is needed by operations such as softmax. // However, volatile overloading is not supported in NVRTC and CUDA < 9.2. - // We need to figure out a solution which can satisfy the whole world. - decl_stream << "__device__ half operator<=" - << "(const volatile __half &a, const volatile __half &b)\n" - << "{\n return __hlt(a, b);\n}\n"; - decl_stream << "__device__ half operator+" - << "(const volatile __half &a, const volatile __half &b)\n" - <<"{\n return __hadd(a, b);\n}\n"; - decl_stream << "__device__ half operator*" - << "(const volatile __half &a, const volatile __half &b)\n" - << "{\n return __hmul(a, b);\n}\n"; + // We need to figure out a solution which can satisfy both scenario. + // decl_stream << "__device__ half operator<=" + // << "(const volatile __half &a, const volatile __half &b)\n" + // << "{\n return __hlt(a, b);\n}\n"; + // decl_stream << "__device__ half operator+" + // << "(const volatile __half &a, const volatile __half &b)\n" + // <<"{\n return __hadd(a, b);\n}\n"; + // decl_stream << "__device__ half operator*" + // << "(const volatile __half &a, const volatile __half &b)\n" + // << "{\n return __hmul(a, b);\n}\n"; // otherwise simulate computation via float32 decl_stream << "#else\n"; decl_stream << _cuda_half_t_def; diff --git a/src/codegen/literal/cuda_half_t.h b/src/codegen/literal/cuda_half_t.h index 23075b0b6e76..0889032aadd4 100644 --- a/src/codegen/literal/cuda_half_t.h +++ b/src/codegen/literal/cuda_half_t.h @@ -28,6 +28,7 @@ static constexpr const char* _cuda_half_t_def = R"( typedef unsigned short uint16_t; typedef unsigned char uint8_t; +typedef signed char int8_t; typedef int int32_t; typedef unsigned long long uint64_t; typedef unsigned int uint32_t; @@ -76,7 +77,7 @@ class TVM_ALIGNED(2) half { TVM_XINLINE explicit half(const uint8_t& value) { constructor(value); } TVM_XINLINE explicit half(const int32_t& value) { constructor(value); } TVM_XINLINE explicit half(const uint32_t& value) { constructor(value); } - TVM_XINLINE explicit half(const int64_t& value) { constructor(value); } + TVM_XINLINE explicit half(const long long& value) { constructor(value); } TVM_XINLINE explicit half(const uint64_t& value) { constructor(value); } TVM_XINLINE operator float() const { \