From 41d657c2f137467eaad897deee7a2e99dd52891b Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Wed, 13 Nov 2019 11:41:32 -0800
Subject: [PATCH 1/2] add volatile override back

---
 src/codegen/codegen_cuda.cc | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index 22e8d842e424..3a5adb0bbf5e 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -58,14 +58,18 @@ std::string CodeGenCUDA::Finish() {
                 << "{\n  return __hgt(__half(a), __half(b)) ? a : b;\n}\n";
     decl_stream << "__device__ half min(half a, half b)\n"
                 << "{\n  return __hlt(__half(a), __half(b)) ? a : b;\n}\n";
+    // FIXME(tvm-team): add "volatile" to enable cross thread reduction,
+    // which is needed by operations such as softmax.
+    // However, volatile overloading is not supported in NVRTC and CUDA < 9.2.
+    // We need to figure out a solution which can satisfy the whole world.
     decl_stream << "__device__ half operator<="
-                << "(__half a,  __half b)\n"
+                << "(const volatile __half &a,  const volatile __half &b)\n"
                 << "{\n  return __hlt(a, b);\n}\n";
     decl_stream << "__device__ half operator+"
-                << "(__half a,  __half &b)\n"
+                << "(const volatile __half &a,  const volatile __half &b)\n"
                 <<"{\n  return __hadd(a, b);\n}\n";
     decl_stream << "__device__ half operator*"
-                << "(__half a, __half b)\n"
+                << "(const volatile __half &a, const volatile __half &b)\n"
                 <<   "{\n  return __hmul(a, b);\n}\n";
     // otherwise simulate computation via float32
     decl_stream << "#else\n";

From 07685b448fd4c37c9358a5e4a03ab95283cdf7c4 Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Wed, 13 Nov 2019 16:28:11 -0800
Subject: [PATCH 2/2] [codegen] remove fp16 function override for cuda

---
 src/codegen/codegen_cuda.cc       | 22 +++++++++++-----------
 src/codegen/literal/cuda_half_t.h |  3 ++-
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index 3a5adb0bbf5e..2a412823d6ef 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -58,19 +58,19 @@ std::string CodeGenCUDA::Finish() {
                 << "{\n  return __hgt(__half(a), __half(b)) ? a : b;\n}\n";
     decl_stream << "__device__ half min(half a, half b)\n"
                 << "{\n  return __hlt(__half(a), __half(b)) ? a : b;\n}\n";
-    // FIXME(tvm-team): add "volatile" to enable cross thread reduction,
+    // FIXME(tvm-team): "volatile" is used to enable cross thread reduction,
     // which is needed by operations such as softmax.
     // However, volatile overloading is not supported in NVRTC and CUDA < 9.2.
-    // We need to figure out a solution which can satisfy the whole world.
-    decl_stream << "__device__ half operator<="
-                << "(const volatile __half &a,  const volatile __half &b)\n"
-                << "{\n  return __hlt(a, b);\n}\n";
-    decl_stream << "__device__ half operator+"
-                << "(const volatile __half &a,  const volatile __half &b)\n"
-                <<"{\n  return __hadd(a, b);\n}\n";
-    decl_stream << "__device__ half operator*"
-                << "(const volatile __half &a, const volatile __half &b)\n"
-                <<   "{\n  return __hmul(a, b);\n}\n";
+    // We need to figure out a solution which can satisfy both scenario.
+    // decl_stream << "__device__ half operator<="
+    //             << "(const volatile __half &a,  const volatile __half &b)\n"
+    //             << "{\n  return __hlt(a, b);\n}\n";
+    // decl_stream << "__device__ half operator+"
+    //             << "(const volatile __half &a,  const volatile __half &b)\n"
+    //             <<"{\n  return __hadd(a, b);\n}\n";
+    // decl_stream << "__device__ half operator*"
+    //             << "(const volatile __half &a, const volatile __half &b)\n"
+    //             <<   "{\n  return __hmul(a, b);\n}\n";
     // otherwise simulate computation via float32
     decl_stream << "#else\n";
     decl_stream << _cuda_half_t_def;
diff --git a/src/codegen/literal/cuda_half_t.h b/src/codegen/literal/cuda_half_t.h
index 23075b0b6e76..0889032aadd4 100644
--- a/src/codegen/literal/cuda_half_t.h
+++ b/src/codegen/literal/cuda_half_t.h
@@ -28,6 +28,7 @@
 static constexpr const char* _cuda_half_t_def = R"(
 typedef unsigned short uint16_t;
 typedef unsigned char uint8_t;
+typedef signed char int8_t;
 typedef int int32_t;
 typedef unsigned long long uint64_t;
 typedef unsigned int uint32_t;
@@ -76,7 +77,7 @@ class TVM_ALIGNED(2) half {
   TVM_XINLINE explicit half(const uint8_t& value) { constructor(value); }
   TVM_XINLINE explicit half(const int32_t& value) { constructor(value); }
   TVM_XINLINE explicit half(const uint32_t& value) { constructor(value); }
-  TVM_XINLINE explicit half(const int64_t& value) { constructor(value); }
+  TVM_XINLINE explicit half(const long long& value) { constructor(value); }
   TVM_XINLINE explicit half(const uint64_t& value) { constructor(value); }
 
   TVM_XINLINE operator float() const {                          \