NVIDIA · janekb04 · Aug 18, 2023 · Aug 18, 2023
diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu
@@ -229,11 +229,11 @@ void cublas_gemm(const Tensor *inputA,
           preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
           &workspaceSize, sizeof(workspaceSize)));
 
-  NVTE_CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic(handle, operationDesc, Adesc, Bdesc, Cdesc,
-                                                   Ddesc, preference, 1, &heuristicResult,
-                                                   &returnedResults));
-
-  if (returnedResults == 0) throw std::runtime_error("Unable to find any suitable algorithms");
+  const auto status = cublasLtMatmulAlgoGetHeuristic(handle, operationDesc, Adesc, Bdesc, Cdesc,
+                                                     Ddesc, preference, 1, &heuristicResult,
+                                                     &returnedResults);
+  if (status == CUBLAS_STATUS_NOT_SUPPORTED) throw std::runtime_error("Unable to find suitable CUBLAS GEMM algorithm.");
+  NVTE_CHECK_CUBLAS(status);
 
   // D = alpha * (A * B) + beta * C
 

diff --git a/transformer_engine/common/include/transformer_engine/logging.h b/transformer_engine/common/include/transformer_engine/logging.h
@@ -7,68 +7,70 @@
 #ifndef TRANSFORMER_ENGINE_LOGGING_H_
 #define TRANSFORMER_ENGINE_LOGGING_H_
 
-#include <cuda_runtime_api.h>
 #include <cublas_v2.h>
+#include <cuda_runtime_api.h>
 #include <cudnn.h>
 #include <nvrtc.h>
-#include <string>
 #include <stdexcept>
+#include <string>
 
-#define NVTE_ERROR(x) \
-    do { \
-        throw std::runtime_error(std::string(__FILE__ ":") + std::to_string(__LINE__) +            \
-                                 " in function " + __func__ + ": " + x);                           \
-    } while (false)
-
-#define NVTE_CHECK(x, ...)                                                                         \
-    do {                                                                                           \
-        if (!(x)) {                                                                                \
-            NVTE_ERROR(std::string("Assertion failed: "  #x ". ") + std::string(__VA_ARGS__));     \
-        }                                                                                          \
-    } while (false)
-
-namespace {
-
-inline void check_cuda_(cudaError_t status) {
-    if ( status != cudaSuccess ) {
-        NVTE_ERROR("CUDA Error: " + std::string(cudaGetErrorString(status)));
-    }
-}
-
-inline void check_cublas_(cublasStatus_t status) {
-    if ( status != CUBLAS_STATUS_SUCCESS ) {
-        NVTE_ERROR("CUBLAS Error: " + std::string(cublasGetStatusString(status)));
-    }
-}
-
-inline void check_cudnn_(cudnnStatus_t status) {
-    if ( status != CUDNN_STATUS_SUCCESS ) {
-        std::string message;
-        message.reserve(1024);
-        message += "CUDNN Error: ";
-        message += cudnnGetErrorString(status);
-        message += (". "
-                    "For more information, enable cuDNN error logging "
-                    "by setting CUDNN_LOGERR_DBG=1 and "
-                    "CUDNN_LOGDEST_DBG=stderr in the environment.");
-        NVTE_ERROR(message);
-    }
-}
-
-inline void check_nvrtc_(nvrtcResult status) {
-    if ( status != NVRTC_SUCCESS ) {
-        NVTE_ERROR("NVRTC Error: " + std::string(nvrtcGetErrorString(status)));
-    }
-}
+#define NVTE_ERROR(x)                                                          \
+  do {                                                                         \
+    throw std::runtime_error(std::string(__FILE__ ":") +                       \
+                             std::to_string(__LINE__) + " in function " +      \
+                             __func__ + ": " + x);                             \
+  } while (false)
 
-}  // namespace
+#define NVTE_CHECK(x, ...)                                                     \
+  do {                                                                         \
+    if (!(x)) {                                                                \
+      NVTE_ERROR(std::string("Assertion failed: " #x ". ") +                   \
+                 std::string(__VA_ARGS__));                                    \
+    }                                                                          \
+  } while (false)
 
-#define NVTE_CHECK_CUDA(ans) { check_cuda_(ans); }
+#define NVTE_CHECK_CUDA(status)                                                \
+  do {                                                                         \
+    if (status != cudaSuccess) {                                               \
+      NVTE_ERROR("CUDA Error: " + std::string(cudaGetErrorString(status)));    \
+    }                                                                          \
+  } while (false)
 
-#define NVTE_CHECK_CUBLAS(ans) { check_cublas_(ans); }
+#define NVTE_CHECK_CUBLAS(status)                                              \
+  do {                                                                         \
+    if (status != CUBLAS_STATUS_SUCCESS) {                                     \
+      std::string message;                                                     \
+      message.reserve(1024);                                                   \
+      message += "CUBLAS Error: ";                                             \
+      message += cublasGetStatusString(status);                                \
+      message += (". "                                                         \
+                  "For more information, increase CUBLASLT_LOG_LEVEL, "        \
+                  "by setting CUBLASLT_LOG_LEVEL=N [0-5] "                     \
+                  "in the environment.");                                      \
+      NVTE_ERROR(message);                                                     \
+    }                                                                          \
+  } while (false)
 
-#define NVTE_CHECK_CUDNN(ans) { check_cudnn_(ans); }
+#define NVTE_CHECK_CUDNN(status)                                               \
+  do {                                                                         \
+    if (status != CUDNN_STATUS_SUCCESS) {                                      \
+      std::string message;                                                     \
+      message.reserve(1024);                                                   \
+      message += "CUDNN Error: ";                                              \
+      message += cudnnGetErrorString(status);                                  \
+      message += (". "                                                         \
+                  "For more information, enable cuDNN error logging "          \
+                  "by setting CUDNN_LOGERR_DBG=1 and "                         \
+                  "CUDNN_LOGDEST_DBG=stderr in the environment.");             \
+      NVTE_ERROR(message);                                                     \
+    }                                                                          \
+  } while (false)
 
-#define NVTE_CHECK_NVRTC(ans) { check_nvrtc_(ans); }
+#define NVTE_CHECK_NVRTC(status)                                               \
+  do {                                                                         \
+    if (status != NVRTC_SUCCESS) {                                             \
+      NVTE_ERROR("NVRTC Error: " + std::string(nvrtcGetErrorString(status)));  \
+    }                                                                          \
+  } while (false)
 
-#endif  // TRANSFORMER_ENGINE_LOGGING_H_
+#endif // TRANSFORMER_ENGINE_LOGGING_H_
diff --git a/transformer_engine/common/include/transformer_engine/transpose.h b/transformer_engine/common/include/transformer_engine/transpose.h
@@ -146,9 +146,6 @@ void nvte_multi_cast_transpose(size_t num_tensors,
  *  - `cast_output` is the result of the cast
  *  - `transposed_output` is the transposed result of the cast.
  *
- *  Calling this function with workspace being an empty tensor will not perform the operation,
- *  but instead set the shape and type of the workspace tensor to the required values.
- *
  *  \param[in]     input               Input tensor of shape [N, H].
  *  \param[in]     geglu_input         Tensor used as input to the forward of GeGLU operation.
  *                                     Shape [N, H * 2].

diff --git a/transformer_engine/common/util/cuda_driver.h b/transformer_engine/common/util/cuda_driver.h
@@ -43,30 +43,28 @@ inline CUresult call(const char *symbol, ArgTs... args) {
 
 }  // namespace transformer_engine
 
-namespace {
-
-/*! \brief Throw exception if CUDA driver call has failed */
-inline void check_cuda_driver_(CUresult status) {
-  if (status != CUDA_SUCCESS) {
-    const char *description;
-    transformer_engine::cuda_driver::call("cuGetErrorString", &description);
-    NVTE_ERROR(transformer_engine::concat_strings("CUDA Error: ", description));
-  }
-}
-
-/*! \brief Call CUDA driver function and throw exception if it fails */
-template <typename... ArgTs>
-inline void call_and_check_cuda_driver_(const char *symbol,
-                                        ArgTs &&... args) {
-  check_cuda_driver_(transformer_engine::cuda_driver::call(symbol,
-                                                           std::forward<ArgTs>(args)...));
-}
-
-}  // namespace
-
-#define NVTE_CHECK_CUDA_DRIVER(ans) { check_cuda_driver_(ans); }
-
-#define NVTE_CALL_CHECK_CUDA_DRIVER(func, ...) \
-  { call_and_check_cuda_driver_(#func, __VA_ARGS__); }
-
-#endif  // TRANSFORMER_ENGINE_COMMON_UTIL_CUDA_DRIVER_H_
+#define NVTE_CHECK_CUDA_DRIVER(status)                                         \
+  do {                                                                         \
+    if (status != CUDA_SUCCESS) {                                              \
+      const char *description;                                                 \
+      transformer_engine::cuda_driver::call("cuGetErrorString", status,        \
+                                            &description);                     \
+      NVTE_ERROR(                                                              \
+          transformer_engine::concat_strings("CUDA Error: ", description));    \
+    }                                                                          \
+  } while (false)
+
+#define NVTE_CALL_CHECK_CUDA_DRIVER(symbol, ...)                               \
+  do {                                                                         \
+    CUresult status =                                                          \
+        transformer_engine::cuda_driver::call(#symbol, __VA_ARGS__);           \
+    if (status != CUDA_SUCCESS) {                                              \
+      const char *description;                                                 \
+      transformer_engine::cuda_driver::call("cuGetErrorString", status,        \
+                                            &description);                     \
+      NVTE_ERROR(                                                              \
+          transformer_engine::concat_strings(#symbol": ", description));       \
+    }                                                                          \
+  } while (false)
+
+#endif // TRANSFORMER_ENGINE_COMMON_UTIL_CUDA_DRIVER_H_