From 14a9ab6b87c8edde81e6d80b04dbed6e8597e5b2 Mon Sep 17 00:00:00 2001 From: Chenfan Date: Wed, 6 Aug 2025 13:44:48 +0800 Subject: [PATCH 1/3] Codegen fix --- src/relax/backend/contrib/codegen_c/codegen_c.h | 15 ++++++--------- src/relax/backend/contrib/cutlass/codegen.cc | 1 + 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/relax/backend/contrib/codegen_c/codegen_c.h b/src/relax/backend/contrib/codegen_c/codegen_c.h index 795b691dec4c..aa03a8bc1a49 100644 --- a/src/relax/backend/contrib/codegen_c/codegen_c.h +++ b/src/relax/backend/contrib/codegen_c/codegen_c.h @@ -85,29 +85,26 @@ class CodegenCBase { code_stream_ << "#endif\n"; code_stream_ << "TVM_DLL int32_t "; code_stream_ << func_name << "("; - code_stream_ << "TVMValue* args, "; - code_stream_ << "int* type_code, "; - code_stream_ << "int num_args, "; - code_stream_ << "TVMValue* out_value, "; - code_stream_ << "int* out_type_code) {\n"; + code_stream_ << "tvm::ffi::PackedArgs args, "; + code_stream_ << "tvm::ffi::AnyView* out_value) {\n"; } /*! - * \brief Adds a line to convert TVMValue args to DLTensors + * \brief Adds a line to convert tvm::ffi::PackedArgs args to DLTensors */ void PrintArgToData(int idx) { PrintIndents(); code_stream_ << "DLTensor* arg" << idx << " = "; - code_stream_ << "(DLTensor*)(((TVMValue*)args)[" << idx << "].v_handle);\n"; + code_stream_ << "(DLTensor*)(args[" << idx << "].cast<DLTensor*>());\n"; } /*! - * \brief Adds a line to convert TVMValue rets to DLTensors + * \brief Adds a line to convert tvm::ffi::PackedArgs rets to DLTensors */ void PrintRetToData(int idx) { PrintIndents(); code_stream_ << "DLTensor* ret" << idx << " = "; - code_stream_ << "(DLTensor*)(((TVMValue*)args)[" << idx << "].v_handle);\n"; + code_stream_ << "(DLTensor*)(args[" << idx << "].cast<DLTensor*>());\n"; } /*! 
diff --git a/src/relax/backend/contrib/cutlass/codegen.cc b/src/relax/backend/contrib/cutlass/codegen.cc index b6307af0237b..b7b9404a5fb1 100644 --- a/src/relax/backend/contrib/cutlass/codegen.cc +++ b/src/relax/backend/contrib/cutlass/codegen.cc @@ -67,6 +67,7 @@ runtime::Module Finalize(const std::string& code, const Array& func_name default_headers << "#include \n"; default_headers << "#include \n"; default_headers << "#include \n"; + default_headers << "#include \n"; const auto pf = tvm::ffi::Function::GetGlobalRequired("runtime.CSourceModuleCreate"); VLOG(1) << "Generated CUTLASS code:" << std::endl << code; From d58c14059eb2096d72c1c94a81b270e540f128fb Mon Sep 17 00:00:00 2001 From: Chenfan Date: Sat, 9 Aug 2025 22:34:56 +0800 Subject: [PATCH 2/3] Update --- python/tvm/contrib/cutlass/conv2d_operation.py | 8 ++++---- python/tvm/contrib/cutlass/gemm_operation.py | 6 +++--- src/relax/backend/contrib/codegen_c/codegen_c.h | 2 +- src/relax/backend/contrib/cutlass/codegen.cc | 1 - 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/python/tvm/contrib/cutlass/conv2d_operation.py b/python/tvm/contrib/cutlass/conv2d_operation.py index a37e46f4046c..361bcb54e532 100644 --- a/python/tvm/contrib/cutlass/conv2d_operation.py +++ b/python/tvm/contrib/cutlass/conv2d_operation.py @@ -418,17 +418,17 @@ def instantiate_conv2d_template(attrs): size_t workspace_size = conv2d_op.get_workspace_size(arguments); cutlass::device_memory::allocation<uint8_t> workspace(workspace_size); cutlass::Status status = conv2d_op.can_implement(arguments); - CHECK(status == cutlass::Status::kSuccess); + TVM_FFI_ICHECK(status == cutlass::Status::kSuccess); ${split_k_reset} status = conv2d_op.initialize(arguments, workspace.get()); - CHECK(status == cutlass::Status::kSuccess); + TVM_FFI_ICHECK(status == cutlass::Status::kSuccess); ${split_k_update} auto func = tvm::ffi::Function::GetGlobalRequired("runtime.get_cuda_stream"); cudaStream_t stream = static_cast<cudaStream_t>(func().cast<void*>()); status = 
conv2d_op(stream); - CHECK(status == cutlass::Status::kSuccess); + TVM_FFI_ICHECK(status == cutlass::Status::kSuccess); ${split_k_reduction} """ @@ -439,7 +439,7 @@ def instantiate_conv2d_template(attrs): split_k_update = """ arguments.output_op = {ElementComputeEpilogue(1), ElementComputeEpilogue(0)}; status = conv2d_op.update(arguments, workspace.get()); - CHECK(status == cutlass::Status::kSuccess); + TVM_FFI_ICHECK(status == cutlass::Status::kSuccess); """ split_k_reduction = """ diff --git a/python/tvm/contrib/cutlass/gemm_operation.py b/python/tvm/contrib/cutlass/gemm_operation.py index 46b68c29ee63..65dc5da772c1 100644 --- a/python/tvm/contrib/cutlass/gemm_operation.py +++ b/python/tvm/contrib/cutlass/gemm_operation.py @@ -341,15 +341,15 @@ def instantiate_gemm_template(attrs): cutlass::device_memory::allocation<uint8_t> workspace(workspace_size); ${kernel} gemm_op; cutlass::Status status = gemm_op.can_implement(arguments); - CHECK(status == cutlass::Status::kSuccess); + TVM_FFI_ICHECK(status == cutlass::Status::kSuccess); status = gemm_op.initialize(arguments, workspace.get()); - CHECK(status == cutlass::Status::kSuccess); + TVM_FFI_ICHECK(status == cutlass::Status::kSuccess); auto func = tvm::ffi::Function::GetGlobalRequired("runtime.get_cuda_stream"); cudaStream_t stream = static_cast<cudaStream_t>(func().cast<void*>()); status = gemm_op(stream); - CHECK(status == cutlass::Status::kSuccess); + TVM_FFI_ICHECK(status == cutlass::Status::kSuccess); """ op_type = attrs["op_type"] has_bias = "bias" in op_type diff --git a/src/relax/backend/contrib/codegen_c/codegen_c.h b/src/relax/backend/contrib/codegen_c/codegen_c.h index aa03a8bc1a49..48e3dc1d36fb 100644 --- a/src/relax/backend/contrib/codegen_c/codegen_c.h +++ b/src/relax/backend/contrib/codegen_c/codegen_c.h @@ -83,7 +83,7 @@ class CodegenCBase { code_stream_ << "#ifdef __cplusplus\n"; code_stream_ << "extern \"C\" {\n"; code_stream_ << "#endif\n"; - code_stream_ << "TVM_DLL int32_t "; + code_stream_ << "int32_t "; code_stream_ << 
func_name << "("; code_stream_ << "tvm::ffi::PackedArgs args, "; code_stream_ << "tvm::ffi::AnyView* out_value) {\n"; diff --git a/src/relax/backend/contrib/cutlass/codegen.cc b/src/relax/backend/contrib/cutlass/codegen.cc index b7b9404a5fb1..b6307af0237b 100644 --- a/src/relax/backend/contrib/cutlass/codegen.cc +++ b/src/relax/backend/contrib/cutlass/codegen.cc @@ -67,7 +67,6 @@ runtime::Module Finalize(const std::string& code, const Array& func_name default_headers << "#include \n"; default_headers << "#include \n"; default_headers << "#include \n"; - default_headers << "#include \n"; const auto pf = tvm::ffi::Function::GetGlobalRequired("runtime.CSourceModuleCreate"); VLOG(1) << "Generated CUTLASS code:" << std::endl << code; From 9be2638bfd4ba899af62f1ce72faa38aad71d6f2 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 9 Aug 2025 13:58:37 -0400 Subject: [PATCH 3/3] Update codegen_c.h --- src/relax/backend/contrib/codegen_c/codegen_c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/relax/backend/contrib/codegen_c/codegen_c.h b/src/relax/backend/contrib/codegen_c/codegen_c.h index 48e3dc1d36fb..7f04091fc178 100644 --- a/src/relax/backend/contrib/codegen_c/codegen_c.h +++ b/src/relax/backend/contrib/codegen_c/codegen_c.h @@ -83,7 +83,7 @@ class CodegenCBase { code_stream_ << "#ifdef __cplusplus\n"; code_stream_ << "extern \"C\" {\n"; code_stream_ << "#endif\n"; - code_stream_ << "int32_t "; + code_stream_ << "TVM_FFI_DLL_EXPORT int32_t "; code_stream_ << func_name << "("; code_stream_ << "tvm::ffi::PackedArgs args, "; code_stream_ << "tvm::ffi::AnyView* out_value) {\n";