From bcb6998bb25fd54fed4ce082c09bff2985e70460 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sat, 19 Apr 2025 11:43:00 -0400 Subject: [PATCH 01/18] tests for add'l modules --- .../relax/test_from_exported_to_cuda.py | 340 +++++++++++++++++- 1 file changed, 336 insertions(+), 4 deletions(-) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index e92855885e35..01dbb0efc337 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -24,6 +24,7 @@ from torch.export import export from tvm.relax.frontend.torch import from_exported_program from torch.nn import Softmax, Upsample +import torch.nn.functional as F def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev): @@ -73,9 +74,7 @@ def forward(self, x): return torch.full((2, 3), 3.141592) torch_module = FullModel().eval() - raw_data = np.random.rand(3, 3).astype("float32") - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) @@ -91,7 +90,6 @@ def forward(self, x): torch_module = FullLike().eval() raw_data = np.random.rand(2, 3).astype("float32") - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) @@ -105,9 +103,32 @@ def forward(self, x): return torch.ones((2, 3)) torch_module = FullModel().eval() - raw_data = np.random.rand(1, 1).astype("float32") + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + +@tvm.testing.parametrize_targets("cuda") +def test_sort(target, dev): + raw_data = np.array([[4, 1, 13], [-30, 1, 3], [4, 0, 10]]).astype("float32") + + # Test values + class SortModelValues(nn.Module): + def forward(self, x): + A, _ = torch.sort(x, dim=0, descending=True) + B, _ = torch.sort(x, dim=1, descending=False) + return A + B + + torch_module = SortModelValues().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + # Test indices + class SortModelIndices(nn.Module): + def forward(self, x): + _, A = torch.sort(x, dim=0, descending=True) + _, B = torch.sort(x, dim=1, descending=False) + return A + B + + torch_module = SortModelIndices().eval() assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) @@ -588,5 +609,316 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) +@tvm.testing.parametrize_targets("cuda") +def test_leakyrelu_module(target, dev): + class LeakyReLUModule(nn.Module): + def __init__(self): + super().__init__() + self.act = nn.LeakyReLU(negative_slope=0.1) + + def forward(self, x): + return self.act(x) + + raw_data = np.random.randn(2, 3).astype(np.float32) + torch_module = LeakyReLUModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_log_softmax_module(target, dev): + class LogSoftmaxModule(nn.Module): + def __init__(self): + super().__init__() + self.logsoftmax = nn.LogSoftmax(dim=1) + + def forward(self, x): + return self.logsoftmax(x) + + raw_data = np.random.randn(4, 5).astype(np.float32) + torch_module = LogSoftmaxModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_softmax_module(target, dev): + class SoftmaxModule(nn.Module): + def __init__(self): + super().__init__() + self.softmax = nn.Softmax(dim=1) + + 
def forward(self, x): + return self.softmax(x) + + raw_data = np.random.randn(4, 5).astype(np.float32) + torch_module = SoftmaxModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_adaptive_avg_pool2d_module(target, dev): + class AdaptiveAvgPool2dModule(nn.Module): + def __init__(self): + super().__init__() + self.pool = nn.AdaptiveAvgPool2d((1, 1)) + + def forward(self, x): + return self.pool(x) + + raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) + torch_module = AdaptiveAvgPool2dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_avg_pool2d_module(target, dev): + class AvgPool2dModule(nn.Module): + def __init__(self): + super().__init__() + self.pool = nn.AvgPool2d(kernel_size=2) + + def forward(self, x): + return self.pool(x) + + raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) + torch_module = AvgPool2dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_conv1d_module(target, dev): + class Conv1dModule(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv1d(in_channels=3, out_channels=4, kernel_size=3) + + def forward(self, x): + return self.conv(x) + + raw_data = np.random.randn(2, 3, 10).astype(np.float32) + torch_module = Conv1dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_conv2d_module(target, dev): + class Conv2dModule(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv2d(in_channels=3, out_channels=4, kernel_size=3) + + def forward(self, x): + return self.conv(x) + + raw_data = np.random.randn(2, 3, 10, 10).astype(np.float32) + torch_module = Conv2dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_conv3d_module(target, dev): + class Conv3dModule(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv3d(in_channels=2, out_channels=3, kernel_size=3) + + def forward(self, x): + return self.conv(x) + + raw_data = np.random.randn(1, 2, 8, 8, 8).astype(np.float32) + torch_module = Conv3dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_group_norm_module(target, dev): + class GroupNormModule(nn.Module): + def __init__(self): + super().__init__() + self.gn = nn.GroupNorm(num_groups=1, num_channels=4) + + def forward(self, x): + return self.gn(x) + + raw_data = np.random.randn(2, 4, 8, 8).astype(np.float32) + torch_module = GroupNormModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_layer_norm_module(target, dev): + class LayerNormModule(nn.Module): + def __init__(self): + super().__init__() + self.ln = nn.LayerNorm(normalized_shape=8) + + def forward(self, x): + return self.ln(x) + + raw_data = np.random.randn(2, 4, 8).astype(np.float32) + torch_module = LayerNormModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_linear_module(target, dev): + 
class LinearModule(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(10, 5) + + def forward(self, x): + return self.linear(x) + + raw_data = np.random.randn(4, 10).astype(np.float32) + torch_module = LinearModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_max_pool2d_module(target, dev): + class MaxPool2dModule(nn.Module): + def __init__(self): + super().__init__() + self.pool = nn.MaxPool2d(kernel_size=2) + + def forward(self, x): + return self.pool(x) + + raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) + torch_module = MaxPool2dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_embedding_module(target, dev): + class EmbeddingModule(nn.Module): + def __init__(self): + super().__init__() + self.embed = nn.Embedding(num_embeddings=10, embedding_dim=3) + + def forward(self, x): + return self.embed(x) + + raw_data = np.random.randint(0, 10, (2, 4)).astype(np.int64) + torch_module = EmbeddingModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_flatten_module(target, dev): + class FlattenModule(nn.Module): + def __init__(self): + super().__init__() + self.flatten = nn.Flatten() + + def forward(self, x): + return self.flatten(x) + + raw_data = np.random.randn(2, 3, 4, 5).astype(np.float32) + torch_module = FlattenModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_numel(target, dev): + class NumelModule(nn.Module): + def forward(self, x): + return torch.tensor(x.numel()) + + raw_data = np.random.randn(2, 3, 4).astype(np.float32) + torch_module = NumelModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_size(target, dev): + class SizeModule(nn.Module): + def forward(self, x): + return torch.tensor(x.size(0)) + + raw_data = np.random.randn(5, 4).astype(np.float32) + torch_module = SizeModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_tensor(target, dev): + class TensorModule(nn.Module): + def forward(self, x): + return torch.tensor([1, 2, 3]) + + raw_data = np.zeros((1,)).astype(np.float32) + torch_module = TensorModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_type(target, dev): + class TypeModule(nn.Module): + def forward(self, x): + return x.type(torch.float16) + + raw_data = np.random.randn(2, 3).astype(np.float32) + torch_module = TypeModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_float(target, dev): + class FloatModule(nn.Module): + def forward(self, x): + return x.float() + + raw_data = np.random.randn(2, 3).astype(np.float32) + torch_module = FloatModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_half(target, dev): + class HalfModule(nn.Module): + def forward(self, x): + return 
x.half() + + raw_data = np.random.randn(2, 3).astype(np.float32) + torch_module = HalfModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_getattr(target, dev): + class GetAttrModule(nn.Module): + def forward(self, x): + # Use getattr to call the ndimension method. + return torch.tensor(getattr(x, "ndimension")()) + + raw_data = np.random.randn(2, 3, 4).astype(np.float32) + torch_module = GetAttrModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_sym_size_int(target, dev): + class SymSizeIntModule(nn.Module): + def forward(self, x): + return torch.tensor(x.shape[1]) + + raw_data = np.random.randn(2, 3, 4).astype(np.float32) + torch_module = SymSizeIntModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_interpolate(target, dev): + class InterpolateModule(nn.Module): + def forward(self, x): + # Upsample to a fixed size. + return F.interpolate(x, size=(16, 16), mode="nearest") + + raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) + torch_module = InterpolateModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + if __name__ == "__main__": tvm.testing.main() From 751121bd6f1ed39768be0ccff574227f72b2f221 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sat, 19 Apr 2025 11:54:04 -0400 Subject: [PATCH 02/18] no sort --- .../relax/test_from_exported_to_cuda.py | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index 01dbb0efc337..562f9b020adc 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -107,31 +107,6 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) -@tvm.testing.parametrize_targets("cuda") -def test_sort(target, dev): - raw_data = np.array([[4, 1, 13], [-30, 1, 3], [4, 0, 10]]).astype("float32") - - # Test values - class SortModelValues(nn.Module): - def forward(self, x): - A, _ = torch.sort(x, dim=0, descending=True) - B, _ = torch.sort(x, dim=1, descending=False) - return A + B - - torch_module = SortModelValues().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - # Test indices - class SortModelIndices(nn.Module): - def forward(self, x): - _, A = torch.sort(x, dim=0, descending=True) - _, B = torch.sort(x, dim=1, descending=False) - return A + B - - torch_module = SortModelIndices().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - @tvm.testing.parametrize_targets("cuda") def test_tensor_clamp(target, dev): class ClampBothTensor(torch.nn.Module): From 175ac346777aac26eea49db623c162f13113f31d Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sat, 19 Apr 2025 14:41:21 -0400 Subject: [PATCH 03/18] cross entropy test passes --- python/tvm/dlight/gpu/general_reduction.py | 23 +++++++++++++++++++ .../torch/base_fx_graph_translator.py | 22 +++++++++++++++++- .../torch/exported_program_translator.py | 11 ++++++++- .../tvm/relax/frontend/torch/fx_translator.py | 17 ++++++-------- .../relax/test_from_exported_to_cuda.py | 13 +++++++++++ 5 files changed, 74 insertions(+), 12 
deletions(-) diff --git a/python/tvm/dlight/gpu/general_reduction.py b/python/tvm/dlight/gpu/general_reduction.py index d3979ce0e4c3..d61ec624411c 100644 --- a/python/tvm/dlight/gpu/general_reduction.py +++ b/python/tvm/dlight/gpu/general_reduction.py @@ -60,6 +60,29 @@ def apply( # pylint: disable=too-many-locals # Align the number of block iters of the last block. num_last_block_iter = len(block_infos[-1].dom_kind()) + + # If the last block is a scalar value, there is nothing left to + # tile/parallelise, and the code below would crash because `iters` is + # an empty tuple. + # Special case: the whole reduction collapsed to a *scalar*. + # Add a unit thread loop so the final write happens inside a valid + # GPU thread environment and passes `VerifyMemory`. + # ------------------------------------------------------------------ + if num_last_block_iter == 0: + # Put every block (both the running reductions and the final + # scalar write) inside a trivial GPU thread to satisfy + # VerifyMemory. The very first block gets a `blockIdx.x` + # wrapper so that kernels still have a unique block scope. + for i, info in enumerate(block_infos): + loop_rv = sch.add_unit_loop(info.block_rv) # extent = 1 + if i == 0: + sch.bind(loop_rv, "blockIdx.x") + else: + sch.bind(loop_rv, "threadIdx.x") + + return sch + + if num_last_block_iter < len(dom_kind): def f_layout_mapping(*iters): diff --git a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py index ae4c918900ec..c3cafb32d123 100644 --- a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py +++ b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py @@ -782,6 +782,25 @@ def _conv3d(self, node: fx.Node) -> relax.Var: groups=groups, ) + def _cross_entropy_loss( + self, + preds: relax.Expr, + targets: relax.Expr, + weights: Optional[relax.Expr], + reduction: str, + ignore_index: int, + ) -> relax.Expr: + log_probs = relax.op.nn.log_softmax(preds) + return self.block_builder.emit( + relax.op.nn.nll_loss( + log_probs, + targets, + weights, + reduction, + ignore_index, + ) + ) + def _einsum(self, node: fx.Node) -> relax.Var: import torch # type: ignore @@ -1096,8 +1115,9 @@ def _cumsum(self, node: fx.Node) -> relax.Var: return self.block_builder.emit(relax.op.cumsum(x, dim, dtype)) - def _expand(self, node: fx.Node) -> relax.Var: args = self.retrieve_args(node) + + def _expand(self, node: fx.Node) -> relax.Var: sizes = args[1:] if len(args) > 2 else args[1] broadcast_shape, in_shape = [], self.shape_of(args[0]) for idx, i in enumerate(sizes): diff --git a/python/tvm/relax/frontend/torch/exported_program_translator.py b/python/tvm/relax/frontend/torch/exported_program_translator.py index 932607287571..aea8196aefcf 100644 --- a/python/tvm/relax/frontend/torch/exported_program_translator.py +++ b/python/tvm/relax/frontend/torch/exported_program_translator.py @@ -66,7 +66,7 @@ def _reciprocal(self, node: fx.Node) -> relax.Var: ########## Neural Network ########## - def _batch_norm(self, node: fx.Node, training) -> relax.Var: + def _batch_norm(self, node: fx.Node, training: bool) -> relax.Var: import numpy as np x = self.env[node.args[0]] @@ -113,6 +113,14 @@ def _batch_norm_legit_no_training(self, node: fx.Node) -> relax.Var: training = False return self._batch_norm(node, training) + def _cross_entropy_default(self, node: fx.Node) -> relax.Expr: + preds = self.env[node.args[0]] + targets = self.env[node.args[1]] + weight = self.env.get(node.args[2], None) if len(node.args) > 2 else 
None + reduction = node.kwargs.get("reduction", "mean") + ignore_index = node.kwargs.get("ignore_index", -100) + return self._cross_entropy_loss(preds, targets, weight, reduction, ignore_index) + def _group_norm(self, node: fx.Node) -> relax.Var: x = self.env[node.args[0]] num_groups = node.args[1] @@ -382,6 +390,7 @@ def create_convert_map( "conv1d.default": self._conv1d, "conv2d.default": self._conv2d, "conv3d.default": self._conv3d, + "cross_entropy_loss.default": self._cross_entropy_default, "einsum.default": self._einsum, "embedding.default": lambda node: self._embedding_impl( self.env[node.args[1]], self.env[node.args[0]] diff --git a/python/tvm/relax/frontend/torch/fx_translator.py b/python/tvm/relax/frontend/torch/fx_translator.py index 5a34befb9296..5a07fb7c9707 100644 --- a/python/tvm/relax/frontend/torch/fx_translator.py +++ b/python/tvm/relax/frontend/torch/fx_translator.py @@ -260,12 +260,7 @@ def _cross_entropy(self, node: fx.Node) -> relax.Expr: weights = self.env.get(node.kwargs["weight"], None) reduction = node.kwargs["reduction"] ignore_index = node.kwargs["ignore_index"] - - return self.block_builder.emit( - relax.op.nn.nll_loss( - relax.op.nn.log_softmax(preds), targets, weights, reduction, ignore_index - ) - ) + return self._cross_entropy_loss(preds, targets, weights, reduction, ignore_index) def _cross_entropy_module(self, node: fx.Node) -> relax.Expr: preds = self.env[node.args[0]] @@ -282,10 +277,12 @@ def _cross_entropy_module(self, node: fx.Node) -> relax.Expr: reduction = module.reduction ignore_index = module.ignore_index - return self.block_builder.emit( - relax.op.nn.nll_loss( - relax.op.nn.log_softmax(preds), targets, weights, reduction, ignore_index - ) + return self._cross_entropy_loss( + preds, + targets, + weights, + reduction, + ignore_index, ) def _embedding_module(self, node: fx.Node) -> relax.Var: diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index 562f9b020adc..7c753d4c06c2 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -703,6 +703,18 @@ def forward(self, x): torch_module = Conv3dModule().eval() assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) +@tvm.testing.parametrize_targets("cuda") +def test_cross_entropy_module(target, dev): + class CrossEntropyModule(nn.Module): + def __init__(self): + super().__init__() + self.criterion = nn.CrossEntropyLoss() + self.register_buffer("target", torch.tensor([0, 1, 2, 1])) + def forward(self, x): + return self.criterion(x, self.target) + raw_data = np.random.randn(4, 3).astype(np.float32) + torch_module = CrossEntropyModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) @tvm.testing.parametrize_targets("cuda") def test_group_norm_module(target, dev): @@ -895,5 +907,6 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + if __name__ == "__main__": tvm.testing.main() From 03d04bea8c24604eaeaf5a76b7a24970fb0a653a Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sat, 19 Apr 2025 14:46:13 -0400 Subject: [PATCH 04/18] cleanup --- python/tvm/dlight/gpu/general_reduction.py | 18 +++++++----------- .../python/relax/test_from_exported_to_cuda.py | 7 +++++-- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/python/tvm/dlight/gpu/general_reduction.py b/python/tvm/dlight/gpu/general_reduction.py index d61ec624411c..328a3eac914d 
100644 --- a/python/tvm/dlight/gpu/general_reduction.py +++ b/python/tvm/dlight/gpu/general_reduction.py @@ -61,20 +61,17 @@ def apply( # pylint: disable=too-many-locals # Align the number of block iters of the last block. num_last_block_iter = len(block_infos[-1].dom_kind()) - # If the last block is a scalar value, there is nothing left to - # tile/parallelise, and the code below would crash because `iters` is - # an empty tuple. - # Special case: the whole reduction collapsed to a *scalar*. + # If the last block is a scalar value, there is nothing left to + # tile/parallelise, and `iters` is an empty tuple. # Add a unit thread loop so the final write happens inside a valid - # GPU thread environment and passes `VerifyMemory`. - # ------------------------------------------------------------------ + # GPU thread environment. if num_last_block_iter == 0: # Put every block (both the running reductions and the final - # scalar write) inside a trivial GPU thread to satisfy - # VerifyMemory. The very first block gets a `blockIdx.x` - # wrapper so that kernels still have a unique block scope. + # scalar write) inside a trivial GPU thread. The very first block + # gets a `blockIdx.x` wrapper so that kernels still have a unique + # block scope. for i, info in enumerate(block_infos): - loop_rv = sch.add_unit_loop(info.block_rv) # extent = 1 + loop_rv = sch.add_unit_loop(info.block_rv) if i == 0: sch.bind(loop_rv, "blockIdx.x") else: @@ -82,7 +79,6 @@ def apply( # pylint: disable=too-many-locals return sch - if num_last_block_iter < len(dom_kind): def f_layout_mapping(*iters): diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index 7c753d4c06c2..bd9869a9e4cf 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -703,19 +703,23 @@ def forward(self, x): torch_module = Conv3dModule().eval() assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + @tvm.testing.parametrize_targets("cuda") def test_cross_entropy_module(target, dev): class CrossEntropyModule(nn.Module): def __init__(self): super().__init__() self.criterion = nn.CrossEntropyLoss() - self.register_buffer("target", torch.tensor([0, 1, 2, 1])) + self.target = torch.tensor([0, 1, 2, 1]) + def forward(self, x): return self.criterion(x, self.target) + raw_data = np.random.randn(4, 3).astype(np.float32) torch_module = CrossEntropyModule().eval() assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + @tvm.testing.parametrize_targets("cuda") def test_group_norm_module(target, dev): class GroupNormModule(nn.Module): @@ -907,6 +911,5 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - if __name__ == "__main__": tvm.testing.main() From cf77abc19f4ad5af0b81ee0eba4b8a024a9f1d2a Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sat, 19 Apr 2025 17:47:39 -0400 Subject: [PATCH 05/18] fix expand --- python/tvm/relax/frontend/torch/base_fx_graph_translator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py index c3cafb32d123..6348980b226d 100644 --- a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py +++ b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py @@ -1115,9 +1115,8 @@ def _cumsum(self, node: fx.Node) -> relax.Var: return 
self.block_builder.emit(relax.op.cumsum(x, dim, dtype)) - args = self.retrieve_args(node) - def _expand(self, node: fx.Node) -> relax.Var: + args = self.retrieve_args(node) sizes = args[1:] if len(args) > 2 else args[1] broadcast_shape, in_shape = [], self.shape_of(args[0]) for idx, i in enumerate(sizes): From f503d6a8c6b4ade1f9b7251dfad72ce2fa88b382 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 21 Apr 2025 12:25:16 -0400 Subject: [PATCH 06/18] remove new e2e tests --- .../relax/test_from_exported_to_cuda.py | 340 ------------------ 1 file changed, 340 deletions(-) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index fa25f58f24ba..de0d8c726317 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -685,20 +685,6 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) -@tvm.testing.parametrize_targets("cuda") -def test_leakyrelu_module(target, dev): - class LeakyReLUModule(nn.Module): - def __init__(self): - super().__init__() - self.act = nn.LeakyReLU(negative_slope=0.1) - - def forward(self, x): - return self.act(x) - - raw_data = np.random.randn(2, 3).astype(np.float32) - torch_module = LeakyReLUModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - def test_mul(target, dev): class MulModule(nn.Module): def __init__(self): @@ -713,331 +699,5 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) -@tvm.testing.parametrize_targets("cuda") -def test_log_softmax_module(target, dev): - class LogSoftmaxModule(nn.Module): - def __init__(self): - super().__init__() - self.logsoftmax = nn.LogSoftmax(dim=1) - - def forward(self, x): - return self.logsoftmax(x) - - raw_data = np.random.randn(4, 5).astype(np.float32) - torch_module = LogSoftmaxModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_softmax_module(target, dev): - class SoftmaxModule(nn.Module): - def __init__(self): - super().__init__() - self.softmax = nn.Softmax(dim=1) - - def forward(self, x): - return self.softmax(x) - - raw_data = np.random.randn(4, 5).astype(np.float32) - torch_module = SoftmaxModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_adaptive_avg_pool2d_module(target, dev): - class AdaptiveAvgPool2dModule(nn.Module): - def __init__(self): - super().__init__() - self.pool = nn.AdaptiveAvgPool2d((1, 1)) - - def forward(self, x): - return self.pool(x) - - raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) - torch_module = AdaptiveAvgPool2dModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_avg_pool2d_module(target, dev): - class AvgPool2dModule(nn.Module): - def __init__(self): - super().__init__() - self.pool = nn.AvgPool2d(kernel_size=2) - - def forward(self, x): - return self.pool(x) - - raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) - torch_module = AvgPool2dModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_conv1d_module(target, dev): - class Conv1dModule(nn.Module): - def 
__init__(self): - super().__init__() - self.conv = nn.Conv1d(in_channels=3, out_channels=4, kernel_size=3) - - def forward(self, x): - return self.conv(x) - - raw_data = np.random.randn(2, 3, 10).astype(np.float32) - torch_module = Conv1dModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_conv2d_module(target, dev): - class Conv2dModule(nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.Conv2d(in_channels=3, out_channels=4, kernel_size=3) - - def forward(self, x): - return self.conv(x) - - raw_data = np.random.randn(2, 3, 10, 10).astype(np.float32) - torch_module = Conv2dModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_conv3d_module(target, dev): - class Conv3dModule(nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.Conv3d(in_channels=2, out_channels=3, kernel_size=3) - - def forward(self, x): - return self.conv(x) - - raw_data = np.random.randn(1, 2, 8, 8, 8).astype(np.float32) - torch_module = Conv3dModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_cross_entropy_module(target, dev): - class CrossEntropyModule(nn.Module): - def __init__(self): - super().__init__() - self.criterion = nn.CrossEntropyLoss() - self.target = torch.tensor([0, 1, 2, 1]) - - def forward(self, x): - return self.criterion(x, self.target) - - raw_data = np.random.randn(4, 3).astype(np.float32) - torch_module = CrossEntropyModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_group_norm_module(target, dev): - class GroupNormModule(nn.Module): - def __init__(self): - super().__init__() - self.gn = nn.GroupNorm(num_groups=1, num_channels=4) - - def forward(self, x): - return self.gn(x) - - raw_data = np.random.randn(2, 4, 8, 8).astype(np.float32) - torch_module = GroupNormModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_layer_norm_module(target, dev): - class LayerNormModule(nn.Module): - def __init__(self): - super().__init__() - self.ln = nn.LayerNorm(normalized_shape=8) - - def forward(self, x): - return self.ln(x) - - raw_data = np.random.randn(2, 4, 8).astype(np.float32) - torch_module = LayerNormModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_linear_module(target, dev): - class LinearModule(nn.Module): - def __init__(self): - super().__init__() - self.linear = nn.Linear(10, 5) - - def forward(self, x): - return self.linear(x) - - raw_data = np.random.randn(4, 10).astype(np.float32) - torch_module = LinearModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_max_pool2d_module(target, dev): - class MaxPool2dModule(nn.Module): - def __init__(self): - super().__init__() - self.pool = nn.MaxPool2d(kernel_size=2) - - def forward(self, x): - return self.pool(x) - - raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) - torch_module = MaxPool2dModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, 
torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_embedding_module(target, dev): - class EmbeddingModule(nn.Module): - def __init__(self): - super().__init__() - self.embed = nn.Embedding(num_embeddings=10, embedding_dim=3) - - def forward(self, x): - return self.embed(x) - - raw_data = np.random.randint(0, 10, (2, 4)).astype(np.int64) - torch_module = EmbeddingModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_flatten_module(target, dev): - class FlattenModule(nn.Module): - def __init__(self): - super().__init__() - self.flatten = nn.Flatten() - - def forward(self, x): - return self.flatten(x) - - raw_data = np.random.randn(2, 3, 4, 5).astype(np.float32) - torch_module = FlattenModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_numel(target, dev): - class NumelModule(nn.Module): - def forward(self, x): - return torch.tensor(x.numel()) - - raw_data = np.random.randn(2, 3, 4).astype(np.float32) - torch_module = NumelModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_size(target, dev): - class SizeModule(nn.Module): - def forward(self, x): - return torch.tensor(x.size(0)) - - raw_data = np.random.randn(5, 4).astype(np.float32) - torch_module = SizeModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_tensor(target, dev): - class TensorModule(nn.Module): - def forward(self, x): - return torch.tensor([1, 2, 3]) - - raw_data = np.zeros((1,)).astype(np.float32) - torch_module = TensorModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_type(target, dev): - class TypeModule(nn.Module): - def forward(self, x): - return x.type(torch.float16) - - raw_data = np.random.randn(2, 3).astype(np.float32) - torch_module = TypeModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_float(target, dev): - class FloatModule(nn.Module): - def forward(self, x): - return x.float() - - raw_data = np.random.randn(2, 3).astype(np.float32) - torch_module = FloatModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_half(target, dev): - class HalfModule(nn.Module): - def forward(self, x): - return x.half() - - raw_data = np.random.randn(2, 3).astype(np.float32) - torch_module = HalfModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_getattr(target, dev): - class GetAttrModule(nn.Module): - def forward(self, x): - # Use getattr to call the ndimension method. 
- return torch.tensor(getattr(x, "ndimension")()) - - raw_data = np.random.randn(2, 3, 4).astype(np.float32) - torch_module = GetAttrModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_sym_size_int(target, dev): - class SymSizeIntModule(nn.Module): - def forward(self, x): - return torch.tensor(x.shape[1]) - - raw_data = np.random.randn(2, 3, 4).astype(np.float32) - torch_module = SymSizeIntModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_interpolate(target, dev): - class InterpolateModule(nn.Module): - def forward(self, x): - # Upsample to a fixed size. - return F.interpolate(x, size=(16, 16), mode="nearest") - - raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) - torch_module = InterpolateModule().eval() -def test_concat(target, dev): - class ConcatFour(nn.Module): - def __init__(self, dim=0): - super(ConcatFour, self).__init__() - self.dim = dim - self.x2 = torch.randn(2, 3) - self.x3 = torch.randn(2, 3) - self.x4 = torch.randn(2, 3) - - def forward(self, x): - return torch.cat((x, self.x2, self.x3, self.x4), dim=self.dim) - - torch_module = ConcatFour().eval() - raw_data = np.random.rand(2, 3).astype("float32") - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - if __name__ == "__main__": tvm.testing.main() From d58301692976c04dff3d0f9feea8bb637c024aa3 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 21 Apr 2025 12:26:26 -0400 Subject: [PATCH 07/18] remove new e2e tests --- tests/python/relax/test_from_exported_to_cuda.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index de0d8c726317..16b7eb59ca94 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -698,6 +698,22 @@ def forward(self, x): raw_data = np.random.rand(2, 3).astype("float32") assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) +@tvm.testing.parametrize_targets("cuda") +def test_cross_entropy_module(target, dev): + class CrossEntropyModule(nn.Module): + def __init__(self): + super().__init__() + self.criterion = nn.CrossEntropyLoss() + self.target = torch.tensor([0, 1, 2, 1]) + + def forward(self, x): + return self.criterion(x, self.target) + + raw_data = np.random.randn(4, 3).astype(np.float32) + torch_module = CrossEntropyModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + if __name__ == "__main__": tvm.testing.main() From 0e4ca8df2bd6061b4b9b9e1e9a67938fef2a4e5b Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 21 Apr 2025 12:51:09 -0400 Subject: [PATCH 08/18] convert e2e test to unit test --- .../relax/test_from_exported_to_cuda.py | 30 ------------------ .../test_frontend_from_exported_program.py | 31 +++++++++++++++++++ 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index 16b7eb59ca94..7926b15b5e5a 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -685,35 +685,5 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) -def 
test_mul(target, dev): - class MulModule(nn.Module): - def __init__(self): - super().__init__() - self.y = torch.tensor(np.random.rand(2, 3).astype("float32")) - - def forward(self, x): - return x.mul(self.y) - - torch_module = MulModule().eval() - raw_data = np.random.rand(2, 3).astype("float32") - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - -@tvm.testing.parametrize_targets("cuda") -def test_cross_entropy_module(target, dev): - class CrossEntropyModule(nn.Module): - def __init__(self): - super().__init__() - self.criterion = nn.CrossEntropyLoss() - self.target = torch.tensor([0, 1, 2, 1]) - - def forward(self, x): - return self.criterion(x, self.target) - - raw_data = np.random.randn(4, 3).astype(np.float32) - torch_module = CrossEntropyModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - - if __name__ == "__main__": tvm.testing.main() diff --git a/tests/python/relax/test_frontend_from_exported_program.py b/tests/python/relax/test_frontend_from_exported_program.py index a386a989f00e..dfd6be580e3f 100644 --- a/tests/python/relax/test_frontend_from_exported_program.py +++ b/tests/python/relax/test_frontend_from_exported_program.py @@ -17,6 +17,7 @@ import operator import pytest import torch +from torch import nn from torch.nn import Module from torch.export import export @@ -4418,6 +4419,36 @@ def main( example_args2 = (torch.randn(5, dtype=torch.float32),) verify_model(Eye2(), example_args2, {}, Expected2) +def test_cross_entropy(): + + class CrossEntropyModule(Module): + def init(self): + super().init() + self.criterion = nn.CrossEntropyLoss() + self.target = torch.tensor([0, 1, 2, 1]) + + def forward(self, x): + return self.criterion(x, self.target) + + raw_data = np.random.randn(4, 3).astype(np.float32) + torch_module = CrossEntropyModule().eval() + + @tvm.script.ir_module + class Expected1: + @R.function + def main( + input: R.Tensor((3, 5), dtype="float32") + ) -> R.Tuple(R.Tensor((3, 5), dtype="float32")): + with R.dataflow(): + lv: R.Tensor((3, 5), dtype="float32") = R.eye(3, 5, dtype="float32") + gv: R.Tuple(R.Tensor((3, 5), dtype="float32")) = (lv,) + R.output(gv) + return gv + + example_args1 = (torch.randn(4, 3, dtype=torch.float32),) + verify_model(CrossEntropyModule(), example_args1, {}, Expected1) + + if __name__ == "__main__": tvm.testing.main() From 320ad6b23cd616d40b09f6c8bbbba99588a351cf Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 21 Apr 2025 13:18:15 -0400 Subject: [PATCH 09/18] unit test --- .../test_frontend_from_exported_program.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/python/relax/test_frontend_from_exported_program.py b/tests/python/relax/test_frontend_from_exported_program.py index dfd6be580e3f..140f71ea3818 100644 --- a/tests/python/relax/test_frontend_from_exported_program.py +++ b/tests/python/relax/test_frontend_from_exported_program.py @@ -4419,29 +4419,30 @@ def main( example_args2 = (torch.randn(5, dtype=torch.float32),) verify_model(Eye2(), example_args2, {}, Expected2) -def test_cross_entropy(): +def test_cross_entropy(): class CrossEntropyModule(Module): - def init(self): - super().init() + def __init__(self): + super().__init__() self.criterion = nn.CrossEntropyLoss() self.target = torch.tensor([0, 1, 2, 1]) def forward(self, x): return self.criterion(x, self.target) - raw_data = np.random.randn(4, 3).astype(np.float32) - torch_module = CrossEntropyModule().eval() - 
@tvm.script.ir_module class Expected1: @R.function - def main( - input: R.Tensor((3, 5), dtype="float32") - ) -> R.Tuple(R.Tensor((3, 5), dtype="float32")): + def main(x: R.Tensor((4, 3), dtype="float32")) -> R.Tuple(R.Tensor((), dtype="float32")): with R.dataflow(): - lv: R.Tensor((3, 5), dtype="float32") = R.eye(3, 5, dtype="float32") - gv: R.Tuple(R.Tensor((3, 5), dtype="float32")) = (lv,) + lv: R.Tensor((4, 3), dtype="float32") = R.nn.log_softmax(x, axis=-1) + lv1: R.Tensor((), dtype="float32") = R.nn.nll_loss( + lv, + targets=R.const([0, 1, 2, 1], dtype="int64"), + reduction="mean", + ignore_index=-100, + ) + gv: R.Tuple(R.Tensor((), dtype="float32")) = (lv1,) R.output(gv) return gv @@ -4449,6 +4450,5 @@ def main( verify_model(CrossEntropyModule(), example_args1, {}, Expected1) - if __name__ == "__main__": tvm.testing.main() From 3f8247c0e49b7215a4434dfb154702e3387d51f2 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 21 Apr 2025 13:22:47 -0400 Subject: [PATCH 10/18] restore tests --- .../relax/test_from_exported_to_cuda.py | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index 7926b15b5e5a..76a4bb203925 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -24,7 +24,6 @@ from torch.export import export from tvm.relax.frontend.torch import from_exported_program from torch.nn import Softmax, Upsample -import torch.nn.functional as F def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev): @@ -685,5 +684,38 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) +@tvm.testing.parametrize_targets("cuda") +def test_mul(target, dev): + class MulModule(nn.Module): + def __init__(self): + super().__init__() + self.y = torch.tensor(np.random.rand(2, 3).astype("float32")) + + def forward(self, x): + return x.mul(self.y) + + torch_module = MulModule().eval() + raw_data = np.random.rand(2, 3).astype("float32") + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_concat(target, dev): + class ConcatFour(nn.Module): + def __init__(self, dim=0): + super(ConcatFour, self).__init__() + self.dim = dim + self.x2 = torch.randn(2, 3) + self.x3 = torch.randn(2, 3) + self.x4 = torch.randn(2, 3) + + def forward(self, x): + return torch.cat((x, self.x2, self.x3, self.x4), dim=self.dim) + + torch_module = ConcatFour().eval() + raw_data = np.random.rand(2, 3).astype("float32") + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + if __name__ == "__main__": tvm.testing.main() From e7d6b9735f178e544d9490ab64a6d6854f2b2888 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 28 Apr 2025 10:55:01 -0400 Subject: [PATCH 11/18] move --- .../{relax => nightly/test_nnapi}/test_from_exported_to_cuda.py | 1 - 1 file changed, 1 deletion(-) rename tests/python/{relax => nightly/test_nnapi}/test_from_exported_to_cuda.py (99%) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py similarity index 99% rename from tests/python/relax/test_from_exported_to_cuda.py rename to tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py index 6bb35b50b1df..8b23e899fc64 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py 
+++ b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py @@ -1,4 +1,3 @@ -# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file From 9b2d9983ec478106ba38680c0ac29466b0046381 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 28 Apr 2025 10:56:07 -0400 Subject: [PATCH 12/18] add new tests --- .../test_nnapi/test_from_exported_to_cuda.py | 311 ++++++++++++++++++ 1 file changed, 311 insertions(+) diff --git a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py index 8b23e899fc64..f68b5a234341 100644 --- a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py +++ b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py @@ -1,3 +1,4 @@ +# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file @@ -741,5 +742,315 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) +@tvm.testing.parametrize_targets("cuda") +def test_leakyrelu_module(target, dev): + class LeakyReLUModule(nn.Module): + def __init__(self): + super().__init__() + self.act = nn.LeakyReLU(negative_slope=0.1) + + def forward(self, x): + return self.act(x) + + raw_data = np.random.randn(2, 3).astype(np.float32) + torch_module = LeakyReLUModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_log_softmax_module(target, dev): + class LogSoftmaxModule(nn.Module): + def __init__(self): + super().__init__() + self.logsoftmax = nn.LogSoftmax(dim=1) + + def forward(self, x): + return self.logsoftmax(x) + + raw_data = np.random.randn(4, 5).astype(np.float32) + torch_module = LogSoftmaxModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_softmax_module(target, dev): + class SoftmaxModule(nn.Module): + def __init__(self): + super().__init__() + self.softmax = nn.Softmax(dim=1) + + def forward(self, x): + return self.softmax(x) + + raw_data = np.random.randn(4, 5).astype(np.float32) + torch_module = SoftmaxModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_adaptive_avg_pool2d_module(target, dev): + class AdaptiveAvgPool2dModule(nn.Module): + def __init__(self): + super().__init__() + self.pool = nn.AdaptiveAvgPool2d((1, 1)) + + def forward(self, x): + return self.pool(x) + + raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) + torch_module = AdaptiveAvgPool2dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_avg_pool2d_module(target, dev): + class AvgPool2dModule(nn.Module): + def __init__(self): + super().__init__() + self.pool = nn.AvgPool2d(kernel_size=2) + + def forward(self, x): + return self.pool(x) + + raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) + torch_module = AvgPool2dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + 
+@tvm.testing.parametrize_targets("cuda") +def test_conv1d_module(target, dev): + class Conv1dModule(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv1d(in_channels=3, out_channels=4, kernel_size=3) + + def forward(self, x): + return self.conv(x) + + raw_data = np.random.randn(2, 3, 10).astype(np.float32) + torch_module = Conv1dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_conv2d_module(target, dev): + class Conv2dModule(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv2d(in_channels=3, out_channels=4, kernel_size=3) + + def forward(self, x): + return self.conv(x) + + raw_data = np.random.randn(2, 3, 10, 10).astype(np.float32) + torch_module = Conv2dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_conv3d_module(target, dev): + class Conv3dModule(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv3d(in_channels=2, out_channels=3, kernel_size=3) + + def forward(self, x): + return self.conv(x) + + raw_data = np.random.randn(1, 2, 8, 8, 8).astype(np.float32) + torch_module = Conv3dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_group_norm_module(target, dev): + class GroupNormModule(nn.Module): + def __init__(self): + super().__init__() + self.gn = nn.GroupNorm(num_groups=1, num_channels=4) + + def forward(self, x): + return self.gn(x) + + raw_data = np.random.randn(2, 4, 8, 8).astype(np.float32) + torch_module = GroupNormModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_layer_norm_module(target, dev): + class LayerNormModule(nn.Module): + def __init__(self): + super().__init__() + self.ln = nn.LayerNorm(normalized_shape=8) + + def forward(self, x): + return self.ln(x) + + raw_data = np.random.randn(2, 4, 8).astype(np.float32) + torch_module = LayerNormModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_linear_module(target, dev): + class LinearModule(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(10, 5) + + def forward(self, x): + return self.linear(x) + + raw_data = np.random.randn(4, 10).astype(np.float32) + torch_module = LinearModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_max_pool2d_module(target, dev): + class MaxPool2dModule(nn.Module): + def __init__(self): + super().__init__() + self.pool = nn.MaxPool2d(kernel_size=2) + + def forward(self, x): + return self.pool(x) + + raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) + torch_module = MaxPool2dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_embedding_module(target, dev): + class EmbeddingModule(nn.Module): + def __init__(self): + super().__init__() + self.embed = nn.Embedding(num_embeddings=10, embedding_dim=3) + + def forward(self, x): + return self.embed(x) + + raw_data = np.random.randint(0, 10, (2, 4)).astype(np.int64) + torch_module = 
EmbeddingModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_flatten_module(target, dev): + class FlattenModule(nn.Module): + def __init__(self): + super().__init__() + self.flatten = nn.Flatten() + + def forward(self, x): + return self.flatten(x) + + raw_data = np.random.randn(2, 3, 4, 5).astype(np.float32) + torch_module = FlattenModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_numel(target, dev): + class NumelModule(nn.Module): + def forward(self, x): + return torch.tensor(x.numel()) + + raw_data = np.random.randn(2, 3, 4).astype(np.float32) + torch_module = NumelModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_size(target, dev): + class SizeModule(nn.Module): + def forward(self, x): + return torch.tensor(x.size(0)) + + raw_data = np.random.randn(5, 4).astype(np.float32) + torch_module = SizeModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_tensor(target, dev): + class TensorModule(nn.Module): + def forward(self, x): + return torch.tensor([1, 2, 3]) + + raw_data = np.zeros((1,)).astype(np.float32) + torch_module = TensorModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_type(target, dev): + class TypeModule(nn.Module): + def forward(self, x): + return x.type(torch.float16) + + raw_data = np.random.randn(2, 3).astype(np.float32) + torch_module = TypeModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_float(target, dev): + class FloatModule(nn.Module): + def forward(self, x): + return x.float() + + raw_data = np.random.randn(2, 3).astype(np.float32) + torch_module = FloatModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_half(target, dev): + class HalfModule(nn.Module): + def forward(self, x): + return x.half() + + raw_data = np.random.randn(2, 3).astype(np.float32) + torch_module = HalfModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_getattr(target, dev): + class GetAttrModule(nn.Module): + def forward(self, x): + # Use getattr to call the ndimension method. + return torch.tensor(getattr(x, "ndimension")()) + + raw_data = np.random.randn(2, 3, 4).astype(np.float32) + torch_module = GetAttrModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_sym_size_int(target, dev): + class SymSizeIntModule(nn.Module): + def forward(self, x): + return torch.tensor(x.shape[1]) + + raw_data = np.random.randn(2, 3, 4).astype(np.float32) + torch_module = SymSizeIntModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_interpolate(target, dev): + class InterpolateModule(nn.Module): + def forward(self, x): + # Upsample to a fixed size. 
+ return F.interpolate(x, size=(16, 16), mode="nearest") + + raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) + torch_module = InterpolateModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + if __name__ == "__main__": tvm.testing.main() From 8897b2861612c0a641adc128a6ceea6d08ef2f01 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 28 Apr 2025 11:01:36 -0400 Subject: [PATCH 13/18] add new tests from 17862 --- tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py index f68b5a234341..500e52151537 100644 --- a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py +++ b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py @@ -21,11 +21,13 @@ import numpy as np import torch from torch import nn +from torch.nn import functional as F from torch.export import export from tvm.relax.frontend.torch import from_exported_program from torch.nn import Softmax, Upsample + def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev): """ This util ensures that a torch module can successfully be exported to TVM From 2a9615370c63b07951dbf3a5675f08925262ccc6 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 28 Apr 2025 11:06:31 -0400 Subject: [PATCH 14/18] whitespace --- tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py index 500e52151537..0184688a9e84 100644 --- a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py +++ b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py @@ -27,7 +27,6 @@ from torch.nn import Softmax, Upsample - def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev): """ This util ensures that a torch module can successfully be exported to TVM @@ -1054,5 +1053,6 @@ def forward(self, x): torch_module = InterpolateModule().eval() assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + if __name__ == "__main__": tvm.testing.main() From a60a8627546b686365c7189af016e1fdc29f4b80 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sun, 4 May 2025 14:05:20 -0400 Subject: [PATCH 15/18] print statemetns --- python/tvm/dlight/gpu/general_reduction.py | 53 ++++++++++++------- .../test_frontend_from_exported_program.py | 3 +- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/python/tvm/dlight/gpu/general_reduction.py b/python/tvm/dlight/gpu/general_reduction.py index 328a3eac914d..4fd310276f90 100644 --- a/python/tvm/dlight/gpu/general_reduction.py +++ b/python/tvm/dlight/gpu/general_reduction.py @@ -38,9 +38,16 @@ def apply( # pylint: disable=too-many-locals if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target): return None + print("\n##########################################################################################") + print(" ENTERING general_reduction.py ") + print("##########################################################################################\n") + + print("func:") + print(func) + if target.kind.name == "cuda": - len_tx = 256 - unroll_depth = 256 + len_tx = 256 # Number of threads per block + unroll_depth = 256 # How many iterations of loop to 
unroll elif target.kind.name == "opencl": len_tx = 256 unroll_depth = 64 @@ -54,6 +61,13 @@ def apply( # pylint: disable=too-many-locals if block_infos is None or len(block_infos) == 0: return None + print("block_infos:") + for block_info in block_infos: + print(block_info) + + print("len(block_infos):") + print(len(block_infos)) + dom_kind = block_infos[0].dom_kind() num_leading_s = len(dom_kind) - len(dom_kind.lstrip("S")) num_trailing_r = len(dom_kind) - len(dom_kind.rstrip("R")) @@ -61,23 +75,24 @@ def apply( # pylint: disable=too-many-locals # Align the number of block iters of the last block. num_last_block_iter = len(block_infos[-1].dom_kind()) - # If the last block is a scalar value, there is nothing left to - # tile/parallelise, and `iters` is an empty tuple. - # Add a unit thread loop so the final write happens inside a valid - # GPU thread environment. - if num_last_block_iter == 0: - # Put every block (both the running reductions and the final - # scalar write) inside a trivial GPU thread. The very first block - # gets a `blockIdx.x` wrapper so that kernels still have a unique - # block scope. - for i, info in enumerate(block_infos): - loop_rv = sch.add_unit_loop(info.block_rv) - if i == 0: - sch.bind(loop_rv, "blockIdx.x") - else: - sch.bind(loop_rv, "threadIdx.x") - - return sch + # FIXME: this block of code makes CrossEntropyLoss work but Sort fail. + # # If the last block is a scalar value, there is nothing left to + # # tile/parallelise, and `iters` is an empty tuple. + # # Add a unit thread loop so the final write happens inside a valid + # # GPU thread environment. + # if num_last_block_iter == 0: + # # Put every block (both the running reductions and the final + # # scalar write) inside a trivial GPU thread. The very first block + # # gets a `blockIdx.x` wrapper so that kernels still have a unique + # # block scope. 
+ # for i, info in enumerate(block_infos): + # loop_rv = sch.add_unit_loop(info.block_rv) + # if i == 0: + # sch.bind(loop_rv, "blockIdx.x") + # else: + # sch.bind(loop_rv, "threadIdx.x") + + # return sch if num_last_block_iter < len(dom_kind): diff --git a/tests/python/relax/test_frontend_from_exported_program.py b/tests/python/relax/test_frontend_from_exported_program.py index 3429927ca5c7..d39a5cee1e17 100644 --- a/tests/python/relax/test_frontend_from_exported_program.py +++ b/tests/python/relax/test_frontend_from_exported_program.py @@ -5006,7 +5006,7 @@ def main(x: R.Tensor((4, 3), dtype="float32")) -> R.Tuple(R.Tensor((), dtype="fl example_args1 = (torch.randn(4, 3, dtype=torch.float32),) verify_model(CrossEntropyModule(), example_args1, {}, Expected1) - + def test_linspace(): class Linspace(Module): def forward(self, input): @@ -5057,3 +5057,4 @@ def main( if __name__ == "__main__": tvm.testing.main() +1 \ No newline at end of file From fb85449896844f5bac93304a252309510aa17204 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sun, 4 May 2025 15:13:57 -0400 Subject: [PATCH 16/18] all tests pass --- python/tvm/dlight/gpu/general_reduction.py | 50 +++++++++++++--------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/python/tvm/dlight/gpu/general_reduction.py b/python/tvm/dlight/gpu/general_reduction.py index 4fd310276f90..756d332c9d98 100644 --- a/python/tvm/dlight/gpu/general_reduction.py +++ b/python/tvm/dlight/gpu/general_reduction.py @@ -65,8 +65,7 @@ def apply( # pylint: disable=too-many-locals for block_info in block_infos: print(block_info) - print("len(block_infos):") - print(len(block_infos)) + print("len(block_infos):", len(block_infos)) dom_kind = block_infos[0].dom_kind() num_leading_s = len(dom_kind) - len(dom_kind.lstrip("S")) @@ -75,28 +74,37 @@ def apply( # pylint: disable=too-many-locals # Align the number of block iters of the last block. num_last_block_iter = len(block_infos[-1].dom_kind()) - # FIXME: this block of code makes CrossEntropyLoss work but Sort fail. - # # If the last block is a scalar value, there is nothing left to - # # tile/parallelise, and `iters` is an empty tuple. - # # Add a unit thread loop so the final write happens inside a valid - # # GPU thread environment. - # if num_last_block_iter == 0: - # # Put every block (both the running reductions and the final - # # scalar write) inside a trivial GPU thread. The very first block - # # gets a `blockIdx.x` wrapper so that kernels still have a unique - # # block scope. - # for i, info in enumerate(block_infos): - # loop_rv = sch.add_unit_loop(info.block_rv) - # if i == 0: - # sch.bind(loop_rv, "blockIdx.x") - # else: - # sch.bind(loop_rv, "threadIdx.x") - - # return sch + print("num_last_block_iter:", num_last_block_iter) + print("len(dom_kind)",len(dom_kind)) + print("dom_kind:", dom_kind) + if num_last_block_iter < len(dom_kind): + # FIXME: this block of code makes CrossEntropyLoss work but Sort fail. + # # If the last block is a scalar value, there is nothing left to + # # tile/parallelise, and `iters` is an empty tuple. + # # Add a unit thread loop so the final write happens inside a valid + # # GPU thread environment. + if num_last_block_iter == 0: + print("ENTERING THE THING!!!!!!!!!!!!!!!!!") + # assert 0, "ENTERING THE THING!!!!!!!!!!!!!!!!!" + + # Put every block (both the running reductions and the final + # scalar write) inside a trivial GPU thread. The very first block + # gets a `blockIdx.x` wrapper so that kernels still have a unique + # block scope. 
+ for i, info in enumerate(block_infos): + loop_rv = sch.add_unit_loop(info.block_rv) + if i == 0: + sch.bind(loop_rv, "blockIdx.x") + else: + sch.bind(loop_rv, "threadIdx.x") + + return sch + def f_layout_mapping(*iters): + print("iters:", iters) analyzer = arith.Analyzer() # Try to match the iters of last block to the iters of the first block. # For matched positions, use the iter from the input `iters`. @@ -121,6 +129,8 @@ def f_layout_mapping(*iters): ) + list(iters) index_map = tir.IndexMap.from_func(f_layout_mapping, ndim=num_last_block_iter) + print("index_map:") + print(index_map) sch.transform_block_layout(block_infos[-1].block_rv, index_map) try: From 22ede511afa5985cc071de93abcb66d1c21f929d Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sun, 4 May 2025 15:16:09 -0400 Subject: [PATCH 17/18] cleanup - all tests still pass --- python/tvm/dlight/gpu/general_reduction.py | 34 +++------------------- 1 file changed, 4 insertions(+), 30 deletions(-) diff --git a/python/tvm/dlight/gpu/general_reduction.py b/python/tvm/dlight/gpu/general_reduction.py index 756d332c9d98..1a43a629e80f 100644 --- a/python/tvm/dlight/gpu/general_reduction.py +++ b/python/tvm/dlight/gpu/general_reduction.py @@ -38,13 +38,6 @@ def apply( # pylint: disable=too-many-locals if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target): return None - print("\n##########################################################################################") - print(" ENTERING general_reduction.py ") - print("##########################################################################################\n") - - print("func:") - print(func) - if target.kind.name == "cuda": len_tx = 256 # Number of threads per block unroll_depth = 256 # How many iterations of loop to unroll @@ -61,12 +54,6 @@ def apply( # pylint: disable=too-many-locals if block_infos is None or len(block_infos) == 0: return None - print("block_infos:") - for block_info in block_infos: - print(block_info) - - print("len(block_infos):", len(block_infos)) - dom_kind = block_infos[0].dom_kind() num_leading_s = len(dom_kind) - len(dom_kind.lstrip("S")) num_trailing_r = len(dom_kind) - len(dom_kind.rstrip("R")) @@ -74,21 +61,12 @@ def apply( # pylint: disable=too-many-locals # Align the number of block iters of the last block. num_last_block_iter = len(block_infos[-1].dom_kind()) - print("num_last_block_iter:", num_last_block_iter) - print("len(dom_kind)",len(dom_kind)) - print("dom_kind:", dom_kind) - - if num_last_block_iter < len(dom_kind): - # FIXME: this block of code makes CrossEntropyLoss work but Sort fail. - # # If the last block is a scalar value, there is nothing left to - # # tile/parallelise, and `iters` is an empty tuple. - # # Add a unit thread loop so the final write happens inside a valid - # # GPU thread environment. + # If the last block is a scalar value, there is nothing left to + # tile/parallelise, and `iters` is an empty tuple. + # Add a unit thread loop so the final write happens inside a valid + # GPU thread environment. if num_last_block_iter == 0: - print("ENTERING THE THING!!!!!!!!!!!!!!!!!") - # assert 0, "ENTERING THE THING!!!!!!!!!!!!!!!!!" - # Put every block (both the running reductions and the final # scalar write) inside a trivial GPU thread. 
The very first block # gets a `blockIdx.x` wrapper so that kernels still have a unique @@ -102,9 +80,7 @@ def apply( # pylint: disable=too-many-locals return sch - def f_layout_mapping(*iters): - print("iters:", iters) analyzer = arith.Analyzer() # Try to match the iters of last block to the iters of the first block. # For matched positions, use the iter from the input `iters`. @@ -129,8 +105,6 @@ def f_layout_mapping(*iters): ) + list(iters) index_map = tir.IndexMap.from_func(f_layout_mapping, ndim=num_last_block_iter) - print("index_map:") - print(index_map) sch.transform_block_layout(block_infos[-1].block_rv, index_map) try: From 396d4471638d9651065581ace8c183dbff00d3aa Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sun, 4 May 2025 15:26:51 -0400 Subject: [PATCH 18/18] cleanup. All nightly tests pass --- python/tvm/dlight/gpu/general_reduction.py | 5 ++--- .../test_nnapi/test_from_exported_to_cuda.py | 16 ++++++++++++++++ .../relax/test_frontend_from_exported_program.py | 3 ++- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/python/tvm/dlight/gpu/general_reduction.py b/python/tvm/dlight/gpu/general_reduction.py index 1a43a629e80f..b1564bf61fa9 100644 --- a/python/tvm/dlight/gpu/general_reduction.py +++ b/python/tvm/dlight/gpu/general_reduction.py @@ -39,8 +39,8 @@ def apply( # pylint: disable=too-many-locals return None if target.kind.name == "cuda": - len_tx = 256 # Number of threads per block - unroll_depth = 256 # How many iterations of loop to unroll + len_tx = 256 + unroll_depth = 256 elif target.kind.name == "opencl": len_tx = 256 unroll_depth = 64 @@ -60,7 +60,6 @@ def apply( # pylint: disable=too-many-locals # Align the number of block iters of the last block. num_last_block_iter = len(block_infos[-1].dom_kind()) - if num_last_block_iter < len(dom_kind): # If the last block is a scalar value, there is nothing left to # tile/parallelise, and `iters` is an empty tuple. 
diff --git a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py
index 0184688a9e84..3f0964cfa8ed 100644
--- a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py
+++ b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py
@@ -1054,5 +1054,21 @@ def forward(self, x):
     assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev)
 
 
+@tvm.testing.parametrize_targets("cuda")
+def test_cross_entropy_module(target, dev):
+    class CrossEntropyModule(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.criterion = nn.CrossEntropyLoss()
+            self.target = torch.tensor([0, 1, 2, 1])
+
+        def forward(self, x):
+            return self.criterion(x, self.target)
+
+    raw_data = np.random.randn(4, 3).astype(np.float32)
+    torch_module = CrossEntropyModule().eval()
+    assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/relax/test_frontend_from_exported_program.py b/tests/python/relax/test_frontend_from_exported_program.py
index d39a5cee1e17..e5d307b895a9 100644
--- a/tests/python/relax/test_frontend_from_exported_program.py
+++ b/tests/python/relax/test_frontend_from_exported_program.py
@@ -5007,6 +5007,7 @@ def main(x: R.Tensor((4, 3), dtype="float32")) -> R.Tuple(R.Tensor((), dtype="fl
     example_args1 = (torch.randn(4, 3, dtype=torch.float32),)
     verify_model(CrossEntropyModule(), example_args1, {}, Expected1)
 
+
 def test_linspace():
     class Linspace(Module):
         def forward(self, input):
@@ -5057,4 +5058,4 @@ def main(
 
 if __name__ == "__main__":
     tvm.testing.main()
-1
\ No newline at end of file
+1
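
Reviewer note (not part of the patches above): the sketch below shows, in rough outline, the torch.export -> Relax -> CUDA round trip that the new tests drive through assert_torch_output_vs_tvm_from_exported_to_cuda. It is a minimal illustration assuming the default Relax build pipeline; TinyModel, the input shape, and the tolerances are placeholders, and the helper in the test files may differ in detail.

    import numpy as np
    import torch
    from torch import nn
    from torch.export import export

    import tvm
    from tvm import relax
    from tvm.relax.frontend.torch import from_exported_program


    class TinyModel(nn.Module):
        def forward(self, x):
            return torch.nn.functional.relu(x)


    torch_module = TinyModel().eval()
    raw_data = np.random.randn(2, 3).astype("float32")

    # Export the module and translate the ExportedProgram into a Relax IRModule.
    exported_program = export(torch_module, args=(torch.from_numpy(raw_data),))
    mod = from_exported_program(exported_program)

    # Build for CUDA; the default lowering pipeline for the target is expected
    # to legalize and schedule the module (this is where dlight rules such as
    # GeneralReduction come into play).
    target = tvm.target.Target("cuda")
    dev = tvm.cuda(0)
    ex = relax.build(mod, target=target)
    vm = relax.VirtualMachine(ex, dev)

    # from_exported_program typically wraps outputs in a tuple, so unwrap the
    # first element before comparing against eager PyTorch.
    tvm_out = vm["main"](tvm.nd.array(raw_data, dev))[0].numpy()
    with torch.no_grad():
        torch_out = torch_module(torch.from_numpy(raw_data)).numpy()
    np.testing.assert_allclose(tvm_out, torch_out, rtol=1e-5, atol=1e-5)

The new tests themselves can be run with pytest, for example
pytest tests/python/relax/test_from_exported_to_cuda.py -k sort and
pytest tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py -k cross_entropy.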