From bcb6998bb25fd54fed4ce082c09bff2985e70460 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sat, 19 Apr 2025 11:43:00 -0400 Subject: [PATCH 01/18] tests for add'l modules --- .../relax/test_from_exported_to_cuda.py | 340 +++++++++++++++++- 1 file changed, 336 insertions(+), 4 deletions(-) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index e92855885e35..01dbb0efc337 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -24,6 +24,7 @@ from torch.export import export from tvm.relax.frontend.torch import from_exported_program from torch.nn import Softmax, Upsample +import torch.nn.functional as F def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev): @@ -73,9 +74,7 @@ def forward(self, x): return torch.full((2, 3), 3.141592) torch_module = FullModel().eval() - raw_data = np.random.rand(3, 3).astype("float32") - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) @@ -91,7 +90,6 @@ def forward(self, x): torch_module = FullLike().eval() raw_data = np.random.rand(2, 3).astype("float32") - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) @@ -105,9 +103,32 @@ def forward(self, x): return torch.ones((2, 3)) torch_module = FullModel().eval() - raw_data = np.random.rand(1, 1).astype("float32") + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + +@tvm.testing.parametrize_targets("cuda") +def test_sort(target, dev): + raw_data = np.array([[4, 1, 13], [-30, 1, 3], [4, 0, 10]]).astype("float32") + + # Test values + class SortModelValues(nn.Module): + def forward(self, x): + A, _ = torch.sort(x, dim=0, descending=True) + B, _ = torch.sort(x, dim=1, descending=False) + return A + B + + torch_module = SortModelValues().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + # Test indices + class SortModelIndices(nn.Module): + def forward(self, x): + _, A = torch.sort(x, dim=0, descending=True) + _, B = torch.sort(x, dim=1, descending=False) + return A + B + + torch_module = SortModelIndices().eval() assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) @@ -588,5 +609,316 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) +@tvm.testing.parametrize_targets("cuda") +def test_leakyrelu_module(target, dev): + class LeakyReLUModule(nn.Module): + def __init__(self): + super().__init__() + self.act = nn.LeakyReLU(negative_slope=0.1) + + def forward(self, x): + return self.act(x) + + raw_data = np.random.randn(2, 3).astype(np.float32) + torch_module = LeakyReLUModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_log_softmax_module(target, dev): + class LogSoftmaxModule(nn.Module): + def __init__(self): + super().__init__() + self.logsoftmax = nn.LogSoftmax(dim=1) + + def forward(self, x): + return self.logsoftmax(x) + + raw_data = np.random.randn(4, 5).astype(np.float32) + torch_module = LogSoftmaxModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_softmax_module(target, dev): + class SoftmaxModule(nn.Module): + def __init__(self): + super().__init__() + self.softmax = nn.Softmax(dim=1) + + 
def forward(self, x): + return self.softmax(x) + + raw_data = np.random.randn(4, 5).astype(np.float32) + torch_module = SoftmaxModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_adaptive_avg_pool2d_module(target, dev): + class AdaptiveAvgPool2dModule(nn.Module): + def __init__(self): + super().__init__() + self.pool = nn.AdaptiveAvgPool2d((1, 1)) + + def forward(self, x): + return self.pool(x) + + raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) + torch_module = AdaptiveAvgPool2dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_avg_pool2d_module(target, dev): + class AvgPool2dModule(nn.Module): + def __init__(self): + super().__init__() + self.pool = nn.AvgPool2d(kernel_size=2) + + def forward(self, x): + return self.pool(x) + + raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) + torch_module = AvgPool2dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_conv1d_module(target, dev): + class Conv1dModule(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv1d(in_channels=3, out_channels=4, kernel_size=3) + + def forward(self, x): + return self.conv(x) + + raw_data = np.random.randn(2, 3, 10).astype(np.float32) + torch_module = Conv1dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_conv2d_module(target, dev): + class Conv2dModule(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv2d(in_channels=3, out_channels=4, kernel_size=3) + + def forward(self, x): + return self.conv(x) + + raw_data = np.random.randn(2, 3, 10, 10).astype(np.float32) + torch_module = Conv2dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_conv3d_module(target, dev): + class Conv3dModule(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv3d(in_channels=2, out_channels=3, kernel_size=3) + + def forward(self, x): + return self.conv(x) + + raw_data = np.random.randn(1, 2, 8, 8, 8).astype(np.float32) + torch_module = Conv3dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_group_norm_module(target, dev): + class GroupNormModule(nn.Module): + def __init__(self): + super().__init__() + self.gn = nn.GroupNorm(num_groups=1, num_channels=4) + + def forward(self, x): + return self.gn(x) + + raw_data = np.random.randn(2, 4, 8, 8).astype(np.float32) + torch_module = GroupNormModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_layer_norm_module(target, dev): + class LayerNormModule(nn.Module): + def __init__(self): + super().__init__() + self.ln = nn.LayerNorm(normalized_shape=8) + + def forward(self, x): + return self.ln(x) + + raw_data = np.random.randn(2, 4, 8).astype(np.float32) + torch_module = LayerNormModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_linear_module(target, dev): + 
class LinearModule(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(10, 5) + + def forward(self, x): + return self.linear(x) + + raw_data = np.random.randn(4, 10).astype(np.float32) + torch_module = LinearModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_max_pool2d_module(target, dev): + class MaxPool2dModule(nn.Module): + def __init__(self): + super().__init__() + self.pool = nn.MaxPool2d(kernel_size=2) + + def forward(self, x): + return self.pool(x) + + raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) + torch_module = MaxPool2dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_embedding_module(target, dev): + class EmbeddingModule(nn.Module): + def __init__(self): + super().__init__() + self.embed = nn.Embedding(num_embeddings=10, embedding_dim=3) + + def forward(self, x): + return self.embed(x) + + raw_data = np.random.randint(0, 10, (2, 4)).astype(np.int64) + torch_module = EmbeddingModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_flatten_module(target, dev): + class FlattenModule(nn.Module): + def __init__(self): + super().__init__() + self.flatten = nn.Flatten() + + def forward(self, x): + return self.flatten(x) + + raw_data = np.random.randn(2, 3, 4, 5).astype(np.float32) + torch_module = FlattenModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_numel(target, dev): + class NumelModule(nn.Module): + def forward(self, x): + return torch.tensor(x.numel()) + + raw_data = np.random.randn(2, 3, 4).astype(np.float32) + torch_module = NumelModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_size(target, dev): + class SizeModule(nn.Module): + def forward(self, x): + return torch.tensor(x.size(0)) + + raw_data = np.random.randn(5, 4).astype(np.float32) + torch_module = SizeModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_tensor(target, dev): + class TensorModule(nn.Module): + def forward(self, x): + return torch.tensor([1, 2, 3]) + + raw_data = np.zeros((1,)).astype(np.float32) + torch_module = TensorModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_type(target, dev): + class TypeModule(nn.Module): + def forward(self, x): + return x.type(torch.float16) + + raw_data = np.random.randn(2, 3).astype(np.float32) + torch_module = TypeModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_float(target, dev): + class FloatModule(nn.Module): + def forward(self, x): + return x.float() + + raw_data = np.random.randn(2, 3).astype(np.float32) + torch_module = FloatModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_half(target, dev): + class HalfModule(nn.Module): + def forward(self, x): + return 
x.half() + + raw_data = np.random.randn(2, 3).astype(np.float32) + torch_module = HalfModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_getattr(target, dev): + class GetAttrModule(nn.Module): + def forward(self, x): + # Use getattr to call the ndimension method. + return torch.tensor(getattr(x, "ndimension")()) + + raw_data = np.random.randn(2, 3, 4).astype(np.float32) + torch_module = GetAttrModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_sym_size_int(target, dev): + class SymSizeIntModule(nn.Module): + def forward(self, x): + return torch.tensor(x.shape[1]) + + raw_data = np.random.randn(2, 3, 4).astype(np.float32) + torch_module = SymSizeIntModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_interpolate(target, dev): + class InterpolateModule(nn.Module): + def forward(self, x): + # Upsample to a fixed size. + return F.interpolate(x, size=(16, 16), mode="nearest") + + raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) + torch_module = InterpolateModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + if __name__ == "__main__": tvm.testing.main() From 751121bd6f1ed39768be0ccff574227f72b2f221 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sat, 19 Apr 2025 11:54:04 -0400 Subject: [PATCH 02/18] no sort --- .../relax/test_from_exported_to_cuda.py | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index 01dbb0efc337..562f9b020adc 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -107,31 +107,6 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) -@tvm.testing.parametrize_targets("cuda") -def test_sort(target, dev): - raw_data = np.array([[4, 1, 13], [-30, 1, 3], [4, 0, 10]]).astype("float32") - - # Test values - class SortModelValues(nn.Module): - def forward(self, x): - A, _ = torch.sort(x, dim=0, descending=True) - B, _ = torch.sort(x, dim=1, descending=False) - return A + B - - torch_module = SortModelValues().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - # Test indices - class SortModelIndices(nn.Module): - def forward(self, x): - _, A = torch.sort(x, dim=0, descending=True) - _, B = torch.sort(x, dim=1, descending=False) - return A + B - - torch_module = SortModelIndices().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - @tvm.testing.parametrize_targets("cuda") def test_tensor_clamp(target, dev): class ClampBothTensor(torch.nn.Module): From 175ac346777aac26eea49db623c162f13113f31d Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sat, 19 Apr 2025 14:41:21 -0400 Subject: [PATCH 03/18] cross entropy test passes --- python/tvm/dlight/gpu/general_reduction.py | 23 +++++++++++++++++++ .../torch/base_fx_graph_translator.py | 22 +++++++++++++++++- .../torch/exported_program_translator.py | 11 ++++++++- .../tvm/relax/frontend/torch/fx_translator.py | 17 ++++++-------- .../relax/test_from_exported_to_cuda.py | 13 +++++++++++ 5 files changed, 74 insertions(+), 12 
deletions(-) diff --git a/python/tvm/dlight/gpu/general_reduction.py b/python/tvm/dlight/gpu/general_reduction.py index d3979ce0e4c3..d61ec624411c 100644 --- a/python/tvm/dlight/gpu/general_reduction.py +++ b/python/tvm/dlight/gpu/general_reduction.py @@ -60,6 +60,29 @@ def apply( # pylint: disable=too-many-locals # Align the number of block iters of the last block. num_last_block_iter = len(block_infos[-1].dom_kind()) + + # If the last block is a scalar value, there is nothing left to + # tile/parallelise, and the code below would crash because `iters` is + # an empty tuple. + # Special case: the whole reduction collapsed to a *scalar*. + # Add a unit thread loop so the final write happens inside a valid + # GPU thread environment and passes `VerifyMemory`. + # ------------------------------------------------------------------ + if num_last_block_iter == 0: + # Put every block (both the running reductions and the final + # scalar write) inside a trivial GPU thread to satisfy + # VerifyMemory. The very first block gets a `blockIdx.x` + # wrapper so that kernels still have a unique block scope. + for i, info in enumerate(block_infos): + loop_rv = sch.add_unit_loop(info.block_rv) # extent = 1 + if i == 0: + sch.bind(loop_rv, "blockIdx.x") + else: + sch.bind(loop_rv, "threadIdx.x") + + return sch + + if num_last_block_iter < len(dom_kind): def f_layout_mapping(*iters): diff --git a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py index ae4c918900ec..c3cafb32d123 100644 --- a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py +++ b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py @@ -782,6 +782,25 @@ def _conv3d(self, node: fx.Node) -> relax.Var: groups=groups, ) + def _cross_entropy_loss( + self, + preds: relax.Expr, + targets: relax.Expr, + weights: Optional[relax.Expr], + reduction: str, + ignore_index: int, + ) -> relax.Expr: + log_probs = relax.op.nn.log_softmax(preds) + return self.block_builder.emit( + relax.op.nn.nll_loss( + log_probs, + targets, + weights, + reduction, + ignore_index, + ) + ) + def _einsum(self, node: fx.Node) -> relax.Var: import torch # type: ignore @@ -1096,8 +1115,9 @@ def _cumsum(self, node: fx.Node) -> relax.Var: return self.block_builder.emit(relax.op.cumsum(x, dim, dtype)) - def _expand(self, node: fx.Node) -> relax.Var: args = self.retrieve_args(node) + + def _expand(self, node: fx.Node) -> relax.Var: sizes = args[1:] if len(args) > 2 else args[1] broadcast_shape, in_shape = [], self.shape_of(args[0]) for idx, i in enumerate(sizes): diff --git a/python/tvm/relax/frontend/torch/exported_program_translator.py b/python/tvm/relax/frontend/torch/exported_program_translator.py index 932607287571..aea8196aefcf 100644 --- a/python/tvm/relax/frontend/torch/exported_program_translator.py +++ b/python/tvm/relax/frontend/torch/exported_program_translator.py @@ -66,7 +66,7 @@ def _reciprocal(self, node: fx.Node) -> relax.Var: ########## Neural Network ########## - def _batch_norm(self, node: fx.Node, training) -> relax.Var: + def _batch_norm(self, node: fx.Node, training: bool) -> relax.Var: import numpy as np x = self.env[node.args[0]] @@ -113,6 +113,14 @@ def _batch_norm_legit_no_training(self, node: fx.Node) -> relax.Var: training = False return self._batch_norm(node, training) + def _cross_entropy_default(self, node: fx.Node) -> relax.Expr: + preds = self.env[node.args[0]] + targets = self.env[node.args[1]] + weight = self.env.get(node.args[2], None) if len(node.args) > 2 else 
None + reduction = node.kwargs.get("reduction", "mean") + ignore_index = node.kwargs.get("ignore_index", -100) + return self._cross_entropy_loss(preds, targets, weight, reduction, ignore_index) + def _group_norm(self, node: fx.Node) -> relax.Var: x = self.env[node.args[0]] num_groups = node.args[1] @@ -382,6 +390,7 @@ def create_convert_map( "conv1d.default": self._conv1d, "conv2d.default": self._conv2d, "conv3d.default": self._conv3d, + "cross_entropy_loss.default": self._cross_entropy_default, "einsum.default": self._einsum, "embedding.default": lambda node: self._embedding_impl( self.env[node.args[1]], self.env[node.args[0]] diff --git a/python/tvm/relax/frontend/torch/fx_translator.py b/python/tvm/relax/frontend/torch/fx_translator.py index 5a34befb9296..5a07fb7c9707 100644 --- a/python/tvm/relax/frontend/torch/fx_translator.py +++ b/python/tvm/relax/frontend/torch/fx_translator.py @@ -260,12 +260,7 @@ def _cross_entropy(self, node: fx.Node) -> relax.Expr: weights = self.env.get(node.kwargs["weight"], None) reduction = node.kwargs["reduction"] ignore_index = node.kwargs["ignore_index"] - - return self.block_builder.emit( - relax.op.nn.nll_loss( - relax.op.nn.log_softmax(preds), targets, weights, reduction, ignore_index - ) - ) + return self._cross_entropy_loss(preds, targets, weights, reduction, ignore_index) def _cross_entropy_module(self, node: fx.Node) -> relax.Expr: preds = self.env[node.args[0]] @@ -282,10 +277,12 @@ def _cross_entropy_module(self, node: fx.Node) -> relax.Expr: reduction = module.reduction ignore_index = module.ignore_index - return self.block_builder.emit( - relax.op.nn.nll_loss( - relax.op.nn.log_softmax(preds), targets, weights, reduction, ignore_index - ) + return self._cross_entropy_loss( + preds, + targets, + weights, + reduction, + ignore_index, ) def _embedding_module(self, node: fx.Node) -> relax.Var: diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index 562f9b020adc..7c753d4c06c2 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -703,6 +703,18 @@ def forward(self, x): torch_module = Conv3dModule().eval() assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) +@tvm.testing.parametrize_targets("cuda") +def test_cross_entropy_module(target, dev): + class CrossEntropyModule(nn.Module): + def __init__(self): + super().__init__() + self.criterion = nn.CrossEntropyLoss() + self.register_buffer("target", torch.tensor([0, 1, 2, 1])) + def forward(self, x): + return self.criterion(x, self.target) + raw_data = np.random.randn(4, 3).astype(np.float32) + torch_module = CrossEntropyModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) @tvm.testing.parametrize_targets("cuda") def test_group_norm_module(target, dev): @@ -895,5 +907,6 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + if __name__ == "__main__": tvm.testing.main() From 03d04bea8c24604eaeaf5a76b7a24970fb0a653a Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sat, 19 Apr 2025 14:46:13 -0400 Subject: [PATCH 04/18] cleanup --- python/tvm/dlight/gpu/general_reduction.py | 18 +++++++----------- .../python/relax/test_from_exported_to_cuda.py | 7 +++++-- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/python/tvm/dlight/gpu/general_reduction.py b/python/tvm/dlight/gpu/general_reduction.py index d61ec624411c..328a3eac914d 
100644 --- a/python/tvm/dlight/gpu/general_reduction.py +++ b/python/tvm/dlight/gpu/general_reduction.py @@ -61,20 +61,17 @@ def apply( # pylint: disable=too-many-locals # Align the number of block iters of the last block. num_last_block_iter = len(block_infos[-1].dom_kind()) - # If the last block is a scalar value, there is nothing left to - # tile/parallelise, and the code below would crash because `iters` is - # an empty tuple. - # Special case: the whole reduction collapsed to a *scalar*. + # If the last block is a scalar value, there is nothing left to + # tile/parallelise, and `iters` is an empty tuple. # Add a unit thread loop so the final write happens inside a valid - # GPU thread environment and passes `VerifyMemory`. - # ------------------------------------------------------------------ + # GPU thread environment. if num_last_block_iter == 0: # Put every block (both the running reductions and the final - # scalar write) inside a trivial GPU thread to satisfy - # VerifyMemory. The very first block gets a `blockIdx.x` - # wrapper so that kernels still have a unique block scope. + # scalar write) inside a trivial GPU thread. The very first block + # gets a `blockIdx.x` wrapper so that kernels still have a unique + # block scope. for i, info in enumerate(block_infos): - loop_rv = sch.add_unit_loop(info.block_rv) # extent = 1 + loop_rv = sch.add_unit_loop(info.block_rv) if i == 0: sch.bind(loop_rv, "blockIdx.x") else: @@ -82,7 +79,6 @@ def apply( # pylint: disable=too-many-locals return sch - if num_last_block_iter < len(dom_kind): def f_layout_mapping(*iters): diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index 7c753d4c06c2..bd9869a9e4cf 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -703,19 +703,23 @@ def forward(self, x): torch_module = Conv3dModule().eval() assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + @tvm.testing.parametrize_targets("cuda") def test_cross_entropy_module(target, dev): class CrossEntropyModule(nn.Module): def __init__(self): super().__init__() self.criterion = nn.CrossEntropyLoss() - self.register_buffer("target", torch.tensor([0, 1, 2, 1])) + self.target = torch.tensor([0, 1, 2, 1]) + def forward(self, x): return self.criterion(x, self.target) + raw_data = np.random.randn(4, 3).astype(np.float32) torch_module = CrossEntropyModule().eval() assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + @tvm.testing.parametrize_targets("cuda") def test_group_norm_module(target, dev): class GroupNormModule(nn.Module): @@ -907,6 +911,5 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - if __name__ == "__main__": tvm.testing.main() From cf77abc19f4ad5af0b81ee0eba4b8a024a9f1d2a Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sat, 19 Apr 2025 17:47:39 -0400 Subject: [PATCH 05/18] fix expand --- python/tvm/relax/frontend/torch/base_fx_graph_translator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py index c3cafb32d123..6348980b226d 100644 --- a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py +++ b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py @@ -1115,9 +1115,8 @@ def _cumsum(self, node: fx.Node) -> relax.Var: return 
self.block_builder.emit(relax.op.cumsum(x, dim, dtype)) - args = self.retrieve_args(node) - def _expand(self, node: fx.Node) -> relax.Var: + args = self.retrieve_args(node) sizes = args[1:] if len(args) > 2 else args[1] broadcast_shape, in_shape = [], self.shape_of(args[0]) for idx, i in enumerate(sizes): From f503d6a8c6b4ade1f9b7251dfad72ce2fa88b382 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 21 Apr 2025 12:25:16 -0400 Subject: [PATCH 06/18] remove new e2e tests --- .../relax/test_from_exported_to_cuda.py | 340 ------------------ 1 file changed, 340 deletions(-) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index fa25f58f24ba..de0d8c726317 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -685,20 +685,6 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) -@tvm.testing.parametrize_targets("cuda") -def test_leakyrelu_module(target, dev): - class LeakyReLUModule(nn.Module): - def __init__(self): - super().__init__() - self.act = nn.LeakyReLU(negative_slope=0.1) - - def forward(self, x): - return self.act(x) - - raw_data = np.random.randn(2, 3).astype(np.float32) - torch_module = LeakyReLUModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - def test_mul(target, dev): class MulModule(nn.Module): def __init__(self): @@ -713,331 +699,5 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) -@tvm.testing.parametrize_targets("cuda") -def test_log_softmax_module(target, dev): - class LogSoftmaxModule(nn.Module): - def __init__(self): - super().__init__() - self.logsoftmax = nn.LogSoftmax(dim=1) - - def forward(self, x): - return self.logsoftmax(x) - - raw_data = np.random.randn(4, 5).astype(np.float32) - torch_module = LogSoftmaxModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_softmax_module(target, dev): - class SoftmaxModule(nn.Module): - def __init__(self): - super().__init__() - self.softmax = nn.Softmax(dim=1) - - def forward(self, x): - return self.softmax(x) - - raw_data = np.random.randn(4, 5).astype(np.float32) - torch_module = SoftmaxModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_adaptive_avg_pool2d_module(target, dev): - class AdaptiveAvgPool2dModule(nn.Module): - def __init__(self): - super().__init__() - self.pool = nn.AdaptiveAvgPool2d((1, 1)) - - def forward(self, x): - return self.pool(x) - - raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) - torch_module = AdaptiveAvgPool2dModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_avg_pool2d_module(target, dev): - class AvgPool2dModule(nn.Module): - def __init__(self): - super().__init__() - self.pool = nn.AvgPool2d(kernel_size=2) - - def forward(self, x): - return self.pool(x) - - raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) - torch_module = AvgPool2dModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_conv1d_module(target, dev): - class Conv1dModule(nn.Module): - def 
__init__(self): - super().__init__() - self.conv = nn.Conv1d(in_channels=3, out_channels=4, kernel_size=3) - - def forward(self, x): - return self.conv(x) - - raw_data = np.random.randn(2, 3, 10).astype(np.float32) - torch_module = Conv1dModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_conv2d_module(target, dev): - class Conv2dModule(nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.Conv2d(in_channels=3, out_channels=4, kernel_size=3) - - def forward(self, x): - return self.conv(x) - - raw_data = np.random.randn(2, 3, 10, 10).astype(np.float32) - torch_module = Conv2dModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_conv3d_module(target, dev): - class Conv3dModule(nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.Conv3d(in_channels=2, out_channels=3, kernel_size=3) - - def forward(self, x): - return self.conv(x) - - raw_data = np.random.randn(1, 2, 8, 8, 8).astype(np.float32) - torch_module = Conv3dModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_cross_entropy_module(target, dev): - class CrossEntropyModule(nn.Module): - def __init__(self): - super().__init__() - self.criterion = nn.CrossEntropyLoss() - self.target = torch.tensor([0, 1, 2, 1]) - - def forward(self, x): - return self.criterion(x, self.target) - - raw_data = np.random.randn(4, 3).astype(np.float32) - torch_module = CrossEntropyModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_group_norm_module(target, dev): - class GroupNormModule(nn.Module): - def __init__(self): - super().__init__() - self.gn = nn.GroupNorm(num_groups=1, num_channels=4) - - def forward(self, x): - return self.gn(x) - - raw_data = np.random.randn(2, 4, 8, 8).astype(np.float32) - torch_module = GroupNormModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_layer_norm_module(target, dev): - class LayerNormModule(nn.Module): - def __init__(self): - super().__init__() - self.ln = nn.LayerNorm(normalized_shape=8) - - def forward(self, x): - return self.ln(x) - - raw_data = np.random.randn(2, 4, 8).astype(np.float32) - torch_module = LayerNormModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_linear_module(target, dev): - class LinearModule(nn.Module): - def __init__(self): - super().__init__() - self.linear = nn.Linear(10, 5) - - def forward(self, x): - return self.linear(x) - - raw_data = np.random.randn(4, 10).astype(np.float32) - torch_module = LinearModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_max_pool2d_module(target, dev): - class MaxPool2dModule(nn.Module): - def __init__(self): - super().__init__() - self.pool = nn.MaxPool2d(kernel_size=2) - - def forward(self, x): - return self.pool(x) - - raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) - torch_module = MaxPool2dModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, 
torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_embedding_module(target, dev): - class EmbeddingModule(nn.Module): - def __init__(self): - super().__init__() - self.embed = nn.Embedding(num_embeddings=10, embedding_dim=3) - - def forward(self, x): - return self.embed(x) - - raw_data = np.random.randint(0, 10, (2, 4)).astype(np.int64) - torch_module = EmbeddingModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_flatten_module(target, dev): - class FlattenModule(nn.Module): - def __init__(self): - super().__init__() - self.flatten = nn.Flatten() - - def forward(self, x): - return self.flatten(x) - - raw_data = np.random.randn(2, 3, 4, 5).astype(np.float32) - torch_module = FlattenModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_numel(target, dev): - class NumelModule(nn.Module): - def forward(self, x): - return torch.tensor(x.numel()) - - raw_data = np.random.randn(2, 3, 4).astype(np.float32) - torch_module = NumelModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_size(target, dev): - class SizeModule(nn.Module): - def forward(self, x): - return torch.tensor(x.size(0)) - - raw_data = np.random.randn(5, 4).astype(np.float32) - torch_module = SizeModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_tensor(target, dev): - class TensorModule(nn.Module): - def forward(self, x): - return torch.tensor([1, 2, 3]) - - raw_data = np.zeros((1,)).astype(np.float32) - torch_module = TensorModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_type(target, dev): - class TypeModule(nn.Module): - def forward(self, x): - return x.type(torch.float16) - - raw_data = np.random.randn(2, 3).astype(np.float32) - torch_module = TypeModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_float(target, dev): - class FloatModule(nn.Module): - def forward(self, x): - return x.float() - - raw_data = np.random.randn(2, 3).astype(np.float32) - torch_module = FloatModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_half(target, dev): - class HalfModule(nn.Module): - def forward(self, x): - return x.half() - - raw_data = np.random.randn(2, 3).astype(np.float32) - torch_module = HalfModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_getattr(target, dev): - class GetAttrModule(nn.Module): - def forward(self, x): - # Use getattr to call the ndimension method. 
- return torch.tensor(getattr(x, "ndimension")()) - - raw_data = np.random.randn(2, 3, 4).astype(np.float32) - torch_module = GetAttrModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_sym_size_int(target, dev): - class SymSizeIntModule(nn.Module): - def forward(self, x): - return torch.tensor(x.shape[1]) - - raw_data = np.random.randn(2, 3, 4).astype(np.float32) - torch_module = SymSizeIntModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - -@tvm.testing.parametrize_targets("cuda") -def test_interpolate(target, dev): - class InterpolateModule(nn.Module): - def forward(self, x): - # Upsample to a fixed size. - return F.interpolate(x, size=(16, 16), mode="nearest") - - raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) - torch_module = InterpolateModule().eval() -def test_concat(target, dev): - class ConcatFour(nn.Module): - def __init__(self, dim=0): - super(ConcatFour, self).__init__() - self.dim = dim - self.x2 = torch.randn(2, 3) - self.x3 = torch.randn(2, 3) - self.x4 = torch.randn(2, 3) - - def forward(self, x): - return torch.cat((x, self.x2, self.x3, self.x4), dim=self.dim) - - torch_module = ConcatFour().eval() - raw_data = np.random.rand(2, 3).astype("float32") - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - if __name__ == "__main__": tvm.testing.main() From d58301692976c04dff3d0f9feea8bb637c024aa3 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 21 Apr 2025 12:26:26 -0400 Subject: [PATCH 07/18] remove new e2e tests --- tests/python/relax/test_from_exported_to_cuda.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index de0d8c726317..16b7eb59ca94 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -698,6 +698,22 @@ def forward(self, x): raw_data = np.random.rand(2, 3).astype("float32") assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) +@tvm.testing.parametrize_targets("cuda") +def test_cross_entropy_module(target, dev): + class CrossEntropyModule(nn.Module): + def __init__(self): + super().__init__() + self.criterion = nn.CrossEntropyLoss() + self.target = torch.tensor([0, 1, 2, 1]) + + def forward(self, x): + return self.criterion(x, self.target) + + raw_data = np.random.randn(4, 3).astype(np.float32) + torch_module = CrossEntropyModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + if __name__ == "__main__": tvm.testing.main() From 0e4ca8df2bd6061b4b9b9e1e9a67938fef2a4e5b Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 21 Apr 2025 12:51:09 -0400 Subject: [PATCH 08/18] convert e2e test to unit test --- .../relax/test_from_exported_to_cuda.py | 30 ------------------ .../test_frontend_from_exported_program.py | 31 +++++++++++++++++++ 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index 16b7eb59ca94..7926b15b5e5a 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -685,35 +685,5 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) -def 
test_mul(target, dev): - class MulModule(nn.Module): - def __init__(self): - super().__init__() - self.y = torch.tensor(np.random.rand(2, 3).astype("float32")) - - def forward(self, x): - return x.mul(self.y) - - torch_module = MulModule().eval() - raw_data = np.random.rand(2, 3).astype("float32") - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - -@tvm.testing.parametrize_targets("cuda") -def test_cross_entropy_module(target, dev): - class CrossEntropyModule(nn.Module): - def __init__(self): - super().__init__() - self.criterion = nn.CrossEntropyLoss() - self.target = torch.tensor([0, 1, 2, 1]) - - def forward(self, x): - return self.criterion(x, self.target) - - raw_data = np.random.randn(4, 3).astype(np.float32) - torch_module = CrossEntropyModule().eval() - assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) - - - if __name__ == "__main__": tvm.testing.main() diff --git a/tests/python/relax/test_frontend_from_exported_program.py b/tests/python/relax/test_frontend_from_exported_program.py index a386a989f00e..dfd6be580e3f 100644 --- a/tests/python/relax/test_frontend_from_exported_program.py +++ b/tests/python/relax/test_frontend_from_exported_program.py @@ -17,6 +17,7 @@ import operator import pytest import torch +from torch import nn from torch.nn import Module from torch.export import export @@ -4418,6 +4419,36 @@ def main( example_args2 = (torch.randn(5, dtype=torch.float32),) verify_model(Eye2(), example_args2, {}, Expected2) +def test_cross_entropy(): + + class CrossEntropyModule(Module): + def init(self): + super().init() + self.criterion = nn.CrossEntropyLoss() + self.target = torch.tensor([0, 1, 2, 1]) + + def forward(self, x): + return self.criterion(x, self.target) + + raw_data = np.random.randn(4, 3).astype(np.float32) + torch_module = CrossEntropyModule().eval() + + @tvm.script.ir_module + class Expected1: + @R.function + def main( + input: R.Tensor((3, 5), dtype="float32") + ) -> R.Tuple(R.Tensor((3, 5), dtype="float32")): + with R.dataflow(): + lv: R.Tensor((3, 5), dtype="float32") = R.eye(3, 5, dtype="float32") + gv: R.Tuple(R.Tensor((3, 5), dtype="float32")) = (lv,) + R.output(gv) + return gv + + example_args1 = (torch.randn(4, 3, dtype=torch.float32),) + verify_model(CrossEntropyModule(), example_args1, {}, Expected1) + + if __name__ == "__main__": tvm.testing.main() From 320ad6b23cd616d40b09f6c8bbbba99588a351cf Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 21 Apr 2025 13:18:15 -0400 Subject: [PATCH 09/18] unit test --- .../test_frontend_from_exported_program.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/python/relax/test_frontend_from_exported_program.py b/tests/python/relax/test_frontend_from_exported_program.py index dfd6be580e3f..140f71ea3818 100644 --- a/tests/python/relax/test_frontend_from_exported_program.py +++ b/tests/python/relax/test_frontend_from_exported_program.py @@ -4419,29 +4419,30 @@ def main( example_args2 = (torch.randn(5, dtype=torch.float32),) verify_model(Eye2(), example_args2, {}, Expected2) -def test_cross_entropy(): +def test_cross_entropy(): class CrossEntropyModule(Module): - def init(self): - super().init() + def __init__(self): + super().__init__() self.criterion = nn.CrossEntropyLoss() self.target = torch.tensor([0, 1, 2, 1]) def forward(self, x): return self.criterion(x, self.target) - raw_data = np.random.randn(4, 3).astype(np.float32) - torch_module = CrossEntropyModule().eval() - 
@tvm.script.ir_module class Expected1: @R.function - def main( - input: R.Tensor((3, 5), dtype="float32") - ) -> R.Tuple(R.Tensor((3, 5), dtype="float32")): + def main(x: R.Tensor((4, 3), dtype="float32")) -> R.Tuple(R.Tensor((), dtype="float32")): with R.dataflow(): - lv: R.Tensor((3, 5), dtype="float32") = R.eye(3, 5, dtype="float32") - gv: R.Tuple(R.Tensor((3, 5), dtype="float32")) = (lv,) + lv: R.Tensor((4, 3), dtype="float32") = R.nn.log_softmax(x, axis=-1) + lv1: R.Tensor((), dtype="float32") = R.nn.nll_loss( + lv, + targets=R.const([0, 1, 2, 1], dtype="int64"), + reduction="mean", + ignore_index=-100, + ) + gv: R.Tuple(R.Tensor((), dtype="float32")) = (lv1,) R.output(gv) return gv @@ -4449,6 +4450,5 @@ def main( verify_model(CrossEntropyModule(), example_args1, {}, Expected1) - if __name__ == "__main__": tvm.testing.main() From 3f8247c0e49b7215a4434dfb154702e3387d51f2 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 21 Apr 2025 13:22:47 -0400 Subject: [PATCH 10/18] restore tests --- .../relax/test_from_exported_to_cuda.py | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/relax/test_from_exported_to_cuda.py index 7926b15b5e5a..76a4bb203925 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py +++ b/tests/python/relax/test_from_exported_to_cuda.py @@ -24,7 +24,6 @@ from torch.export import export from tvm.relax.frontend.torch import from_exported_program from torch.nn import Softmax, Upsample -import torch.nn.functional as F def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev): @@ -685,5 +684,38 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) +@tvm.testing.parametrize_targets("cuda") +def test_mul(target, dev): + class MulModule(nn.Module): + def __init__(self): + super().__init__() + self.y = torch.tensor(np.random.rand(2, 3).astype("float32")) + + def forward(self, x): + return x.mul(self.y) + + torch_module = MulModule().eval() + raw_data = np.random.rand(2, 3).astype("float32") + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_concat(target, dev): + class ConcatFour(nn.Module): + def __init__(self, dim=0): + super(ConcatFour, self).__init__() + self.dim = dim + self.x2 = torch.randn(2, 3) + self.x3 = torch.randn(2, 3) + self.x4 = torch.randn(2, 3) + + def forward(self, x): + return torch.cat((x, self.x2, self.x3, self.x4), dim=self.dim) + + torch_module = ConcatFour().eval() + raw_data = np.random.rand(2, 3).astype("float32") + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + if __name__ == "__main__": tvm.testing.main() From e7d6b9735f178e544d9490ab64a6d6854f2b2888 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 28 Apr 2025 10:55:01 -0400 Subject: [PATCH 11/18] move --- .../{relax => nightly/test_nnapi}/test_from_exported_to_cuda.py | 1 - 1 file changed, 1 deletion(-) rename tests/python/{relax => nightly/test_nnapi}/test_from_exported_to_cuda.py (99%) diff --git a/tests/python/relax/test_from_exported_to_cuda.py b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py similarity index 99% rename from tests/python/relax/test_from_exported_to_cuda.py rename to tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py index 6bb35b50b1df..8b23e899fc64 100644 --- a/tests/python/relax/test_from_exported_to_cuda.py 
+++ b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py @@ -1,4 +1,3 @@ -# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file From 9b2d9983ec478106ba38680c0ac29466b0046381 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 28 Apr 2025 10:56:07 -0400 Subject: [PATCH 12/18] add new tests --- .../test_nnapi/test_from_exported_to_cuda.py | 311 ++++++++++++++++++ 1 file changed, 311 insertions(+) diff --git a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py index 8b23e899fc64..f68b5a234341 100644 --- a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py +++ b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py @@ -1,3 +1,4 @@ +# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file @@ -741,5 +742,315 @@ def forward(self, x): assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) +@tvm.testing.parametrize_targets("cuda") +def test_leakyrelu_module(target, dev): + class LeakyReLUModule(nn.Module): + def __init__(self): + super().__init__() + self.act = nn.LeakyReLU(negative_slope=0.1) + + def forward(self, x): + return self.act(x) + + raw_data = np.random.randn(2, 3).astype(np.float32) + torch_module = LeakyReLUModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_log_softmax_module(target, dev): + class LogSoftmaxModule(nn.Module): + def __init__(self): + super().__init__() + self.logsoftmax = nn.LogSoftmax(dim=1) + + def forward(self, x): + return self.logsoftmax(x) + + raw_data = np.random.randn(4, 5).astype(np.float32) + torch_module = LogSoftmaxModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_softmax_module(target, dev): + class SoftmaxModule(nn.Module): + def __init__(self): + super().__init__() + self.softmax = nn.Softmax(dim=1) + + def forward(self, x): + return self.softmax(x) + + raw_data = np.random.randn(4, 5).astype(np.float32) + torch_module = SoftmaxModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_adaptive_avg_pool2d_module(target, dev): + class AdaptiveAvgPool2dModule(nn.Module): + def __init__(self): + super().__init__() + self.pool = nn.AdaptiveAvgPool2d((1, 1)) + + def forward(self, x): + return self.pool(x) + + raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) + torch_module = AdaptiveAvgPool2dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_avg_pool2d_module(target, dev): + class AvgPool2dModule(nn.Module): + def __init__(self): + super().__init__() + self.pool = nn.AvgPool2d(kernel_size=2) + + def forward(self, x): + return self.pool(x) + + raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) + torch_module = AvgPool2dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + 
+@tvm.testing.parametrize_targets("cuda") +def test_conv1d_module(target, dev): + class Conv1dModule(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv1d(in_channels=3, out_channels=4, kernel_size=3) + + def forward(self, x): + return self.conv(x) + + raw_data = np.random.randn(2, 3, 10).astype(np.float32) + torch_module = Conv1dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_conv2d_module(target, dev): + class Conv2dModule(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv2d(in_channels=3, out_channels=4, kernel_size=3) + + def forward(self, x): + return self.conv(x) + + raw_data = np.random.randn(2, 3, 10, 10).astype(np.float32) + torch_module = Conv2dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_conv3d_module(target, dev): + class Conv3dModule(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv3d(in_channels=2, out_channels=3, kernel_size=3) + + def forward(self, x): + return self.conv(x) + + raw_data = np.random.randn(1, 2, 8, 8, 8).astype(np.float32) + torch_module = Conv3dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_group_norm_module(target, dev): + class GroupNormModule(nn.Module): + def __init__(self): + super().__init__() + self.gn = nn.GroupNorm(num_groups=1, num_channels=4) + + def forward(self, x): + return self.gn(x) + + raw_data = np.random.randn(2, 4, 8, 8).astype(np.float32) + torch_module = GroupNormModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_layer_norm_module(target, dev): + class LayerNormModule(nn.Module): + def __init__(self): + super().__init__() + self.ln = nn.LayerNorm(normalized_shape=8) + + def forward(self, x): + return self.ln(x) + + raw_data = np.random.randn(2, 4, 8).astype(np.float32) + torch_module = LayerNormModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_linear_module(target, dev): + class LinearModule(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(10, 5) + + def forward(self, x): + return self.linear(x) + + raw_data = np.random.randn(4, 10).astype(np.float32) + torch_module = LinearModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_max_pool2d_module(target, dev): + class MaxPool2dModule(nn.Module): + def __init__(self): + super().__init__() + self.pool = nn.MaxPool2d(kernel_size=2) + + def forward(self, x): + return self.pool(x) + + raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) + torch_module = MaxPool2dModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_embedding_module(target, dev): + class EmbeddingModule(nn.Module): + def __init__(self): + super().__init__() + self.embed = nn.Embedding(num_embeddings=10, embedding_dim=3) + + def forward(self, x): + return self.embed(x) + + raw_data = np.random.randint(0, 10, (2, 4)).astype(np.int64) + torch_module = 
EmbeddingModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_flatten_module(target, dev): + class FlattenModule(nn.Module): + def __init__(self): + super().__init__() + self.flatten = nn.Flatten() + + def forward(self, x): + return self.flatten(x) + + raw_data = np.random.randn(2, 3, 4, 5).astype(np.float32) + torch_module = FlattenModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_numel(target, dev): + class NumelModule(nn.Module): + def forward(self, x): + return torch.tensor(x.numel()) + + raw_data = np.random.randn(2, 3, 4).astype(np.float32) + torch_module = NumelModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_size(target, dev): + class SizeModule(nn.Module): + def forward(self, x): + return torch.tensor(x.size(0)) + + raw_data = np.random.randn(5, 4).astype(np.float32) + torch_module = SizeModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_tensor(target, dev): + class TensorModule(nn.Module): + def forward(self, x): + return torch.tensor([1, 2, 3]) + + raw_data = np.zeros((1,)).astype(np.float32) + torch_module = TensorModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_type(target, dev): + class TypeModule(nn.Module): + def forward(self, x): + return x.type(torch.float16) + + raw_data = np.random.randn(2, 3).astype(np.float32) + torch_module = TypeModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_float(target, dev): + class FloatModule(nn.Module): + def forward(self, x): + return x.float() + + raw_data = np.random.randn(2, 3).astype(np.float32) + torch_module = FloatModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_half(target, dev): + class HalfModule(nn.Module): + def forward(self, x): + return x.half() + + raw_data = np.random.randn(2, 3).astype(np.float32) + torch_module = HalfModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_getattr(target, dev): + class GetAttrModule(nn.Module): + def forward(self, x): + # Use getattr to call the ndimension method. + return torch.tensor(getattr(x, "ndimension")()) + + raw_data = np.random.randn(2, 3, 4).astype(np.float32) + torch_module = GetAttrModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_sym_size_int(target, dev): + class SymSizeIntModule(nn.Module): + def forward(self, x): + return torch.tensor(x.shape[1]) + + raw_data = np.random.randn(2, 3, 4).astype(np.float32) + torch_module = SymSizeIntModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + + +@tvm.testing.parametrize_targets("cuda") +def test_interpolate(target, dev): + class InterpolateModule(nn.Module): + def forward(self, x): + # Upsample to a fixed size. 
+ return F.interpolate(x, size=(16, 16), mode="nearest") + + raw_data = np.random.randn(2, 3, 8, 8).astype(np.float32) + torch_module = InterpolateModule().eval() + assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + if __name__ == "__main__": tvm.testing.main() From 8897b2861612c0a641adc128a6ceea6d08ef2f01 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 28 Apr 2025 11:01:36 -0400 Subject: [PATCH 13/18] add new tests from 17862 --- tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py index f68b5a234341..500e52151537 100644 --- a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py +++ b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py @@ -21,11 +21,13 @@ import numpy as np import torch from torch import nn +from torch.nn import functional as F from torch.export import export from tvm.relax.frontend.torch import from_exported_program from torch.nn import Softmax, Upsample + def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev): """ This util ensures that a torch module can successfully be exported to TVM From 2a9615370c63b07951dbf3a5675f08925262ccc6 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Mon, 28 Apr 2025 11:06:31 -0400 Subject: [PATCH 14/18] whitespace --- tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py index 500e52151537..0184688a9e84 100644 --- a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py +++ b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py @@ -27,7 +27,6 @@ from torch.nn import Softmax, Upsample - def assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev): """ This util ensures that a torch module can successfully be exported to TVM @@ -1054,5 +1053,6 @@ def forward(self, x): torch_module = InterpolateModule().eval() assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev) + if __name__ == "__main__": tvm.testing.main() From a60a8627546b686365c7189af016e1fdc29f4b80 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sun, 4 May 2025 14:05:20 -0400 Subject: [PATCH 15/18] print statemetns --- python/tvm/dlight/gpu/general_reduction.py | 53 ++++++++++++------- .../test_frontend_from_exported_program.py | 3 +- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/python/tvm/dlight/gpu/general_reduction.py b/python/tvm/dlight/gpu/general_reduction.py index 328a3eac914d..4fd310276f90 100644 --- a/python/tvm/dlight/gpu/general_reduction.py +++ b/python/tvm/dlight/gpu/general_reduction.py @@ -38,9 +38,16 @@ def apply( # pylint: disable=too-many-locals if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target): return None + print("\n##########################################################################################") + print(" ENTERING general_reduction.py ") + print("##########################################################################################\n") + + print("func:") + print(func) + if target.kind.name == "cuda": - len_tx = 256 - unroll_depth = 256 + len_tx = 256 # Number of threads per block + unroll_depth = 256 # How many iterations of loop to 
unroll elif target.kind.name == "opencl": len_tx = 256 unroll_depth = 64 @@ -54,6 +61,13 @@ def apply( # pylint: disable=too-many-locals if block_infos is None or len(block_infos) == 0: return None + print("block_infos:") + for block_info in block_infos: + print(block_info) + + print("len(block_infos):") + print(len(block_infos)) + dom_kind = block_infos[0].dom_kind() num_leading_s = len(dom_kind) - len(dom_kind.lstrip("S")) num_trailing_r = len(dom_kind) - len(dom_kind.rstrip("R")) @@ -61,23 +75,24 @@ def apply( # pylint: disable=too-many-locals # Align the number of block iters of the last block. num_last_block_iter = len(block_infos[-1].dom_kind()) - # If the last block is a scalar value, there is nothing left to - # tile/parallelise, and `iters` is an empty tuple. - # Add a unit thread loop so the final write happens inside a valid - # GPU thread environment. - if num_last_block_iter == 0: - # Put every block (both the running reductions and the final - # scalar write) inside a trivial GPU thread. The very first block - # gets a `blockIdx.x` wrapper so that kernels still have a unique - # block scope. - for i, info in enumerate(block_infos): - loop_rv = sch.add_unit_loop(info.block_rv) - if i == 0: - sch.bind(loop_rv, "blockIdx.x") - else: - sch.bind(loop_rv, "threadIdx.x") - - return sch + # FIXME: this block of code makes CrossEntropyLoss work but Sort fail. + # # If the last block is a scalar value, there is nothing left to + # # tile/parallelise, and `iters` is an empty tuple. + # # Add a unit thread loop so the final write happens inside a valid + # # GPU thread environment. + # if num_last_block_iter == 0: + # # Put every block (both the running reductions and the final + # # scalar write) inside a trivial GPU thread. The very first block + # # gets a `blockIdx.x` wrapper so that kernels still have a unique + # # block scope. 
+ # for i, info in enumerate(block_infos): + # loop_rv = sch.add_unit_loop(info.block_rv) + # if i == 0: + # sch.bind(loop_rv, "blockIdx.x") + # else: + # sch.bind(loop_rv, "threadIdx.x") + + # return sch if num_last_block_iter < len(dom_kind): diff --git a/tests/python/relax/test_frontend_from_exported_program.py b/tests/python/relax/test_frontend_from_exported_program.py index 3429927ca5c7..d39a5cee1e17 100644 --- a/tests/python/relax/test_frontend_from_exported_program.py +++ b/tests/python/relax/test_frontend_from_exported_program.py @@ -5006,7 +5006,7 @@ def main(x: R.Tensor((4, 3), dtype="float32")) -> R.Tuple(R.Tensor((), dtype="fl example_args1 = (torch.randn(4, 3, dtype=torch.float32),) verify_model(CrossEntropyModule(), example_args1, {}, Expected1) - + def test_linspace(): class Linspace(Module): def forward(self, input): @@ -5057,3 +5057,4 @@ def main( if __name__ == "__main__": tvm.testing.main() +1 \ No newline at end of file From fb85449896844f5bac93304a252309510aa17204 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sun, 4 May 2025 15:13:57 -0400 Subject: [PATCH 16/18] all tests pass --- python/tvm/dlight/gpu/general_reduction.py | 50 +++++++++++++--------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/python/tvm/dlight/gpu/general_reduction.py b/python/tvm/dlight/gpu/general_reduction.py index 4fd310276f90..756d332c9d98 100644 --- a/python/tvm/dlight/gpu/general_reduction.py +++ b/python/tvm/dlight/gpu/general_reduction.py @@ -65,8 +65,7 @@ def apply( # pylint: disable=too-many-locals for block_info in block_infos: print(block_info) - print("len(block_infos):") - print(len(block_infos)) + print("len(block_infos):", len(block_infos)) dom_kind = block_infos[0].dom_kind() num_leading_s = len(dom_kind) - len(dom_kind.lstrip("S")) @@ -75,28 +74,37 @@ def apply( # pylint: disable=too-many-locals # Align the number of block iters of the last block. num_last_block_iter = len(block_infos[-1].dom_kind()) - # FIXME: this block of code makes CrossEntropyLoss work but Sort fail. - # # If the last block is a scalar value, there is nothing left to - # # tile/parallelise, and `iters` is an empty tuple. - # # Add a unit thread loop so the final write happens inside a valid - # # GPU thread environment. - # if num_last_block_iter == 0: - # # Put every block (both the running reductions and the final - # # scalar write) inside a trivial GPU thread. The very first block - # # gets a `blockIdx.x` wrapper so that kernels still have a unique - # # block scope. - # for i, info in enumerate(block_infos): - # loop_rv = sch.add_unit_loop(info.block_rv) - # if i == 0: - # sch.bind(loop_rv, "blockIdx.x") - # else: - # sch.bind(loop_rv, "threadIdx.x") - - # return sch + print("num_last_block_iter:", num_last_block_iter) + print("len(dom_kind)",len(dom_kind)) + print("dom_kind:", dom_kind) + if num_last_block_iter < len(dom_kind): + # FIXME: this block of code makes CrossEntropyLoss work but Sort fail. + # # If the last block is a scalar value, there is nothing left to + # # tile/parallelise, and `iters` is an empty tuple. + # # Add a unit thread loop so the final write happens inside a valid + # # GPU thread environment. + if num_last_block_iter == 0: + print("ENTERING THE THING!!!!!!!!!!!!!!!!!") + # assert 0, "ENTERING THE THING!!!!!!!!!!!!!!!!!" + + # Put every block (both the running reductions and the final + # scalar write) inside a trivial GPU thread. The very first block + # gets a `blockIdx.x` wrapper so that kernels still have a unique + # block scope. 
+ for i, info in enumerate(block_infos): + loop_rv = sch.add_unit_loop(info.block_rv) + if i == 0: + sch.bind(loop_rv, "blockIdx.x") + else: + sch.bind(loop_rv, "threadIdx.x") + + return sch + def f_layout_mapping(*iters): + print("iters:", iters) analyzer = arith.Analyzer() # Try to match the iters of last block to the iters of the first block. # For matched positions, use the iter from the input `iters`. @@ -121,6 +129,8 @@ def f_layout_mapping(*iters): ) + list(iters) index_map = tir.IndexMap.from_func(f_layout_mapping, ndim=num_last_block_iter) + print("index_map:") + print(index_map) sch.transform_block_layout(block_infos[-1].block_rv, index_map) try: From 22ede511afa5985cc071de93abcb66d1c21f929d Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sun, 4 May 2025 15:16:09 -0400 Subject: [PATCH 17/18] cleanup - all tests still pass --- python/tvm/dlight/gpu/general_reduction.py | 34 +++------------------- 1 file changed, 4 insertions(+), 30 deletions(-) diff --git a/python/tvm/dlight/gpu/general_reduction.py b/python/tvm/dlight/gpu/general_reduction.py index 756d332c9d98..1a43a629e80f 100644 --- a/python/tvm/dlight/gpu/general_reduction.py +++ b/python/tvm/dlight/gpu/general_reduction.py @@ -38,13 +38,6 @@ def apply( # pylint: disable=too-many-locals if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target): return None - print("\n##########################################################################################") - print(" ENTERING general_reduction.py ") - print("##########################################################################################\n") - - print("func:") - print(func) - if target.kind.name == "cuda": len_tx = 256 # Number of threads per block unroll_depth = 256 # How many iterations of loop to unroll @@ -61,12 +54,6 @@ def apply( # pylint: disable=too-many-locals if block_infos is None or len(block_infos) == 0: return None - print("block_infos:") - for block_info in block_infos: - print(block_info) - - print("len(block_infos):", len(block_infos)) - dom_kind = block_infos[0].dom_kind() num_leading_s = len(dom_kind) - len(dom_kind.lstrip("S")) num_trailing_r = len(dom_kind) - len(dom_kind.rstrip("R")) @@ -74,21 +61,12 @@ def apply( # pylint: disable=too-many-locals # Align the number of block iters of the last block. num_last_block_iter = len(block_infos[-1].dom_kind()) - print("num_last_block_iter:", num_last_block_iter) - print("len(dom_kind)",len(dom_kind)) - print("dom_kind:", dom_kind) - - if num_last_block_iter < len(dom_kind): - # FIXME: this block of code makes CrossEntropyLoss work but Sort fail. - # # If the last block is a scalar value, there is nothing left to - # # tile/parallelise, and `iters` is an empty tuple. - # # Add a unit thread loop so the final write happens inside a valid - # # GPU thread environment. + # If the last block is a scalar value, there is nothing left to + # tile/parallelise, and `iters` is an empty tuple. + # Add a unit thread loop so the final write happens inside a valid + # GPU thread environment. if num_last_block_iter == 0: - print("ENTERING THE THING!!!!!!!!!!!!!!!!!") - # assert 0, "ENTERING THE THING!!!!!!!!!!!!!!!!!" - # Put every block (both the running reductions and the final # scalar write) inside a trivial GPU thread. 
The very first block # gets a `blockIdx.x` wrapper so that kernels still have a unique @@ -102,9 +80,7 @@ def apply( # pylint: disable=too-many-locals return sch - def f_layout_mapping(*iters): - print("iters:", iters) analyzer = arith.Analyzer() # Try to match the iters of last block to the iters of the first block. # For matched positions, use the iter from the input `iters`. @@ -129,8 +105,6 @@ def f_layout_mapping(*iters): ) + list(iters) index_map = tir.IndexMap.from_func(f_layout_mapping, ndim=num_last_block_iter) - print("index_map:") - print(index_map) sch.transform_block_layout(block_infos[-1].block_rv, index_map) try: From 396d4471638d9651065581ace8c183dbff00d3aa Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Sun, 4 May 2025 15:26:51 -0400 Subject: [PATCH 18/18] cleanup. All nightly tests pass --- python/tvm/dlight/gpu/general_reduction.py | 5 ++--- .../test_nnapi/test_from_exported_to_cuda.py | 16 ++++++++++++++++ .../relax/test_frontend_from_exported_program.py | 3 ++- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/python/tvm/dlight/gpu/general_reduction.py b/python/tvm/dlight/gpu/general_reduction.py index 1a43a629e80f..b1564bf61fa9 100644 --- a/python/tvm/dlight/gpu/general_reduction.py +++ b/python/tvm/dlight/gpu/general_reduction.py @@ -39,8 +39,8 @@ def apply( # pylint: disable=too-many-locals return None if target.kind.name == "cuda": - len_tx = 256 # Number of threads per block - unroll_depth = 256 # How many iterations of loop to unroll + len_tx = 256 + unroll_depth = 256 elif target.kind.name == "opencl": len_tx = 256 unroll_depth = 64 @@ -60,7 +60,6 @@ def apply( # pylint: disable=too-many-locals # Align the number of block iters of the last block. num_last_block_iter = len(block_infos[-1].dom_kind()) - if num_last_block_iter < len(dom_kind): # If the last block is a scalar value, there is nothing left to # tile/parallelise, and `iters` is an empty tuple. 
diff --git a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py
index 0184688a9e84..3f0964cfa8ed 100644
--- a/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py
+++ b/tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py
@@ -1054,5 +1054,21 @@ def forward(self, x):
     assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev)
 
 
+@tvm.testing.parametrize_targets("cuda")
+def test_cross_entropy_module(target, dev):
+    class CrossEntropyModule(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.criterion = nn.CrossEntropyLoss()
+            self.target = torch.tensor([0, 1, 2, 1])
+
+        def forward(self, x):
+            return self.criterion(x, self.target)
+
+    raw_data = np.random.randn(4, 3).astype(np.float32)
+    torch_module = CrossEntropyModule().eval()
+    assert_torch_output_vs_tvm_from_exported_to_cuda(raw_data, torch_module, target, dev)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/relax/test_frontend_from_exported_program.py b/tests/python/relax/test_frontend_from_exported_program.py
index d39a5cee1e17..e5d307b895a9 100644
--- a/tests/python/relax/test_frontend_from_exported_program.py
+++ b/tests/python/relax/test_frontend_from_exported_program.py
@@ -5007,6 +5007,7 @@ def main(x: R.Tensor((4, 3), dtype="float32")) -> R.Tuple(R.Tensor((), dtype="fl
     example_args1 = (torch.randn(4, 3, dtype=torch.float32),)
     verify_model(CrossEntropyModule(), example_args1, {}, Expected1)
 
+
 def test_linspace():
     class Linspace(Module):
         def forward(self, input):
@@ -5057,4 +5058,4 @@ def main(
 
 if __name__ == "__main__":
     tvm.testing.main()
-1
\ No newline at end of file
+1
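
Reviewer note (not part of the patches above): the sketch below shows, in rough outline, the torch.export -> Relax -> CUDA round trip that the new tests drive through assert_torch_output_vs_tvm_from_exported_to_cuda. It is a minimal illustration assuming the default Relax build pipeline; TinyModel, the input shape, and the tolerances are placeholders, and the helper in the test files may differ in detail.

    import numpy as np
    import torch
    from torch import nn
    from torch.export import export

    import tvm
    from tvm import relax
    from tvm.relax.frontend.torch import from_exported_program


    class TinyModel(nn.Module):
        def forward(self, x):
            return torch.nn.functional.relu(x)


    torch_module = TinyModel().eval()
    raw_data = np.random.randn(2, 3).astype("float32")

    # Export the module and translate the ExportedProgram into a Relax IRModule.
    exported_program = export(torch_module, args=(torch.from_numpy(raw_data),))
    mod = from_exported_program(exported_program)

    # Build for CUDA; the default lowering pipeline for the target is expected
    # to legalize and schedule the module (this is where dlight rules such as
    # GeneralReduction come into play).
    target = tvm.target.Target("cuda")
    dev = tvm.cuda(0)
    ex = relax.build(mod, target=target)
    vm = relax.VirtualMachine(ex, dev)

    # from_exported_program typically wraps outputs in a tuple, so unwrap the
    # first element before comparing against eager PyTorch.
    tvm_out = vm["main"](tvm.nd.array(raw_data, dev))[0].numpy()
    with torch.no_grad():
        torch_out = torch_module(torch.from_numpy(raw_data)).numpy()
    np.testing.assert_allclose(tvm_out, torch_out, rtol=1e-5, atol=1e-5)

The new tests themselves can be run with pytest, for example
pytest tests/python/relax/test_from_exported_to_cuda.py -k sort and
pytest tests/python/nightly/test_nnapi/test_from_exported_to_cuda.py -k cross_entropy.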