diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py index 55c64fef326..4d2449f946c 100644 --- a/backends/arm/_passes/__init__.py +++ b/backends/arm/_passes/__init__.py @@ -7,7 +7,6 @@ from . import arm_pass_utils # noqa from .arm_pass import ArmPass # noqa # usort: skip from .add_bias_pass import AddBiasPass # noqa -from .annotate_channels_last_dim_order_pass import AnnotateChannelsLastDimOrder # noqa from .annotate_decomposed_matmul import AnnotateDecomposedMatmulPass # noqa from .broadcast_args_pass import BroadcastArgsPass # noqa from .cast_bool_to_int8_pass import CastBoolToInt8Pass # noqa @@ -85,6 +84,7 @@ ) from .scalars_to_attribute_pass import ScalarsToAttributePass # noqa from .size_adjust_input_pass import SizeAdjustInputPass # noqa +from .to_tosa_memory_format_pass import ToTosaMemoryFormatPass # noqa from .unsqueeze_before_repeat_pass import UnsqueezeBeforeRepeatPass # noqa from .unsqueeze_scalar_placeholders_pass import UnsqueezeScalarPlaceholdersPass # noqa from .replace_inf_values_pass import ReplaceInfValues # noqa # usort: skip diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py index 7aab59ac310..7592be1d7da 100644 --- a/backends/arm/_passes/arm_pass_manager.py +++ b/backends/arm/_passes/arm_pass_manager.py @@ -10,7 +10,6 @@ import executorch.backends.arm.tosa.dialect # noqa: unused from executorch.backends.arm._passes import ( AddBiasPass, - AnnotateChannelsLastDimOrder, AnnotateDecomposedMatmulPass, BroadcastArgsPass, CastBoolToInt8Pass, @@ -84,6 +83,7 @@ RetraceFoldedDtypesPass, ScalarsToAttributePass, SizeAdjustInputPass, + ToTosaMemoryFormatPass, UnsqueezeBeforeRepeatPass, UnsqueezeScalarPlaceholdersPass, ) @@ -162,7 +162,7 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule: self.add_pass(InsertTableOpsPass(exported_program)) self.add_pass(FuseEqualPlaceholdersPass(exported_program)) - self.add_pass(AnnotateChannelsLastDimOrder()) + 
self.add_pass(ToTosaMemoryFormatPass(exported_program)) self.add_pass(InsertRescalePass()) return self._transform(exported_program.graph_module) @@ -241,7 +241,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule: self.add_pass(AddBiasPass(exported_program)) self.add_pass(InsertTableOpsPass(exported_program)) self.add_pass(FuseEqualPlaceholdersPass(exported_program)) - self.add_pass(AnnotateChannelsLastDimOrder()) + self.add_pass(ToTosaMemoryFormatPass(exported_program)) self.add_pass(InsertRescalePass()) return self._transform(exported_program.graph_module) diff --git a/backends/arm/_passes/decompose_select.py b/backends/arm/_passes/decompose_select.py index 9a25b7c28ae..99c89f474ea 100644 --- a/backends/arm/_passes/decompose_select.py +++ b/backends/arm/_passes/decompose_select.py @@ -7,7 +7,10 @@ # pyre-unsafe import torch -from executorch.backends.arm._passes.arm_pass_utils import create_node +from executorch.backends.arm._passes.arm_pass_utils import ( + create_node, + get_first_fake_tensor, +) from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -34,8 +37,9 @@ def call(self, graph_module: torch.fx.GraphModule): input_node, dim, index = node.args - rank = len(input_node.meta["val"].size()) - shape = input_node.meta["val"].shape + input_tensor = get_first_fake_tensor(input_node) + rank = len(input_tensor.size()) + shape = input_tensor.shape dim = dim % rank if dim < 0 else dim index = index % shape[dim] if index < 0 else index @@ -44,7 +48,7 @@ def call(self, graph_module: torch.fx.GraphModule): graph_module.graph, slice_op, (input_node, dim, index, index + 1) ) squeeze_node = create_node( - graph_module.graph, squeeze_op, (slice_node, [dim]) + graph_module.graph, squeeze_op, (slice_node, [dim]), from_node=node ) node.replace_all_uses_with(squeeze_node) diff --git a/backends/arm/_passes/annotate_channels_last_dim_order_pass.py 
b/backends/arm/_passes/to_tosa_memory_format_pass.py similarity index 73% rename from backends/arm/_passes/annotate_channels_last_dim_order_pass.py rename to backends/arm/_passes/to_tosa_memory_format_pass.py index 0ce8d667b3c..49482a70059 100644 --- a/backends/arm/_passes/annotate_channels_last_dim_order_pass.py +++ b/backends/arm/_passes/to_tosa_memory_format_pass.py @@ -10,13 +10,22 @@ from executorch.backends.arm._passes.arm_pass_utils import ( create_node, get_first_fake_tensor, + is_param_node, ) from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d +from executorch.exir import ExportedProgram from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult -class AnnotateChannelsLastDimOrder(ExportPass): +def _is_input(node: torch.fx.Node, exported_program: ExportedProgram) -> bool: + """ + Returns True if the node is an input node, i.e. a placeholder or a parameter. + """ + return node.op == "placeholder" and not is_param_node(exported_program, node) + + +class ToTosaMemoryFormatPass(ExportPass): """ Annotates each node with a tosa_dim_order. tosa_dim_order can be seen as a channels-last dim-order that in most cases will be (0, 2, 3, 1) for nodes with 4D-shapes. 
The pass also inserts backend.tosa.TRANSPOSE @@ -30,6 +39,10 @@ class AnnotateChannelsLastDimOrder(ExportPass): NNHWC_order = (0, 1, 3, 4, 2) NNHWC_inverse_order = (0, 1, 4, 2, 3) + def __init__(self, exported_program: ExportedProgram) -> None: + self.exported_program = exported_program + super().__init__() + def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node): """ returns True for w in the following sequence; @@ -92,6 +105,11 @@ def is_channel_reshape(input_shape, output_shape): @staticmethod def insert_input_transpose(node, input_node, graph_module): + if input_node.target == exir_ops.backend.tosa.TRANSPOSE.default: + pre_permute_node = input_node.all_input_nodes[0] + node.replace_input_with(input_node, pre_permute_node) + return + with graph_module.graph.inserting_before(node): permute_node = create_node( graph_module.graph, @@ -99,18 +117,18 @@ def insert_input_transpose(node, input_node, graph_module): args=( input_node, list( - AnnotateChannelsLastDimOrder.NNHWC_inverse_order + ToTosaMemoryFormatPass.NNHWC_inverse_order if len(get_first_fake_tensor(input_node).size()) == 5 - else AnnotateChannelsLastDimOrder.NHWC_inverse_order + else ToTosaMemoryFormatPass.NHWC_inverse_order ), ), + from_node=node, ) node.replace_input_with(input_node, permute_node) permute_node.meta["tosa_dim_order"] = tuple( range(len(input_node.meta["val"].size())) ) - permute_node.meta["val"] = input_node.meta["val"] @staticmethod def insert_output_transpose(node, graph_module): @@ -121,25 +139,23 @@ def insert_output_transpose(node, graph_module): args=( node, list( - AnnotateChannelsLastDimOrder.NNHWC_order + ToTosaMemoryFormatPass.NNHWC_order if len(get_first_fake_tensor(node).size()) == 5 - else AnnotateChannelsLastDimOrder.NHWC_order + else ToTosaMemoryFormatPass.NHWC_order ), ), + from_node=node, ) + permute_node.meta["tosa_dim_order"] = ( - AnnotateChannelsLastDimOrder.NNHWC_order + ToTosaMemoryFormatPass.NNHWC_order if len(get_first_fake_tensor(node).size()) == 5 - 
else AnnotateChannelsLastDimOrder.NHWC_order - ) - permute_node.meta["val"] = get_first_fake_tensor(node).permute( - AnnotateChannelsLastDimOrder.NNHWC_order - if len(get_first_fake_tensor(node).size()) == 5 - else AnnotateChannelsLastDimOrder.NHWC_order + else ToTosaMemoryFormatPass.NHWC_order ) node.meta["tosa_dim_order"] = tuple( range(len(get_first_fake_tensor(node).size())) ) + users = [user for user in node.users if user != permute_node] for user in users: user.replace_input_with(node, permute_node) @@ -150,20 +166,23 @@ def _insert_view_transpose( ): nchw_to_nhwc = len(input_shape) < 4 and len(output_shape) >= 4 nhwc_to_nchw = len(input_shape) >= 4 and len(output_shape) < 4 - channel_reshape = AnnotateChannelsLastDimOrder.is_channel_reshape( + channel_reshape = ToTosaMemoryFormatPass.is_channel_reshape( output_shape, input_shape ) if ( channel_reshape or nhwc_to_nchw - ) and AnnotateChannelsLastDimOrder.memory_format_differs(input_shape): - AnnotateChannelsLastDimOrder.insert_input_transpose( + ) and ToTosaMemoryFormatPass.memory_format_differs(input_shape): + + ToTosaMemoryFormatPass.insert_input_transpose( node, input_node, graph_module ) + if ( channel_reshape or nchw_to_nhwc - ) and AnnotateChannelsLastDimOrder.memory_format_differs(output_shape): - AnnotateChannelsLastDimOrder.insert_output_transpose(node, graph_module) + ) and ToTosaMemoryFormatPass.memory_format_differs(output_shape): + + ToTosaMemoryFormatPass.insert_output_transpose(node, graph_module) def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule): """ @@ -181,9 +200,10 @@ def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: # call_function and placeholder allowed due to # index.Tensor being able to come in as both - if node.op not in ["call_function", "placeholder"]: + if node.op not in ["call_function", "placeholder", "output"]: continue + # Transpose views elif node.target in ( exir_ops.edge.aten.view_copy.default, 
exir_ops.edge.aten.index.Tensor, @@ -194,25 +214,48 @@ def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule): input_node = node.args[0] input_shape = input_node.meta["val"].shape output_shape = node.meta["val"].shape - self._insert_view_transpose( - input_shape, output_shape, node, input_node, graph_module + input_shape, + output_shape, + node, + input_node, + graph_module, ) + # Transpose inputs + elif _is_input(node, self.exported_program): + input_shape = get_first_fake_tensor(node).size() + if len(input_shape) in (4, 5): + ToTosaMemoryFormatPass.insert_output_transpose(node, graph_module) + + # Transpose outputs + elif node.op == "output": + output_shape = get_first_fake_tensor(node).size() + + if len(output_shape) in (4, 5): + for input_node in node.all_input_nodes: + ToTosaMemoryFormatPass.insert_input_transpose( + node, input_node, graph_module + ) + def call(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: node_data = get_first_fake_tensor(node).data - if node_data.dim() == 4: + # Inputs and outputs are always in (N)NCHW format + if _is_input(node, self.exported_program) or node.op == "output": + dim_order = tuple(range(node_data.dim())) + elif node_data.dim() == 4: dim_order = self.NHWC_order if self.is_weight_node_for_depthwise_conv2d(node): # The weights of TOSA DEPTHWISE_CONV2D have shape (H, W, C, M) which corresponds to # dim_order = (2, 3, 0, 1) (https://www.mlplatform.org/tosa/tosa_spec.html#_depthwise_conv2d). dim_order = self.HWCM_order elif node_data.dim() == 5: - dim_order = self.NNHWC_order # type: ignore[assignment] + dim_order = self.NNHWC_order else: dim_order = tuple(range(node_data.dim())) # type: ignore[assignment] + node.meta["tosa_dim_order"] = dim_order # Insert TOSA transposes to convert between (N)NCHW and (N)NHWC format. # See insert_tosa_transposes for insertion conditions. 
diff --git a/backends/arm/operators/op_transpose.py b/backends/arm/operators/op_transpose.py index 91614874d23..accd79e8546 100644 --- a/backends/arm/operators/op_transpose.py +++ b/backends/arm/operators/op_transpose.py @@ -47,7 +47,14 @@ def define_node( validate_valid_dtype( self.target, [inputs[0], output], - [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32], + [ + ts.DType.INT8, + ts.DType.INT16, + ts.DType.INT32, + ts.DType.FP32, + ts.DType.BOOL, + ts.DType.FP16, + ], output.tosa_spec, ) diff --git a/backends/arm/runtime/EthosUBackend.cpp b/backends/arm/runtime/EthosUBackend.cpp index c91ad4021c4..bff5ff69284 100644 --- a/backends/arm/runtime/EthosUBackend.cpp +++ b/backends/arm/runtime/EthosUBackend.cpp @@ -261,9 +261,6 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { // Select a compatible copy routine including checking for input layouts // which require permutation. - bool permuted_input_shape; - ET_CHECK_OK_OR_RETURN_ERROR(check_requires_permute( - i, tensor_in, &handles.inputs->io[i], &permuted_input_shape)); bool both_int = tensor_in.scalar_type() == ScalarType::Int && handles.inputs->io[i].elem_size == 4; bool both_char = tensor_in.scalar_type() == ScalarType::Char && @@ -273,19 +270,7 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { bool both_bool = tensor_in.scalar_type() == ScalarType::Bool && (handles.inputs->io[i].elem_size == 1); - // Select a compatible copy routine - if ((both_char || both_bool) && permuted_input_shape) { - EXECUTORCH_PROF_SCOPE( - event_tracer, - "+EthosUBackend::execute()handles.input.permute_CHW_to_HWC()"); - // permuted byte copy CHW to HWC - permute_CHW_to_HWC( - tensor_in.mutable_data_ptr(), - scratch_addr, - tensor_in.size(1), - tensor_in.size(2), - tensor_in.size(3)); - } else if (both_char || both_int || both_short || both_bool) { + if (both_char || both_int || both_short || both_bool) { EXECUTORCH_PROF_SCOPE( event_tracer, 
"+EthosUBackend::execute()handles.input.memcpy()"); // Sizes match and elt size matches so memcpy @@ -297,18 +282,16 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { ET_LOG(Error, "No matching input copy routine"); return Error::InvalidProgram; } - if (!permuted_input_shape) { - calculate_dimensions( - tensor_in, &handles.inputs->io[i], &tensor_count, &io_count); - if (tensor_count != io_count) { - ET_LOG(Error, "Input tensor sizes do not match"); - ET_LOG( - Error, - "Program expects %d elements but got %d", - io_count, - tensor_count); - return Error::InvalidProgram; - } + calculate_dimensions( + tensor_in, &handles.inputs->io[i], &tensor_count, &io_count); + if (tensor_count != io_count) { + ET_LOG(Error, "Input tensor sizes do not match"); + ET_LOG( + Error, + "Program expects %d elements but got %d", + io_count, + tensor_count); + return Error::InvalidProgram; } } @@ -369,34 +352,13 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { tensor_dim = tensor_dim + tensor_count; io_dim = io_dim + io_count; - bool permuted_output_shape; - ET_CHECK_OK_OR_RETURN_ERROR(check_requires_permute( - i, tensor_out, &handles.outputs->io[i], &permuted_output_shape)); + EXECUTORCH_PROF_SCOPE( + event_tracer, "+EthosUBackend::execute()handles.output.memcpy()"); - if ((tensor_out.scalar_type() == ScalarType::Char || - tensor_out.scalar_type() == ScalarType::Bool) && - permuted_output_shape) { - EXECUTORCH_PROF_SCOPE( - event_tracer, - "+EthosUBackend::execute()handles.output.permute_HWC_to_CHW()"); - - const char* output_address = static_cast(output_addr); - - permute_HWC_to_CHW( - output_address, - tensor_out.mutable_data_ptr(), - tensor_out.size(1), - tensor_out.size(2), - tensor_out.size(3)); - } else { - EXECUTORCH_PROF_SCOPE( - event_tracer, "+EthosUBackend::execute()handles.output.memcpy()"); - - memcpy( - tensor_out.mutable_data_ptr(), - static_cast(output_addr), - tensor_out.nbytes()); - } + memcpy( + 
tensor_out.mutable_data_ptr(), + static_cast(output_addr), + tensor_out.nbytes()); } if (tensor_dim != io_dim) { ET_LOG(Error, "Total output tensor sizes do not match"); @@ -426,46 +388,6 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { *io_count = *io_count * io->shape[i]; } } - - Error check_requires_permute( - int index, - const executorch::aten::Tensor tensor, - VelaIO* io, - bool* is_permuted) const { - bool permuted_shape = false; - - if (tensor.dim() == 4) { - // special case for NHWC workaround in AOT; as the compilation has - // permuted to channel last in an undetectable way, we assume here - // that the application has similarly permuted any input/output tensors. - permuted_shape = tensor.size(0) == io->shape[0] && - tensor.size(1) == io->shape[3] && tensor.size(2) == io->shape[1] && - tensor.size(3) == io->shape[2]; - if (permuted_shape) { - ET_LOG(Debug, "Tensor input/output %d will be permuted", index); - } - } - *is_permuted = permuted_shape; - return Error::Ok; - } - - void permute_CHW_to_HWC(const char* input, char* output, int C, int H, int W) - const { - for (int i = 0; i != H * W; ++i) { - for (int j = 0; j < C; ++j) { - output[i * C + j] = input[i + j * W * H]; - } - } - } - - void permute_HWC_to_CHW(const char* input, char* output, int C, int H, int W) - const { - for (int i = 0; i != H * W; ++i) { - for (int j = 0; j < C; ++j) { - output[i + j * W * H] = input[i * C + j]; - } - } - } }; namespace { @@ -476,4 +398,4 @@ static auto registered = register_backend(backend_id); } // namespace arm } // namespace backends -} // namespace executorch +} // namespace executorch \ No newline at end of file diff --git a/backends/arm/test/models/test_nn_functional.py b/backends/arm/test/models/test_nn_functional.py index 651f9585459..c1a9f312d85 100644 --- a/backends/arm/test/models/test_nn_functional.py +++ b/backends/arm/test/models/test_nn_functional.py @@ -83,6 +83,8 @@ def forward(self, *args): module_tests, xfails={ 
"affine_grid": "Int64 input. Partition handling fails since arange int64 output is split between 2 partitions.", + "unfold": "ValueError: Invalid TOSA graph", + "fold": "ValueError: Invalid TOSA graph", }, ) def test_nn_functional_FP(test_data): diff --git a/backends/arm/test/ops/test_amax.py b/backends/arm/test/ops/test_amax.py index 3600c34c94c..080dddda92e 100644 --- a/backends/arm/test/ops/test_amax.py +++ b/backends/arm/test/ops/test_amax.py @@ -95,10 +95,7 @@ def test_amax_u55_INT_not_delegated(): pipeline.run() -fvp_xfails = {"rank_4_mult_batches": "MLETORCH-517 : Multiple batches not supported"} - - -@common.parametrize("test_data", Amax.test_data, fvp_xfails, strict=False) +@common.parametrize("test_data", Amax.test_data) @common.XfailIfNoCorstone320 def test_amax_u85_INT(test_data: Amax.input_t): data, dim, keep_dims = test_data() diff --git a/backends/arm/test/ops/test_amin.py b/backends/arm/test/ops/test_amin.py index 3ae94fe3c6e..a24da9e1ba0 100644 --- a/backends/arm/test/ops/test_amin.py +++ b/backends/arm/test/ops/test_amin.py @@ -104,10 +104,7 @@ def test_amin_u55_INT_not_delegated(): pipeline.run() -fvp_xfails = {"rank_4_mult_batches": "MLETORCH-517 : Multiple batches not supported"} - - -@common.parametrize("test_data", Amin.test_data, fvp_xfails, strict=False) +@common.parametrize("test_data", Amin.test_data) @common.XfailIfNoCorstone320 def test_amin_u85_INT(test_data: Amin.input_t): data, dim, keep_dims = test_data() diff --git a/backends/arm/test/ops/test_bitwise.py b/backends/arm/test/ops/test_bitwise.py index 1c0f0e36a6a..46e84361573 100644 --- a/backends/arm/test/ops/test_bitwise.py +++ b/backends/arm/test/ops/test_bitwise.py @@ -4,8 +4,10 @@ # LICENSE file in the root directory of this source tree. 
+from copy import copy from typing import Tuple +import pytest import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.test_pipeline import ( @@ -16,7 +18,6 @@ VgfPipeline, ) - input_t2 = Tuple[torch.Tensor, torch.Tensor] # Input x, y @@ -56,6 +57,9 @@ class BitwiseBinary(torch.nn.Module): ), } + test_data_u85 = copy(test_data) + del test_data_u85["zeros"] + class BitwiseBinaryScalar(torch.nn.Module): test_data = { @@ -77,6 +81,9 @@ class BitwiseBinaryScalar(torch.nn.Module): ), } + test_data_u85 = copy(test_data) + del test_data_u85["zeros"] + class And(BitwiseBinary): aten_op = "torch.ops.aten.bitwise_and.Tensor" @@ -226,7 +233,7 @@ def test_bitwise_and_scalar_u55_INT(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", AndScalar.test_data) +@common.parametrize("test_data", AndScalar.test_data_u85) @common.XfailIfNoCorstone320 def test_bitwise_and_scalar_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -244,7 +251,7 @@ def test_bitwise_and_scalar_u85_INT(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", And().test_data) +@common.parametrize("test_data", And().test_data_u85) @common.XfailIfNoCorstone320 def test_bitwise_and_tensor_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -427,7 +434,7 @@ def test_bitwise_xor_scalar_u55_INT(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", Xor().test_data) +@common.parametrize("test_data", Xor().test_data_u85) @common.XfailIfNoCorstone320 def test_bitwise_xor_tensor_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -445,7 +452,7 @@ def test_bitwise_xor_tensor_u85_INT(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", XorScalar.test_data) +@common.parametrize("test_data", XorScalar.test_data_u85) @common.XfailIfNoCorstone320 def test_bitwise_xor_scalar_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ 
-628,7 +635,7 @@ def test_bitwise_or_scalar_u55_INT(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", Or().test_data) +@common.parametrize("test_data", Or().test_data_u85) @common.XfailIfNoCorstone320 def test_bitwise_or_tensor_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -646,7 +653,7 @@ def test_bitwise_or_tensor_u85_INT(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", OrScalar.test_data) +@common.parametrize("test_data", OrScalar.test_data_u85) @common.XfailIfNoCorstone320 def test_bitwise_or_scalar_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -730,3 +737,30 @@ def test_bitwise_or_scalar_vgf_INT(test_data: input_t2): pipeline.pop_stage("quantize") pipeline.pop_stage("check.quant_nodes") pipeline.run() + + +@pytest.mark.xfail( + reason="MLBEDSW-11029: Fatal Python floating point error in Vela for rank 4 bitwise ops with int32 dtype." +) +def test_bitwise_or_tensor_u85_INT_zeros(): + raise RuntimeError( + "Dummy test to xfail mark u85 zeros test case since running the actual test causes a fatal crash." + ) + + +@pytest.mark.xfail( + reason="MLBEDSW-11029: Fatal Python floating point error in Vela for rank 4 bitwise ops with int32 dtype." +) +def test_bitwise_and_tensor_u85_INT_zeros(): + raise RuntimeError( + "Dummy test to xfail mark u85 zeros test case since running the actual test causes a fatal crash." + ) + + +@pytest.mark.xfail( + reason="MLBEDSW-11029: Fatal Python floating point error in Vela for rank 4 bitwise ops with int32 dtype." +) +def test_bitwise_xor_tensor_u85_INT_zeros(): + raise RuntimeError( + "Dummy test to xfail mark u85 zeros test case since running the actual test causes a fatal crash." 
+ ) diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py index 826689622fb..55578aa15c6 100644 --- a/backends/arm/test/ops/test_cat.py +++ b/backends/arm/test/ops/test_cat.py @@ -105,15 +105,7 @@ def test_cat_tosa_INT(test_data: Tuple): pipeline.run() -x_fails = { - "cat_rand_two_tensors_dim_0": "MLETORCH-630: AssertionError: Output 0 does not match reference output.", - "cat_rand_two_tensors_dim_0": "MLETORCH-630: AssertionError: Output 0 does not match reference output.", - "cat_rand_two_tensors_dim_3": "MLETORCH-630: AssertionError: Output 0 does not match reference output.", - "cat_rand_large": "MLETORCH-630: AssertionError: Output 0 does not match reference output.", -} - - -@common.parametrize("test_data", Cat.test_parameters, x_fails) +@common.parametrize("test_data", Cat.test_parameters) @common.XfailIfNoCorstone300 def test_cat_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( @@ -126,7 +118,7 @@ def test_cat_u55_INT(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", Cat.test_parameters, x_fails) +@common.parametrize("test_data", Cat.test_parameters) @common.XfailIfNoCorstone320 def test_cat_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index 7a24848697e..b4f2879be48 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -106,6 +106,9 @@ def test_clone_u85_INT(test_data): @common.parametrize("test_data", test_data_suite) @common.SkipIfNoModelConverter +@pytest.mark.xfail( + reason="Empty subgraph leads to Vela compilation failure. 
See: https://jira.arm.com/browse/MLBEDSW-10477" +) def test_clone_vgf_FP(test_data): pipeline = VgfPipeline[input_t]( Clone(), test_data(), aten_op, exir_op, tosa_version="TOSA-1.0+FP" @@ -115,6 +118,9 @@ def test_clone_vgf_FP(test_data): @common.parametrize("test_data", test_data_suite) @common.SkipIfNoModelConverter +@pytest.mark.xfail( + reason="Empty subgraph leads to Vela compilation failure. See: https://jira.arm.com/browse/MLBEDSW-10477" +) def test_clone_vgf_INT(test_data): pipeline = VgfPipeline[input_t]( Clone(), diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py index 0d23d2a6c7e..0300f7c2049 100644 --- a/backends/arm/test/ops/test_conv2d.py +++ b/backends/arm/test/ops/test_conv2d.py @@ -388,15 +388,6 @@ def forward(self, x): for q in [True, False] } -fvp_xfails = { - f"{k},per_channel_quant={q}": reason - for k, reason in { - "2x2_3x2x40x40_nobias": "MLETORCH-520: Numerical issues on FVP.", - "5x5_3x2x128x128_st1": "MLETORCH-520: Numerical issues on FVP.", - }.items() - for q in [True, False] -} - input_t = Tuple[torch.Tensor] @@ -426,7 +417,7 @@ def test_convolution_2d_tosa_INT(test_data): pipeline.run() -@common.parametrize("test_data", test_data_INT, fvp_xfails) +@common.parametrize("test_data", test_data_INT) @common.XfailIfNoCorstone300 def test_convolution_2d_u55_INT(test_data): model, per_channel_quantization = test_data() @@ -441,7 +432,7 @@ def test_convolution_2d_u55_INT(test_data): pipeline.run() -@common.parametrize("test_data", test_data_INT, fvp_xfails) +@common.parametrize("test_data", test_data_INT) @common.XfailIfNoCorstone320 def test_convolution_u85_INT(test_data): model, per_channel_quantization = test_data() diff --git a/backends/arm/test/ops/test_cosh.py b/backends/arm/test/ops/test_cosh.py index 14b7def60cd..60920d03f94 100644 --- a/backends/arm/test/ops/test_cosh.py +++ b/backends/arm/test/ops/test_cosh.py @@ -73,7 +73,14 @@ def test_cosh_u55_INT(test_data: Tuple): 
@common.XfailIfNoCorstone320 -@common.parametrize("test_data", test_data_suite) +@common.parametrize( + "test_data", + test_data_suite, + xfails={ + "ones_4D": "MLBEDSW-11046 - Incorrect output for TABLE followed by RESHAPE" + }, + strict=False, +) def test_cosh_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Cosh(), (test_data,), aten_ops=aten_op, exir_ops=exir_op diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index bf6aad840ac..0f8b34d3d47 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -199,7 +199,7 @@ @common.parametrize("test_data", test_data_conv1d_FP | test_data_conv2d_FP) -def test_depthwise_convolution_2d_tosa_FP(test_data: torch.nn.Module): +def test_convolution_2d_tosa_FP_depthwise(test_data: torch.nn.Module): pipeline = TosaPipelineFP[input_t]( test_data(), test_data().get_inputs(), @@ -211,7 +211,7 @@ def test_depthwise_convolution_2d_tosa_FP(test_data: torch.nn.Module): @pytest.mark.flaky(reruns=5) # TODO: Investigate flakyness (MLTORCH-307) @common.parametrize("test_data", test_data_conv1d_INT | test_data_conv2d_INT) -def test_depthwise_convolution_2d_tosa_INT(test_data): +def test_convolution_2d_tosa_INT_depthwise(test_data): model, per_channel_quantization = test_data() pipeline = TosaPipelineINT[input_t]( model, @@ -225,7 +225,7 @@ def test_depthwise_convolution_2d_tosa_INT(test_data): @common.parametrize("test_data", test_data_conv1d_FP | test_data_conv2d_FP) @common.SkipIfNoModelConverter -def test_depthwise_convolution_2d_vgf_FP(test_data: torch.nn.Module): +def test_convolution_2d_vgf_FP_depthwise(test_data: torch.nn.Module): model = test_data() pipeline = VgfPipeline[input_t]( model, @@ -239,7 +239,7 @@ def test_depthwise_convolution_2d_vgf_FP(test_data: torch.nn.Module): @common.parametrize("test_data", test_data_conv1d_INT | test_data_conv2d_INT) @common.SkipIfNoModelConverter -def 
test_depthwise_convolution_2d_vgf_INT(test_data): +def test_convolution_2d_vgf_INT_depthwise(test_data): model, per_channel_quantization = test_data() pipeline = VgfPipeline[input_t]( model, @@ -251,19 +251,9 @@ def test_depthwise_convolution_2d_vgf_INT(test_data): pipeline.run() -x_fails = { - f"{k},per_channel_quant={q}": reason - for k, reason in { - "3x3_2x8x198x198_gp8_st3": "MLETORCH-517: Operators fail with batches > 1", - "two_dw_conv2d": "MLETORCH-517: Operators fail with batches > 1", - }.items() - for q in [True, False] -} - - @common.XfailIfNoCorstone300 # TODO: MLETORCH-516 -@common.parametrize("test_data", test_data_conv2d_INT, x_fails) -def test_depthwise_convolution_2d_u55_INT(test_data): +@common.parametrize("test_data", test_data_conv2d_INT) +def test_convolution_2d_u55_INT_depthwise(test_data): model, per_channel_quantization = test_data() pipeline = EthosU55PipelineINT[input_t]( model, @@ -278,7 +268,7 @@ def test_depthwise_convolution_2d_u55_INT(test_data): @common.XfailIfNoCorstone300 # TODO: MLETORCH-516 @common.parametrize("test_data", test_data_conv1d_INT) -def test_depthwise_convolution_1d_u55_INT(test_data): +def test_convolution_1d_u55_INT_depthwise(test_data): model, per_channel_quantization = test_data() pipeline = EthosU55PipelineINT[input_t]( model, @@ -292,8 +282,8 @@ def test_depthwise_convolution_1d_u55_INT(test_data): @common.XfailIfNoCorstone320 # TODO: MLETORCH-516 -@common.parametrize("test_data", test_data_conv2d_INT, x_fails) -def test_depthwise_convolution_2d_u85_INT(test_data): +@common.parametrize("test_data", test_data_conv2d_INT) +def test_convolution_2d_u85_INT_depthwise(test_data): model, per_channel_quantization = test_data() pipeline = EthosU85PipelineINT[input_t]( model, @@ -307,8 +297,8 @@ def test_depthwise_convolution_2d_u85_INT(test_data): @common.XfailIfNoCorstone320 # TODO: MLETORCH-516 -@common.parametrize("test_data", test_data_conv1d_INT, x_fails) -def test_depthwise_convolution_1d_u85_INT(test_data): 
+@common.parametrize("test_data", test_data_conv1d_INT) +def test_convolution_1d_u85_INT_depthwise(test_data): model, per_channel_quantization = test_data() pipeline = EthosU85PipelineINT[input_t]( model, diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index 026939758a0..5bacac1c962 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -101,17 +101,7 @@ def test_div_tensor_tosa_INT(test_data: Tuple): pipeline.run() -x_fails = { - "op_div_rank4_ones": "MLETORCH-521: Numerical issues on FVP likely due to mul op", - "op_div_rank4_negative_ones": "MLETORCH-521: Numerical issues on FVP likely due to mul op", - "op_div_rank4_ones_div_negative": "MLETORCH-521: Numerical issues on FVP likely due to mul op", - "op_div_rank4_large_rand": "MLETORCH-521: Numerical issues on FVP likely due to mul op", - "op_div_rank4_negative_large_rand": "MLETORCH-521: Numerical issues on FVP likely due to mul op", - "op_div_rank4_large_randn": "MLETORCH-521: Numerical issues on FVP likely due to mul op", -} - - -@common.parametrize("test_data", test_data_suite, xfails=x_fails) +@common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone300 def test_div_tensor_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( @@ -124,7 +114,7 @@ def test_div_tensor_u55_INT(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", test_data_suite, xfails=x_fails) +@common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone320 def test_div_tensor_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py index 607d8650946..a0a7ccadeb4 100644 --- a/backends/arm/test/ops/test_expand.py +++ b/backends/arm/test/ops/test_expand.py @@ -70,14 +70,7 @@ def test_expand_tosa_INT(test_data: Tuple): pipeline.run() -x_fails = { - "rand_batch_2": "AssertionError: Output 0 does not match reference 
output.", - "rand_mix_neg": "AssertionError: Output 0 does not match reference output.", - "rand_small_neg": "AssertionError: Output 0 does not match reference output.", -} - - -@common.parametrize("test_data", Expand.test_parameters, x_fails) +@common.parametrize("test_data", Expand.test_parameters) @common.XfailIfNoCorstone300 def test_expand_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( @@ -90,7 +83,7 @@ def test_expand_u55_INT(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", Expand.test_parameters, x_fails) +@common.parametrize("test_data", Expand.test_parameters) @common.XfailIfNoCorstone320 def test_expand_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( diff --git a/backends/arm/test/ops/test_ge.py b/backends/arm/test/ops/test_ge.py index c66f6d164b9..94f33d28630 100644 --- a/backends/arm/test/ops/test_ge.py +++ b/backends/arm/test/ops/test_ge.py @@ -153,7 +153,6 @@ def test_ge_scalar_u55_INT(test_module): @common.parametrize( "test_module", test_data_tensor, - xfails={"ge_tensor_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85"}, ) @common.XfailIfNoCorstone320 def test_ge_tensor_u85_INT(test_module): @@ -170,7 +169,6 @@ def test_ge_tensor_u85_INT(test_module): @common.parametrize( "test_module", test_data_scalar, - xfails={"ge_scalar_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85"}, ) @common.XfailIfNoCorstone320 def test_ge_scalar_u85_INT(test_module): diff --git a/backends/arm/test/ops/test_gt.py b/backends/arm/test/ops/test_gt.py index 83c85e5f9fc..41229397eb5 100644 --- a/backends/arm/test/ops/test_gt.py +++ b/backends/arm/test/ops/test_gt.py @@ -154,9 +154,6 @@ def test_gt_scalar_u55_INT(test_module): @common.parametrize( "test_module", test_data_tensor, - xfails={ - "gt_tensor_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", - }, ) @common.XfailIfNoCorstone320 def test_gt_tensor_u85_INT(test_module): @@ -173,9 +170,6 @@ def 
test_gt_tensor_u85_INT(test_module): @common.parametrize( "test_module", test_data_scalar, - xfails={ - "gt_scalar_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", - }, ) @common.XfailIfNoCorstone320 def test_gt_scalar_u85_INT(test_module): diff --git a/backends/arm/test/ops/test_le.py b/backends/arm/test/ops/test_le.py index 6cb185ecb92..31422302a2d 100644 --- a/backends/arm/test/ops/test_le.py +++ b/backends/arm/test/ops/test_le.py @@ -155,9 +155,6 @@ def test_le_scalar_u55_INT_not_delegated(test_module): @common.parametrize( "test_module", test_data_tensor, - xfails={ - "le_tensor_rank4_randn": "4D fails because boolean Tensors can't be subtracted" - }, ) @common.XfailIfNoCorstone320 def test_le_tensor_u85_INT(test_module): @@ -175,9 +172,6 @@ def test_le_tensor_u85_INT(test_module): @common.parametrize( "test_module", test_data_scalar, - xfails={ - "le_scalar_rank4_randn": "4D fails because boolean Tensors can't be subtracted" - }, ) @common.XfailIfNoCorstone320 def test_le_scalar_u85_INT(test_module): diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index 57ce490dae8..f5cb2e952e5 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -183,24 +183,9 @@ def test_linear_u55_INT(test_data: torch.Tensor): ).run() -x_fail = { - f"{k},per_channel_quant={q}": reason - for k, reason in { - "model_linear_rank4_zeros": "AssertionError: Output 0 does not match reference output.", - "model_linear_rank4_ones": "AssertionError: Output 0 does not match reference output.", - "model_linear_rank4_negative_ones": "AssertionError: Output 0 does not match reference output.", - "model_linear_rank4_rand": "AssertionError: Output 0 does not match reference output.", - "model_linear_rank4_negative_large_rand": "AssertionError: Output 0 does not match reference output.", - "model_linear_rank4_large_randn": "AssertionError: Output 0 does not match reference output.", - }.items() - for q in 
[True, False] -} - - @common.parametrize( "test_data", test_data_rank1_INT | test_data_rank4_INT, - x_fail, ) @common.XfailIfNoCorstone320 def test_linear_u85_INT(test_data: torch.Tensor): diff --git a/backends/arm/test/ops/test_logical.py b/backends/arm/test/ops/test_logical.py index 2b160ce7b50..bb7c5773342 100644 --- a/backends/arm/test/ops/test_logical.py +++ b/backends/arm/test/ops/test_logical.py @@ -86,6 +86,9 @@ def forward(self, tensor: torch.Tensor): ################# +xfails = {"rand_rank4": "MLBEDSW-11031: Output diff on u85 bool transpose."} + + @common.parametrize("test_data", And().test_data) def test_logical_and_tosa_FP(test_data: input_t2): pipeline = TosaPipelineFP[input_t2]( @@ -129,7 +132,7 @@ def test_logical_and_u55_INT_not_delegated(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", And().test_data) +@common.parametrize("test_data", And().test_data, xfails=xfails) @common.XfailIfNoCorstone320 def test_logical_and_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -223,7 +226,7 @@ def test_logical_xor_u55_INT_not_delegated(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", Xor().test_data) +@common.parametrize("test_data", Xor().test_data, xfails=xfails) @common.XfailIfNoCorstone320 def test_logical_xor_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -317,7 +320,7 @@ def test_logical_or_u55_INT_not_delegated(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", Or().test_data) +@common.parametrize("test_data", Or().test_data, xfails=xfails) @common.XfailIfNoCorstone320 def test_logical_or_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -411,7 +414,7 @@ def test_logical_not_u55_INT_not_delegated(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", Not().test_data) +@common.parametrize("test_data", Not().test_data, xfails=xfails) @common.XfailIfNoCorstone320 def 
test_logical_not_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( diff --git a/backends/arm/test/ops/test_logsoftmax.py b/backends/arm/test/ops/test_logsoftmax.py index b1b934fbcc8..791069aa4b0 100644 --- a/backends/arm/test/ops/test_logsoftmax.py +++ b/backends/arm/test/ops/test_logsoftmax.py @@ -68,7 +68,7 @@ def test_log_softmax_tosa_INT(test_data): "test_data", LogSoftmax.test_data, xfails={ - "randn_mult_batches": "MLETORCH-433: Multiple batches not supported on FVP" + "randn_neg_dim": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55." }, ) @common.XfailIfNoCorstone300() @@ -85,13 +85,7 @@ def test_log_softmax_u55_INT(test_data): pipeline.run() -@common.parametrize( - "test_data", - LogSoftmax.test_data, - xfails={ - "randn_mult_batches": "MLETORCH-433: Multiple batches not supported on FVP" - }, -) +@common.parametrize("test_data", LogSoftmax.test_data) @common.XfailIfNoCorstone320 def test_log_softmax_u85_INT(test_data): data, dim = test_data() diff --git a/backends/arm/test/ops/test_lt.py b/backends/arm/test/ops/test_lt.py index 86d903e3f88..98d0298b195 100644 --- a/backends/arm/test/ops/test_lt.py +++ b/backends/arm/test/ops/test_lt.py @@ -154,9 +154,6 @@ def test_lt_scalar_u55_INT_not_delegated(test_module): @common.parametrize( "test_module", test_data_tensor, - xfails={ - "lt_tensor_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", - }, ) @common.XfailIfNoCorstone320 def test_lt_tensor_u85_INT(test_module): @@ -173,9 +170,6 @@ def test_lt_tensor_u85_INT(test_module): @common.parametrize( "test_module", test_data_scalar, - xfails={ - "lt_scalar_rank4_randn": "MLETORCH-847: Boolean eq result unstable on U85", - }, ) @common.XfailIfNoCorstone320 def test_lt_scalar_u85_INT(test_module): diff --git a/backends/arm/test/ops/test_max_pool.py b/backends/arm/test/ops/test_max_pool.py index 6b75c2b7d0a..7db56311837 100644 --- a/backends/arm/test/ops/test_max_pool.py +++ 
b/backends/arm/test/ops/test_max_pool.py @@ -65,12 +65,10 @@ torch.rand(1, 16, 54, 54), [3, (1, 3), 1], ), -} - -test_data_suite_mult_batches = { "randn": lambda: (torch.randn(5, 16, 50, 32), [4, 2, 0]), } + test_data_suite_dilation = [ # Simple dilation=2 on 8x8 input, kernel=3, stride=1, no padding ("dilation2", torch.rand(1, 1, 8, 8), [3, 1, 0, 2]), @@ -161,61 +159,6 @@ def test_max_pool2d_u85_INT(test_data: torch.Tensor): ).run() -@common.parametrize("test_data", test_data_suite_mult_batches) -def test_max_pool2d_tosa_FP_mult_batches(test_data: torch.Tensor): - test_data, model_params = test_data() - pipeline = TosaPipelineFP[input_t1]( - MaxPool2d(*model_params), - (test_data,), - aten_op, - exir_op, - ) - pipeline.run() - - -@common.parametrize("test_data", test_data_suite_mult_batches) -def test_max_pool2d_tosa_INT_mult_batches(test_data: torch.Tensor): - test_data, model_params = test_data() - pipeline = TosaPipelineINT[input_t1]( - MaxPool2d(*model_params), - (test_data,), - aten_op, - exir_op, - ) - pipeline.run() - - -x_fail = {"randn": "MLETORCH-986: Numerical issues with mutli batches."} - - -@common.parametrize("test_data", test_data_suite_mult_batches, x_fail) -@common.XfailIfNoCorstone300 -def test_max_pool2d_u55_INT_mult_batches(test_data: torch.Tensor): - test_data, model_params = test_data() - EthosU55PipelineINT[input_t1]( - MaxPool2d(*model_params), - (test_data,), - aten_op, - exir_ops=[], - run_on_fvp=True, - use_to_edge_transform_and_lower=True, - ).run() - - -@common.parametrize("test_data", test_data_suite_mult_batches, x_fail) -@common.XfailIfNoCorstone320 -def test_max_pool2d_u85_INT_mult_batches(test_data: torch.Tensor): - test_data, model_params = test_data() - EthosU85PipelineINT[input_t1]( - MaxPool2d(*model_params), - (test_data,), - aten_op, - exir_op, - run_on_fvp=True, - use_to_edge_transform_and_lower=True, - ).run() - - reject_data_suite = { "reject_1": lambda: (MaxPool2d(1, 4, 0), torch.rand(1, 10, 10, 10)), "reject_2": 
lambda: (MaxPool2d((1, 257), 1, 0), torch.rand(1, 16, 5, 300)), @@ -306,34 +249,6 @@ def test_max_pool2d_vgf_INT(test_data: torch.Tensor): pipeline.run() -@common.parametrize("test_data", test_data_suite_mult_batches) -@common.SkipIfNoModelConverter -def test_max_pool2d_vgf_FP_mult_batches(test_data: torch.Tensor): - test_data, model_params = test_data() - pipeline = VgfPipeline[input_t1]( - MaxPool2d(*model_params), - (test_data,), - aten_op, - exir_op, - tosa_version="TOSA-1.0+FP", - ) - pipeline.run() - - -@common.parametrize("test_data", test_data_suite_mult_batches) -@common.SkipIfNoModelConverter -def test_max_pool2d_vgf_INT_mult_batches(test_data: torch.Tensor): - test_data, model_params = test_data() - pipeline = VgfPipeline[input_t1]( - MaxPool2d(*model_params), - (test_data,), - aten_op, - exir_op, - tosa_version="TOSA-1.0+INT", - ) - pipeline.run() - - @common.parametrize("test_data", dilation_test_data) @common.SkipIfNoModelConverter def test_max_pool2d_vgf_FP_dilation(test_data: torch.Tensor): diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index 1483b5d82b6..061e8da14f1 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -280,6 +280,7 @@ def test_mean_dim_tosa_INT(test_data): (test_data,), [], # Might be sum, avgpool, or both symmetric_io_quantization=True, + custom_path="MEANDIM", ) pipeline.run() diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py index b0b7f5f4b7d..d8f9e947ce3 100644 --- a/backends/arm/test/ops/test_mul.py +++ b/backends/arm/test/ops/test_mul.py @@ -200,15 +200,7 @@ def test_mul_tensor_u85_INT(test_data: torch.Tensor): pipeline.run() -@common.parametrize( - "test_data", - test_data_suite_int32, - xfails={ - # TODO: MLETORCH-1132 Investigate why tests with inputs that require broadcasting fail on u55/u85 - "op_mul_rank4_randn_mutltiple_broadcasts_int32": "RuntimeError: mean(): could not infer output dtype. 
Input dtype must be either a floating point or complex dtype. Got: Int", - "op_mul_rank4_randn_broadcast_int32": "RuntimeError: mean(): could not infer output dtype. Input dtype must be either a floating point or complex dtype. Got: Int", - }, -) +@common.parametrize("test_data", test_data_suite_int32) @common.XfailIfNoCorstone300 def test_mul_tensor_u55_INT_int32(test_data: torch.Tensor): pipeline = EthosU55PipelineINT[input_t1]( @@ -222,15 +214,7 @@ def test_mul_tensor_u55_INT_int32(test_data: torch.Tensor): pipeline.run() -@common.parametrize( - "test_data", - test_data_suite_int32, - xfails={ - # TODO: MLETORCH-1132 Investigate why tests with inputs that require broadcasting fail on u55/u85 - "op_mul_rank4_randn_mutltiple_broadcasts_int32": "RuntimeError: mean(): could not infer output dtype. Input dtype must be either a floating point or complex dtype. Got: Int", - "op_mul_rank4_randn_broadcast_int32": "RuntimeError: mean(): could not infer output dtype. Input dtype must be either a floating point or complex dtype. 
Got: Int", - }, -) +@common.parametrize("test_data", test_data_suite_int32) @common.XfailIfNoCorstone320 def test_mul_tensor_u85_INT_int32(test_data: torch.Tensor): pipeline = EthosU85PipelineINT[input_t1]( diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py index 57f7f9603a1..eb482bcee54 100644 --- a/backends/arm/test/ops/test_permute.py +++ b/backends/arm/test/ops/test_permute.py @@ -72,13 +72,11 @@ def test_permute_tosa_INT(test_data: torch.Tensor): pipeline.run() -x_fails = { - "rank_4_2": "AssertionError: Output 0 does not match reference output.", - "rank_4_3": "AssertionError: Output 0 does not match reference output.", -} - - -@common.parametrize("test_data", test_data_suite, x_fails) +@common.parametrize( + "test_data", + test_data_suite, + xfails={"rank_4_3": "MLETORCH-955 : Permutation numerical diff for u55"}, +) @common.XfailIfNoCorstone300 def test_permute_u55_INT(test_data): test_data, dims = test_data() @@ -92,8 +90,7 @@ def test_permute_u55_INT(test_data): pipeline.run() -# Fails since on FVP since N > 1 is not supported. MLETORCH-517 -@common.parametrize("test_data", test_data_suite, x_fails) +@common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone320 def test_permute_u85_INT(test_data: torch.Tensor): test_data, dims = test_data() diff --git a/backends/arm/test/ops/test_select.py b/backends/arm/test/ops/test_select.py index dcf5a4a181b..4c3887f1e18 100644 --- a/backends/arm/test/ops/test_select.py +++ b/backends/arm/test/ops/test_select.py @@ -102,12 +102,7 @@ def test_select_int_tosa_INT(test_data: Tuple): pipeline.run() -x_fails = { - "select4d_0_dim_2_index": "AssertionError: Output 0 does not match reference output." 
-} - - -@common.parametrize("test_data", test_data_suite, x_fails) +@common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone300 def test_select_int_u55_INT_copy(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( @@ -121,7 +116,7 @@ def test_select_int_u55_INT_copy(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", test_data_suite, x_fails) +@common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone300 def test_select_int_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( @@ -148,7 +143,7 @@ def test_select_int_u55_INT_not_delegated(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", test_data_suite, x_fails) +@common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone320 def test_select_int_u85_INT_copy(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( @@ -162,7 +157,7 @@ def test_select_int_u85_INT_copy(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", test_data_suite, x_fails) +@common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone320 def test_select_int_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( diff --git a/backends/arm/test/ops/test_sigmoid_16bit.py b/backends/arm/test/ops/test_sigmoid_16bit.py index 3d70881a3f0..4252d96bac9 100644 --- a/backends/arm/test/ops/test_sigmoid_16bit.py +++ b/backends/arm/test/ops/test_sigmoid_16bit.py @@ -103,9 +103,6 @@ def test_sigmoid_tosa_INT(test_data): @common.parametrize( "test_data", test_data_suite, - xfails={ - "ramp": "AssertionError: Output 0 does not match reference output. MLETORCH-787" - }, strict=False, ) def test_sigmoid_tosa_INT_add_sigmoid(test_data): @@ -121,14 +118,6 @@ def test_sigmoid_tosa_INT_add_sigmoid(test_data): pipeline.run() -xfails = { - "ones": "AssertionError: Output 0 does not match reference output. MLETORCH-787", - "rand": "AssertionError: Output 0 does not match reference output. 
MLETORCH-787", - "rand_4d": "AssertionError: Output 0 does not match reference output. MLETORCH-787", - "ramp": "AssertionError: Output 0 does not match reference output. MLETORCH-787", -} - - @common.parametrize( "test_data", test_data_suite, diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index 4bbd4d83285..dc258f20ec4 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -64,8 +64,8 @@ def test_softmax_tosa_INT(test_data): @common.parametrize( "test_data", Softmax.test_data, - xfails={ - "randn_mult_batches": "MLETORCH-433: Multiple batches not supported on FVP" + { + "randn_neg_dim": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55." }, ) @common.XfailIfNoCorstone300 @@ -77,13 +77,7 @@ def test_softmax_u55_INT(test_data): pipeline.run() -@common.parametrize( - "test_data", - Softmax.test_data, - xfails={ - "randn_mult_batches": "MLETORCH-433: Multiple batches not supported on FVP" - }, -) +@common.parametrize("test_data", Softmax.test_data) @common.XfailIfNoCorstone320 def test_softmax_u85_INT(test_data): data, dim = test_data() diff --git a/backends/arm/test/ops/test_sqrt.py b/backends/arm/test/ops/test_sqrt.py index 00ec1f48af8..15e2dd45322 100644 --- a/backends/arm/test/ops/test_sqrt.py +++ b/backends/arm/test/ops/test_sqrt.py @@ -40,11 +40,6 @@ def forward(self, x): } -fvp_xfails = { - "sqrt_tensor_rank4_multibatch": "MLETORCH-517 : Multiple batches not supported", -} - - @common.parametrize("test_data", Sqrt.test_data) def test_sqrt_tosa_FP(test_data: Sqrt.input_t): pipeline = TosaPipelineFP[Sqrt.input_t]( @@ -67,7 +62,7 @@ def test_sqrt_tosa_INT(test_data: Sqrt.input_t): pipeline.run() -@common.parametrize("test_data", Sqrt.test_data, fvp_xfails) +@common.parametrize("test_data", Sqrt.test_data) @common.XfailIfNoCorstone300 def test_sqrt_u55_INT(test_data: Sqrt.input_t): pipeline = EthosU55PipelineINT[Sqrt.input_t]( @@ -80,7 
+75,7 @@ def test_sqrt_u55_INT(test_data: Sqrt.input_t): pipeline.run() -@common.parametrize("test_data", Sqrt.test_data, fvp_xfails) +@common.parametrize("test_data", Sqrt.test_data) @common.XfailIfNoCorstone320 def test_sqrt_u85_INT(test_data: Sqrt.input_t): pipeline = EthosU85PipelineINT[Sqrt.input_t]( diff --git a/backends/arm/test/ops/test_sub.py b/backends/arm/test/ops/test_sub.py index e89fee04b62..ab6612393b8 100644 --- a/backends/arm/test/ops/test_sub.py +++ b/backends/arm/test/ops/test_sub.py @@ -31,8 +31,6 @@ "zeros": lambda: (torch.zeros(10),), } -fvp_sub_xfails = {"rand_4D_2x3x4x5": "MLETORCH-517 : Multiple batches not supported"} - # Two-input subtraction (x - y) sub2_test_data = { "rand_2D_4x4": lambda: (torch.rand(4, 4), torch.rand(4, 4)), @@ -46,7 +44,6 @@ "rand_3d_rand_Scalar": lambda: (torch.rand(1, 6, 2), torch.rand(1)), "rand_3d_Scalar": lambda: (torch.rand(1, 6, 2), 1), } -fvp_sub2_xfails = {"rand_4D_2x2x4x4": "MLETORCH-517 : Multiple batches not supported"} class Sub(torch.nn.Module): @@ -111,7 +108,7 @@ def test_sub_tensor_tosa_INT_2(test_data: Tuple[torch.Tensor, torch.Tensor]): pipeline.run() -@common.parametrize("test_data", sub_test_data, fvp_sub_xfails) +@common.parametrize("test_data", sub_test_data) @common.XfailIfNoCorstone300 def test_sub_tensor_u55_INT(test_data): """Test Subtraction on Ethos-U55 (FVP Mode)""" @@ -125,7 +122,7 @@ def test_sub_tensor_u55_INT(test_data): pipeline.run() -@common.parametrize("test_data", sub2_test_data, fvp_sub2_xfails) +@common.parametrize("test_data", sub2_test_data) @common.XfailIfNoCorstone300 def test_sub_tensor_u55_INT_2(test_data: Tuple[torch.Tensor, torch.Tensor]): """Test Two-Operand Subtraction on Ethos-U55 (FVP Mode)""" @@ -139,7 +136,7 @@ def test_sub_tensor_u55_INT_2(test_data: Tuple[torch.Tensor, torch.Tensor]): pipeline.run() -@common.parametrize("test_data", sub_test_data, fvp_sub_xfails) +@common.parametrize("test_data", sub_test_data) @common.XfailIfNoCorstone320 def 
test_sub_tensor_u85_INT_2(test_data): """Test Subtraction on Ethos-U85 (FVP Mode)""" @@ -153,7 +150,7 @@ def test_sub_tensor_u85_INT_2(test_data): pipeline.run() -@common.parametrize("test_data", sub2_test_data, fvp_sub2_xfails) +@common.parametrize("test_data", sub2_test_data) @common.XfailIfNoCorstone320 def test_sub_tensor_u85_INT(test_data: Tuple[torch.Tensor, torch.Tensor]): """Test Two-Operand Subtraction on Ethos-U85 (FVP Mode)""" diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py index 71cb2ed73bb..0aa6f9a0245 100644 --- a/backends/arm/test/ops/test_view.py +++ b/backends/arm/test/ops/test_view.py @@ -82,22 +82,7 @@ def test_view_tosa_INT(test_data: Tuple): pipeline.run() -xfails = { - "rand_4d_neg": "MLETORCH-517: Multiple batches not supported", - "rand_4d_4d_small": "MLETORCH-517: Multiple batches not supported", - "rand_4d_4d": "MLETORCH-517: Multiple batches not supported", - "rand_4d_2d": "MLETORCH-517: Multiple batches not supported", - "rand_4d_3d": "MLETORCH-517: Multiple batches not supported", - "rand_4d_1": "MLETORCH-517: Multiple batches not supported", - "rand_4d_2": "MLETORCH-517: Multiple batches not supported", - "rand_4d_2_4_big": "MLETORCH-517: Multiple batches not supported", - "rand_4d_4_3": "MLETORCH-517: Multiple batches not supported", - "rand_4d_4_2": "MLETORCH-517: Multiple batches not supported", - "rand_4d_2_4_same": "MLETORCH-517: Multiple batches not supported", -} - - -@common.parametrize("test_data", View.needs_transpose_tests, xfails=xfails) +@common.parametrize("test_data", View.needs_transpose_tests) @common.XfailIfNoCorstone300 def test_view_u55_INT(test_data: Tuple): test_tensor, new_shape = test_data() @@ -136,7 +121,7 @@ def test_view_vgf_INT(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", View.rank_product_too_large, xfails=xfails) +@common.parametrize("test_data", View.rank_product_too_large) @common.XfailIfNoCorstone300 def 
test_view_u55_INT_not_delegated(test_data: Tuple): test_tensor, new_shape = test_data() @@ -151,7 +136,7 @@ def test_view_u55_INT_not_delegated(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", View.needs_transpose_tests, xfails=xfails) +@common.parametrize("test_data", View.needs_transpose_tests) @common.XfailIfNoCorstone320 def test_view_u85_INT(test_data: Tuple): test_tensor, new_shape = test_data() diff --git a/backends/arm/test/passes/test_rescale_pass.py b/backends/arm/test/passes/test_rescale_pass.py index 7ede72d9c4d..ae6c414e884 100644 --- a/backends/arm/test/passes/test_rescale_pass.py +++ b/backends/arm/test/passes/test_rescale_pass.py @@ -172,7 +172,14 @@ def test_quantized_rescale_tosa_bi(test_data: tuple[torch.Tensor, torch.Tensor]) pipeline.run() -@common.parametrize("test_data", RescaleNetwork.test_data) +u55_xfails = { + "ones": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55.", + "randn_ones": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55.", + "randn_large": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55.", +} + + +@common.parametrize("test_data", RescaleNetwork.test_data, xfails=u55_xfails) @common.XfailIfNoCorstone300 def test_quantized_rescale_u55(test_data: tuple[torch.Tensor, torch.Tensor]): """Tests a model with many ops that requires rescales. As more ops are quantized to int32 and diff --git a/backends/arm/test/passes/test_to_tosa_memory_format.py b/backends/arm/test/passes/test_to_tosa_memory_format.py new file mode 100644 index 00000000000..1e9b8ffc63d --- /dev/null +++ b/backends/arm/test/passes/test_to_tosa_memory_format.py @@ -0,0 +1,192 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes import ToTosaMemoryFormatPass + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + PassPipeline, + TosaPipelineINT, +) +from executorch.backends.transforms.remove_getitem_op import RemoveGetItemPass + +input_t = Tuple[torch.Tensor] # Input x + + +class NoNHWC(torch.nn.Module): + """ + Test-module with no ops requiring NHWC memory format. + """ + + ops_after_pass = {"executorch_exir_dialects_backend__ops_tosa_TRANSPOSE_default": 2} + ops_not_after_pass = [] + + def forward(self, x): + x = x + x + return x + + def get_inputs(self): + return (torch.rand(1, 2, 2, 2),) + + +class ParallelClusters(torch.nn.Module): + """ + Test-module with multiple parallel clusters of nodes requiring different memory formats. + """ + + ops_after_pass = {"executorch_exir_dialects_backend__ops_tosa_TRANSPOSE_default": 2} + ops_not_after_pass = [] + + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d( + in_channels=2, + out_channels=2, + kernel_size=1, + bias=True, + ) + self.maxpool = torch.nn.MaxPool2d(1, 1) + self.avgpool = torch.nn.AvgPool2d(1, 1) + + def forward(self, x): + x1 = self.conv(x) + x2 = self.maxpool(x) + x3 = self.avgpool(x) + x4 = x * x + return x1 + x2 + x3 + x4 + + def get_inputs(self): + return (torch.rand(1, 2, 2, 2),) + + +class SerialClusters(torch.nn.Module): + """ + Test-module with multiple serial clusters of nodes requiring different memory formats.
+ """ + + ops_before_pass = {} + ops_after_pass = {"executorch_exir_dialects_backend__ops_tosa_TRANSPOSE_default": 4} + ops_not_after_pass = [] + + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d( + in_channels=2, + out_channels=2, + kernel_size=1, + bias=True, + ) + self.fc = torch.nn.Linear( + in_features=2, + out_features=2, + bias=True, + ) + + def forward(self, x): + x = self.conv(x) + x = x * x + x = self.conv(x) + x = x.view((2, 1, 2, 4)) + x = x * 2 + x = x.view((2, 2, 2, 2)) + x = self.conv(x) + return x + + def get_inputs(self): + return (torch.rand(2, 2, 2, 2),) + + +class Reshapes(torch.nn.Module): + """ + Test-module with different configurations of views requiring different memory formats. + """ + + ops_before_pass = {} + ops_after_pass = { + "executorch_exir_dialects_backend__ops_tosa_TRANSPOSE_default": 16 + } + ops_not_after_pass = [] + + def __init__(self): + super().__init__() + self.maxpool = torch.nn.MaxPool2d(1, 1) # Use maxpool to force NHWC format + + def forward(self, x): + + x = self.maxpool(x) + x = x.view((2, 2, 4, 16, 1)) # N-C-HW-invariant intact, no transposes needed + x = x * 2 # Add op to avoid views merging + x = x.view((4, 4, 4, 4)) + x = x / 2 # Add op to avoid views merging + x = self.maxpool(x) + + x = x.view((256)) # Break N-C-HW invariant + x = x * 2 + x = x.view((4, 4, 4, 4)) + x = x / 2 + x = self.maxpool(x) + + x = x.view((16, 16)) # Break N-C-HW invariant + x = x * 2 + x = x.view((4, 4, 4, 4)) + x = x / 2 + x = self.maxpool(x) + + x = x.view((16, 4, 4)) # Break N-C-HW invariant + x = x * 2 + x = x.view((4, 4, 4, 4)) + x = x / 2 + x = self.maxpool(x) + + x = x.view((2, 4, 4, 8)) # Break N-C-HW invariant + x = x * 2 + x = x.view((4, 4, 4, 4)) + x = x / 2 + x = self.maxpool(x) + + x = x.view((8, 1, 2, 4, 4)) # Break N-C-HW invariant + x = x * 2 + x = x.view((4, 4, 4, 4)) + x = self.maxpool(x) + + return x + + def get_inputs(self): + return (torch.rand(4, 4, 4, 4),) + + +modules = { + "no_nhwc": 
NoNHWC(), + "parallel_clusters": ParallelClusters(), + "serial_clusters": SerialClusters(), + "reshapes": Reshapes(), +} + + +@common.parametrize("module", modules) +def test_to_tosa_memory_format_tosa_INT(module): + # We cannot check op counts after a specific pass with the full pipeline + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(), + ops_after_pass=module.ops_after_pass, + ops_not_after_pass=module.ops_not_after_pass, + pass_list=[RemoveGetItemPass], + passes_with_exported_program=[ToTosaMemoryFormatPass], + ) + pipeline.pop_stage( + "run_method_and_compare_outputs" + ) # Eager execution is not possible after introducing tosa.TRANSPOSE + pipeline.run() + + +@common.parametrize("module", modules) +def test_to_tosa_memory_format_tosa_INT_functional(module): + # Also run the actual pass pipeline to ensure functional correctness. + pipeline = TosaPipelineINT[input_t](module, module.get_inputs(), []) + pipeline.run() diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 4335e96c730..523a3f30a54 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -667,7 +667,6 @@ def run_tosa_graph( ) -> list[torch.Tensor]: """Runs the TOSA reference model with inputs and returns the result.""" inputs_np = [input.numpy() for input in inputs] - transpose_data_format(inputs_np, to="NHWC") if isinstance(tosa_version, Tosa_1_00): import tosa_reference_model as reference_model @@ -689,24 +688,9 @@ def run_tosa_graph( status == reference_model.GraphStatus.TOSA_VALID ), "Non-valid TOSA given to reference model." 
- transpose_data_format(outputs_np, to="NCHW") return [torch.from_numpy(output) for output in outputs_np] -def transpose_data_format(data: list[np.ndarray], to: Literal["NHWC", "NCHW"]): - for i in range(len(data)): - if hasattr(data[i], "shape") and data[i].ndim in (4, 5): - match to: - case "NCHW": - dim_order = (0, 3, 1, 2) if data[i].ndim == 4 else (0, 1, 4, 2, 3) - case "NHWC": - dim_order = (0, 2, 3, 1) if data[i].ndim == 4 else (0, 1, 3, 4, 2) - case _: - raise NotImplementedError(f"Cant transpose to dim order {to}") - # Copy is needed to force actual data conversion, not setting stride. - data[i] = np.transpose(data[i], dim_order).copy() - - def get_target_board(compile_spec: list[CompileSpec]) -> str | None: if is_vgf(compile_spec): return "vkml_emulation_layer"