From 1f60bb76deaed6fe2afc4df08377ac3a736b55ab Mon Sep 17 00:00:00 2001 From: Zuby Afzal <65686164+keyprocedure@users.noreply.github.com> Date: Tue, 29 Jul 2025 11:44:18 -0700 Subject: [PATCH 01/10] Add clone_dim_order kernel (portable + ATen) and layout conversion tests --- exir/tests/test_memory_format_ops_pass.py | 32 ++++++++ kernels/aten/cpu/op__clone_dim_order.cpp | 62 +++++++++++++++ kernels/aten/cpu/util/copy_ops_util.cpp | 22 ++++++ kernels/aten/cpu/util/copy_ops_util.h | 3 + kernels/aten/edge_dialect_aten_op.yaml | 5 ++ kernels/portable/cpu/op__clone_dim_order.cpp | 80 ++++++++++++++++++++ kernels/portable/cpu/util/copy_ops_util.h | 24 ++++++ kernels/portable/functions.yaml | 5 ++ 8 files changed, 233 insertions(+) create mode 100644 kernels/aten/cpu/op__clone_dim_order.cpp create mode 100644 kernels/portable/cpu/op__clone_dim_order.cpp diff --git a/exir/tests/test_memory_format_ops_pass.py b/exir/tests/test_memory_format_ops_pass.py index 84cd0faa485..f6fa22f5c1c 100644 --- a/exir/tests/test_memory_format_ops_pass.py +++ b/exir/tests/test_memory_format_ops_pass.py @@ -28,6 +28,8 @@ MemoryFormatOpsPassTestUtils, MemoryFormatTestSet, PropagateToCopyChannalsLastModule, + SimpleCloneChannelsLastModule, + SimpleCloneContiguousModule, SimpleEmptyChannelLastModule, SimpleEmptyContiguoustModule, SimpleToCopyChannelsLastModule, @@ -91,6 +93,36 @@ def test_op_empty_replacement_contiguous(self) -> None: ), ) + def test_op_clone_replacement_contiguous(self) -> None: + model = SimpleCloneContiguousModule() + MemoryFormatOpsPassTestUtils.memory_format_test_runner( + self, + MemoryFormatTestSet( + module=model.eval(), + op=torch.ops.aten.clone.default, + sample_input=( + torch.randn((3, 4, 5, 6)).to(memory_format=torch.channels_last), + ), + target_memory_format=torch.contiguous_format, + _load_for_executorch_from_buffer=_load_for_executorch_from_buffer, + ), + ) + + def test_op_clone_replacement_channels_last(self) -> None: + model = SimpleCloneChannelsLastModule() + 
MemoryFormatOpsPassTestUtils.memory_format_test_runner( + self, + MemoryFormatTestSet( + module=model.eval(), + op=torch.ops.aten.clone.default, + sample_input=( + torch.randn((3, 4, 5, 6)).to(memory_format=torch.contiguous_format), + ), + target_memory_format=torch.channels_last, + _load_for_executorch_from_buffer=_load_for_executorch_from_buffer, + ), + ) + def test_op_dim_order_update(self) -> None: MemoryFormatOpsPassTestUtils.memory_format_test_runner( self, diff --git a/kernels/aten/cpu/op__clone_dim_order.cpp b/kernels/aten/cpu/op__clone_dim_order.cpp new file mode 100644 index 00000000000..7f2fd658f41 --- /dev/null +++ b/kernels/aten/cpu/op__clone_dim_order.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = executorch::aten::Tensor; +using MemoryFormat = executorch::aten::MemoryFormat; + +template +using OptionalArrayRef = executorch::aten::OptionalArrayRef; + +template +using Optional = std::optional; + +/** + * _clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]? + * dim_order=None, Tensor(a!) out) -> Tensor(a!) + * + * Clones with explicit dim_order, using the corresponding memory format. + */ +Tensor& _clone_dim_order_out( + KernelRuntimeContext& ctx, + const Tensor& self, + bool non_blocking, + OptionalArrayRef dim_order, + Tensor& out) { + // Ensure output has the same layout as input or matches dim_order. 
+ ET_KERNEL_CHECK( + ctx, + check__to_dim_order_copy_args(self, non_blocking, dim_order, out), + InvalidArgument, + out); + + Optional memory_format = get_memory_format(dim_order); + at::clone_outf(self, memory_format, out); + + return out; +} + +Tensor& _clone_dim_order_out( + const Tensor& self, + bool non_blocking, + OptionalArrayRef dim_order, + Tensor& out) { + KernelRuntimeContext ctx{}; + return _clone_dim_order_out(ctx, self, non_blocking, dim_order, out); +} + +} // namespace native +} // namespace executor +} // namespace torch \ No newline at end of file diff --git a/kernels/aten/cpu/util/copy_ops_util.cpp b/kernels/aten/cpu/util/copy_ops_util.cpp index 0fe5342ca39..157534aaa23 100644 --- a/kernels/aten/cpu/util/copy_ops_util.cpp +++ b/kernels/aten/cpu/util/copy_ops_util.cpp @@ -15,6 +15,28 @@ namespace torch { namespace executor { using Tensor = executorch::aten::Tensor; +using MemoryFormat = executorch::aten::MemoryFormat; + +/** + * Determines the memory format (Contiguous or ChannelsLast) corresponding to + * the dim_order. Provides support for bridging torch.memory_format with + * ExecuTorch's dim_order. 
+ */ +std::optional get_memory_format( + executorch::aten::OptionalArrayRef dim_order) { + if (!dim_order.has_value()) { + return executorch::aten::nullopt; + } + if (is_contiguous_dim_order( + dim_order.value().data(), dim_order.value().size())) { + return MemoryFormat::Contiguous; + } else if (is_channels_last_dim_order( + dim_order.value().data(), dim_order.value().size())) { + return MemoryFormat::ChannelsLast; + } else { + ET_ASSERT_UNREACHABLE(); + } +} bool check__to_dim_order_copy_args( const Tensor& input, diff --git a/kernels/aten/cpu/util/copy_ops_util.h b/kernels/aten/cpu/util/copy_ops_util.h index dd9c50123db..c3aa2fd3205 100644 --- a/kernels/aten/cpu/util/copy_ops_util.h +++ b/kernels/aten/cpu/util/copy_ops_util.h @@ -13,6 +13,9 @@ namespace torch { namespace executor { +std::optional get_memory_format( + executorch::aten::OptionalArrayRef dim_order); + bool check__to_dim_order_copy_args( const Tensor& input, bool non_blocking, diff --git a/kernels/aten/edge_dialect_aten_op.yaml b/kernels/aten/edge_dialect_aten_op.yaml index d9de3f6dded..3ea81d34134 100644 --- a/kernels/aten/edge_dialect_aten_op.yaml +++ b/kernels/aten/edge_dialect_aten_op.yaml @@ -11,3 +11,8 @@ kernels: - arg_meta: null kernel_name: torch::executor::_to_dim_order_copy_out + +- func: dim_order_ops::_clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: torch::executor::_clone_dim_order_out \ No newline at end of file diff --git a/kernels/portable/cpu/op__clone_dim_order.cpp b/kernels/portable/cpu/op__clone_dim_order.cpp new file mode 100644 index 00000000000..83045768cf2 --- /dev/null +++ b/kernels/portable/cpu/op__clone_dim_order.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = executorch::aten::Tensor; + +template +using OptionalArrayRef = executorch::aten::OptionalArrayRef; + +/** + * _clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]? + * dim_order=None, Tensor(a!) out) -> Tensor(a!) + * + * Clones via element-wise copy while preserving dim_order. + */ +Tensor& _clone_dim_order_out( + KernelRuntimeContext& ctx, + const Tensor& self, + bool non_blocking, + OptionalArrayRef dim_order, + Tensor& out) { + (void)ctx; + + // Ensure input and output dtype match. + ET_KERNEL_CHECK( + ctx, self.scalar_type() == out.scalar_type(), InvalidArgument, out); + + // Ensure output has the same layout as input or matches dim_order. + ET_KERNEL_CHECK( + ctx, + check__to_dim_order_copy_args(self, non_blocking, dim_order, out), + InvalidArgument, + out); + + // Ensure input and output shapes match, resizing if necessary. + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, self.sizes()) == torch::executor::Error::Ok, + InvalidArgument, + out); + + if (self.numel() == 0) { + return out; + } + + // Select the correct input dtype and copy the tensors. 
+ ET_SWITCH_REALHBBF16_TYPES( + self.scalar_type(), + ctx, + "dim_order_ops::_clone_dim_order.out", + CTYPE, + [&] { _to_dim_order_copy_impl(self, out); }); + + return out; +} + +Tensor& _clone_dim_order_out( + const Tensor& self, + bool non_blocking, + OptionalArrayRef dim_order, + Tensor& out) { + executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{}; + return _clone_dim_order_out(context, self, non_blocking, dim_order, out); +} + +} // namespace native +} // namespace executor +} // namespace torch \ No newline at end of file diff --git a/kernels/portable/cpu/util/copy_ops_util.h b/kernels/portable/cpu/util/copy_ops_util.h index e7cd6f6790c..15a7916e0e8 100644 --- a/kernels/portable/cpu/util/copy_ops_util.h +++ b/kernels/portable/cpu/util/copy_ops_util.h @@ -9,6 +9,7 @@ #pragma once #include +#include #include namespace torch { @@ -77,6 +78,29 @@ void as_strided_copy( } } +/** + * Copies and casts a tensor while preserving input dim_order. + */ +template +void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) { + auto self_data = self.mutable_data_ptr(); + auto out_data = out.mutable_data_ptr(); + + // Here we make a slightly off-label use of + // BroadcastIndexesRange. It always assumes it doesn't have to care + // about different dim_order between input and output, but we can + // just force it to respect strides (and thus dim_order) for its + // inputs using support_noncontiguous_input_tensors=true, and then pretend + // the output is just another input. 
+ for (const auto [unused_index, self_data_index, out_data_index] : + BroadcastIndexesRange<2, /*support_noncontiguous_input_tensors=*/true>( + /*dummy output*/ self, self, out)) { + (void)unused_index; + out_data[out_data_index] = + static_cast(self_data[self_data_index]); + } +} + bool check_cat_args( executorch::aten::ArrayRef tensors, int64_t dim, diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index feaee415f91..cb04241096f 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -1009,3 +1009,8 @@ kernels: - arg_meta: null kernel_name: torch::executor::_to_dim_order_copy_out + +- func: dim_order_ops::_clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: torch::executor::_clone_dim_order_out \ No newline at end of file From beeeebd34fc8de29af86b623170c38c7e96e5c53 Mon Sep 17 00:00:00 2001 From: Zuby Afzal <65686164+keyprocedure@users.noreply.github.com> Date: Tue, 29 Jul 2025 11:56:35 -0700 Subject: [PATCH 02/10] Remove duplicate helper functions (moved to utils) --- kernels/aten/cpu/op__to_dim_order_copy.cpp | 66 +------------------ .../portable/cpu/op__to_dim_order_copy.cpp | 23 ------- 2 files changed, 1 insertion(+), 88 deletions(-) diff --git a/kernels/aten/cpu/op__to_dim_order_copy.cpp b/kernels/aten/cpu/op__to_dim_order_copy.cpp index 0ed10f69d5a..78472115bdf 100644 --- a/kernels/aten/cpu/op__to_dim_order_copy.cpp +++ b/kernels/aten/cpu/op__to_dim_order_copy.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include @@ -25,71 +26,6 @@ using OptionalArrayRef = executorch::aten::OptionalArrayRef; template using Optional = std::optional; -namespace { -Optional get_memory_format(OptionalArrayRef dim_order) { - if (!dim_order.has_value()) { - return executorch::aten::nullopt; - } - if (is_contiguous_dim_order( - dim_order.value().data(), dim_order.value().size())) { - return MemoryFormat::Contiguous; - } else if (is_channels_last_dim_order( - dim_order.value().data(), dim_order.value().size())) { - return MemoryFormat::ChannelsLast; - } else { - ET_ASSERT_UNREACHABLE(); - } -} - -bool check__to_dim_order_copy_args( - const Tensor& input, - bool non_blocking, - executorch::aten::OptionalArrayRef dim_order, - Tensor& out) { - // Right now we only support blocking data transfer - ET_LOG_AND_RETURN_IF_FALSE(non_blocking == false); - - // dim_order is set, the target dim_order will be either contiguous or - // channels_last memory format - if (dim_order.has_value()) { - executorch::aten::ArrayRef dim_order_ref = dim_order.value(); - - // dim order size shall equal to input dim - ET_LOG_AND_RETURN_IF_FALSE(dim_order_ref.size() == input.dim()); - - ET_LOG_AND_RETURN_IF_FALSE( - is_channels_last_dim_order( - dim_order.value().data(), dim_order.value().size()) || - is_contiguous_dim_order( - dim_order.value().data(), dim_order.value().size())); - - // Out Aten tensor shall have same memory format stride as dim_order - const size_t kMaxNumOfDimensions = 16; - ET_LOG_AND_RETURN_IF_FALSE(kMaxNumOfDimensions >= out.dim()); - executorch::aten::StridesType target_strides[kMaxNumOfDimensions]; - dim_order_to_stride_nocheck( - out.sizes().data(), - dim_order_ref.data(), - dim_order_ref.size(), - target_strides); - ET_LOG_AND_RETURN_IF_FALSE(out.dim() == dim_order_ref.size()); - for (size_t i = 0; i < dim_order_ref.size(); i++) { - ET_LOG_AND_RETURN_IF_FALSE(target_strides[i] == out.strides()[i]); - } - - } else { // dim_order is not set, preserve the dim order of 
input - - auto out_strides = out.strides(); - auto input_strides = input.strides(); - ET_LOG_AND_RETURN_IF_FALSE(input_strides.size() == out_strides.size()); - for (size_t i = 0; i < input_strides.size(); i++) { - ET_LOG_AND_RETURN_IF_FALSE(input_strides[i] == out_strides[i]); - } - } - return true; -} -} // namespace - // _to_dim_order_copy.out(Tensor self, *, bool non_blocking=False, int[]? // dim_order=None, Tensor(a!) out) -> Tensor(a!) Tensor& _to_dim_order_copy_out( diff --git a/kernels/portable/cpu/op__to_dim_order_copy.cpp b/kernels/portable/cpu/op__to_dim_order_copy.cpp index fb47ff7b6ef..b6e35f90cdb 100644 --- a/kernels/portable/cpu/op__to_dim_order_copy.cpp +++ b/kernels/portable/cpu/op__to_dim_order_copy.cpp @@ -29,29 +29,6 @@ using OptionalArrayRef = executorch::aten::OptionalArrayRef; template using Optional = std::optional; -namespace { - -template -void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) { - auto self_data = self.mutable_data_ptr(); - auto out_data = out.mutable_data_ptr(); - - // Here we make a slightly off-label use of - // BroadcastIndexesRange. It always assumes it doesn't have to care - // about different dim_order between input and output, but we can - // just force it to respect strides (and thus dim_order) for its - // inputs using support_noncontiguous_input_tensors=true, and then pretend - // the output is just another input. - for (const auto [unused_index, self_data_index, out_data_index] : - BroadcastIndexesRange<2, /*support_noncontiguous_input_tensors=*/true>( - /*dummy output*/ self, self, out)) { - (void)unused_index; - out_data[out_data_index] = - static_cast(self_data[self_data_index]); - } -} -} // namespace - // _to_dim_order_copy.out(Tensor self, *, bool non_blocking=False, int[]? // dim_order=None, Tensor(a!) out) -> Tensor(a!) 
Tensor& _to_dim_order_copy_out( From f8d18ec09b5c005118994ce9930f7f91182c2731 Mon Sep 17 00:00:00 2001 From: Zuby Afzal <65686164+keyprocedure@users.noreply.github.com> Date: Sat, 2 Aug 2025 18:31:21 -0700 Subject: [PATCH 03/10] Revert ATen _clone_dim_order implementation and refactoring --- kernels/aten/cpu/op__clone_dim_order.cpp | 62 -------------------- kernels/aten/cpu/op__to_dim_order_copy.cpp | 66 +++++++++++++++++++++- kernels/aten/cpu/util/copy_ops_util.cpp | 22 -------- kernels/aten/cpu/util/copy_ops_util.h | 3 - kernels/aten/edge_dialect_aten_op.yaml | 5 -- 5 files changed, 65 insertions(+), 93 deletions(-) delete mode 100644 kernels/aten/cpu/op__clone_dim_order.cpp diff --git a/kernels/aten/cpu/op__clone_dim_order.cpp b/kernels/aten/cpu/op__clone_dim_order.cpp deleted file mode 100644 index 7f2fd658f41..00000000000 --- a/kernels/aten/cpu/op__clone_dim_order.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -namespace torch { -namespace executor { -namespace native { - -using Tensor = executorch::aten::Tensor; -using MemoryFormat = executorch::aten::MemoryFormat; - -template -using OptionalArrayRef = executorch::aten::OptionalArrayRef; - -template -using Optional = std::optional; - -/** - * _clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]? - * dim_order=None, Tensor(a!) out) -> Tensor(a!) - * - * Clones with explicit dim_order, using the corresponding memory format. - */ -Tensor& _clone_dim_order_out( - KernelRuntimeContext& ctx, - const Tensor& self, - bool non_blocking, - OptionalArrayRef dim_order, - Tensor& out) { - // Ensure output has the same layout as input or matches dim_order. 
- ET_KERNEL_CHECK( - ctx, - check__to_dim_order_copy_args(self, non_blocking, dim_order, out), - InvalidArgument, - out); - - Optional memory_format = get_memory_format(dim_order); - at::clone_outf(self, memory_format, out); - - return out; -} - -Tensor& _clone_dim_order_out( - const Tensor& self, - bool non_blocking, - OptionalArrayRef dim_order, - Tensor& out) { - KernelRuntimeContext ctx{}; - return _clone_dim_order_out(ctx, self, non_blocking, dim_order, out); -} - -} // namespace native -} // namespace executor -} // namespace torch \ No newline at end of file diff --git a/kernels/aten/cpu/op__to_dim_order_copy.cpp b/kernels/aten/cpu/op__to_dim_order_copy.cpp index 78472115bdf..0ed10f69d5a 100644 --- a/kernels/aten/cpu/op__to_dim_order_copy.cpp +++ b/kernels/aten/cpu/op__to_dim_order_copy.cpp @@ -6,7 +6,6 @@ * LICENSE file in the root directory of this source tree. */ -#include #include #include @@ -26,6 +25,71 @@ using OptionalArrayRef = executorch::aten::OptionalArrayRef; template using Optional = std::optional; +namespace { +Optional get_memory_format(OptionalArrayRef dim_order) { + if (!dim_order.has_value()) { + return executorch::aten::nullopt; + } + if (is_contiguous_dim_order( + dim_order.value().data(), dim_order.value().size())) { + return MemoryFormat::Contiguous; + } else if (is_channels_last_dim_order( + dim_order.value().data(), dim_order.value().size())) { + return MemoryFormat::ChannelsLast; + } else { + ET_ASSERT_UNREACHABLE(); + } +} + +bool check__to_dim_order_copy_args( + const Tensor& input, + bool non_blocking, + executorch::aten::OptionalArrayRef dim_order, + Tensor& out) { + // Right now we only support blocking data transfer + ET_LOG_AND_RETURN_IF_FALSE(non_blocking == false); + + // dim_order is set, the target dim_order will be either contiguous or + // channels_last memory format + if (dim_order.has_value()) { + executorch::aten::ArrayRef dim_order_ref = dim_order.value(); + + // dim order size shall equal to input dim + 
ET_LOG_AND_RETURN_IF_FALSE(dim_order_ref.size() == input.dim()); + + ET_LOG_AND_RETURN_IF_FALSE( + is_channels_last_dim_order( + dim_order.value().data(), dim_order.value().size()) || + is_contiguous_dim_order( + dim_order.value().data(), dim_order.value().size())); + + // Out Aten tensor shall have same memory format stride as dim_order + const size_t kMaxNumOfDimensions = 16; + ET_LOG_AND_RETURN_IF_FALSE(kMaxNumOfDimensions >= out.dim()); + executorch::aten::StridesType target_strides[kMaxNumOfDimensions]; + dim_order_to_stride_nocheck( + out.sizes().data(), + dim_order_ref.data(), + dim_order_ref.size(), + target_strides); + ET_LOG_AND_RETURN_IF_FALSE(out.dim() == dim_order_ref.size()); + for (size_t i = 0; i < dim_order_ref.size(); i++) { + ET_LOG_AND_RETURN_IF_FALSE(target_strides[i] == out.strides()[i]); + } + + } else { // dim_order is not set, preserve the dim order of input + + auto out_strides = out.strides(); + auto input_strides = input.strides(); + ET_LOG_AND_RETURN_IF_FALSE(input_strides.size() == out_strides.size()); + for (size_t i = 0; i < input_strides.size(); i++) { + ET_LOG_AND_RETURN_IF_FALSE(input_strides[i] == out_strides[i]); + } + } + return true; +} +} // namespace + // _to_dim_order_copy.out(Tensor self, *, bool non_blocking=False, int[]? // dim_order=None, Tensor(a!) out) -> Tensor(a!) Tensor& _to_dim_order_copy_out( diff --git a/kernels/aten/cpu/util/copy_ops_util.cpp b/kernels/aten/cpu/util/copy_ops_util.cpp index 157534aaa23..0fe5342ca39 100644 --- a/kernels/aten/cpu/util/copy_ops_util.cpp +++ b/kernels/aten/cpu/util/copy_ops_util.cpp @@ -15,28 +15,6 @@ namespace torch { namespace executor { using Tensor = executorch::aten::Tensor; -using MemoryFormat = executorch::aten::MemoryFormat; - -/** - * Determines the memory format (Contiguous or ChannelsLast) corresponding to - * the dim_order. Provides support for bridging torch.memory_format with - * ExecuTorch's dim_order. 
- */ -std::optional get_memory_format( - executorch::aten::OptionalArrayRef dim_order) { - if (!dim_order.has_value()) { - return executorch::aten::nullopt; - } - if (is_contiguous_dim_order( - dim_order.value().data(), dim_order.value().size())) { - return MemoryFormat::Contiguous; - } else if (is_channels_last_dim_order( - dim_order.value().data(), dim_order.value().size())) { - return MemoryFormat::ChannelsLast; - } else { - ET_ASSERT_UNREACHABLE(); - } -} bool check__to_dim_order_copy_args( const Tensor& input, diff --git a/kernels/aten/cpu/util/copy_ops_util.h b/kernels/aten/cpu/util/copy_ops_util.h index c3aa2fd3205..dd9c50123db 100644 --- a/kernels/aten/cpu/util/copy_ops_util.h +++ b/kernels/aten/cpu/util/copy_ops_util.h @@ -13,9 +13,6 @@ namespace torch { namespace executor { -std::optional get_memory_format( - executorch::aten::OptionalArrayRef dim_order); - bool check__to_dim_order_copy_args( const Tensor& input, bool non_blocking, diff --git a/kernels/aten/edge_dialect_aten_op.yaml b/kernels/aten/edge_dialect_aten_op.yaml index 3ea81d34134..d9de3f6dded 100644 --- a/kernels/aten/edge_dialect_aten_op.yaml +++ b/kernels/aten/edge_dialect_aten_op.yaml @@ -11,8 +11,3 @@ kernels: - arg_meta: null kernel_name: torch::executor::_to_dim_order_copy_out - -- func: dim_order_ops::_clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!) 
- kernels: - - arg_meta: null - kernel_name: torch::executor::_clone_dim_order_out \ No newline at end of file From 62bced0d876e549eb1f048d8d12e2695f91664d6 Mon Sep 17 00:00:00 2001 From: Zuby Afzal <65686164+keyprocedure@users.noreply.github.com> Date: Sat, 2 Aug 2025 18:33:27 -0700 Subject: [PATCH 04/10] Register _clone_dim_order and add operator tests --- exir/passes/dim_order_ops_registry.py | 18 +++++++ exir/tests/test_memory_format_ops_pass.py | 58 ++++++++++++----------- 2 files changed, 48 insertions(+), 28 deletions(-) diff --git a/exir/passes/dim_order_ops_registry.py b/exir/passes/dim_order_ops_registry.py index f3fc009f109..e9f7fcf190c 100644 --- a/exir/passes/dim_order_ops_registry.py +++ b/exir/passes/dim_order_ops_registry.py @@ -28,6 +28,14 @@ "_empty_dim_order.out(int[] size, *, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!)" ) +lib.define( + "_clone_dim_order(Tensor self, *, bool non_blocking=False, int[]? dim_order=None) -> Tensor" +) + +lib.define( + "_clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]? dim_order=None, Tensor(a!) 
out) -> Tensor(a!)" +) + def _op_impl(target, *args, **kwargs): kwargs["memory_format"] = get_memory_format(kwargs.get("dim_order", None)) @@ -57,6 +65,16 @@ def _empty_dim_order_out_impl(*args, **kwargs): return _op_impl(torch.ops.aten.empty.out, *args, **kwargs) +@impl(lib, "_clone_dim_order", "CompositeImplicitAutograd") +def _clone_dim_order_impl(*args, **kwargs): + return _op_impl(torch.ops.aten.clone.default, *args, **kwargs) + + +@impl(lib, "_clone_dim_order.out", "CompositeImplicitAutograd") +def _clone_dim_order_out_impl(*args, **kwargs): + return _op_impl(torch.ops.aten.clone.out, *args, **kwargs) + + """ Defines a map of edge ops to the corresponding dim_order ops for quick lookup """ diff --git a/exir/tests/test_memory_format_ops_pass.py b/exir/tests/test_memory_format_ops_pass.py index f6fa22f5c1c..0172026be74 100644 --- a/exir/tests/test_memory_format_ops_pass.py +++ b/exir/tests/test_memory_format_ops_pass.py @@ -28,8 +28,6 @@ MemoryFormatOpsPassTestUtils, MemoryFormatTestSet, PropagateToCopyChannalsLastModule, - SimpleCloneChannelsLastModule, - SimpleCloneContiguousModule, SimpleEmptyChannelLastModule, SimpleEmptyContiguoustModule, SimpleToCopyChannelsLastModule, @@ -93,36 +91,40 @@ def test_op_empty_replacement_contiguous(self) -> None: ), ) - def test_op_clone_replacement_contiguous(self) -> None: - model = SimpleCloneContiguousModule() - MemoryFormatOpsPassTestUtils.memory_format_test_runner( - self, - MemoryFormatTestSet( - module=model.eval(), - op=torch.ops.aten.clone.default, - sample_input=( - torch.randn((3, 4, 5, 6)).to(memory_format=torch.channels_last), - ), - target_memory_format=torch.contiguous_format, - _load_for_executorch_from_buffer=_load_for_executorch_from_buffer, - ), + def test_op_clone_dim_order_preserves_channels_last(self): + x = torch.randn(2, 3, 4, 5).to(memory_format=torch.channels_last) + y = torch.ops.dim_order_ops._clone_dim_order.default(x) + + assert y.is_contiguous( + memory_format=torch.channels_last + ), 
"_clone_dim_order output is not in channels_last memory format." + assert torch.allclose(x, y) + + def test_op_clone_dim_order_to_contiguous(self): + x = torch.randn(2, 3, 4, 5).to(memory_format=torch.channels_last) + contiguous_dim_order = get_dim_order(torch.contiguous_format, x.dim()) + y = torch.ops.dim_order_ops._clone_dim_order.default( + x, dim_order=contiguous_dim_order ) - def test_op_clone_replacement_channels_last(self) -> None: - model = SimpleCloneChannelsLastModule() - MemoryFormatOpsPassTestUtils.memory_format_test_runner( - self, - MemoryFormatTestSet( - module=model.eval(), - op=torch.ops.aten.clone.default, - sample_input=( - torch.randn((3, 4, 5, 6)).to(memory_format=torch.contiguous_format), - ), - target_memory_format=torch.channels_last, - _load_for_executorch_from_buffer=_load_for_executorch_from_buffer, - ), + assert ( + y.is_contiguous() + ), "_clone_dim_order output is not in contiguous memory format" + assert torch.allclose(x, y) + + def test_op_clone_dim_order_out_to_channels_last(self): + x = torch.randn(2, 3, 4, 5).contiguous() + y = torch.empty_like(x, memory_format=torch.channels_last) + channels_last_dim_order = get_dim_order(torch.channels_last, y.dim()) + torch.ops.dim_order_ops._clone_dim_order.out( + x, dim_order=channels_last_dim_order, out=y ) + assert y.is_contiguous( + memory_format=torch.channels_last + ), "_clone_dim_order output is not in channels_last memory format" + assert torch.allclose(x, y) + def test_op_dim_order_update(self) -> None: MemoryFormatOpsPassTestUtils.memory_format_test_runner( self, From b4e7f7a649d8208128969cd968491171e1042499 Mon Sep 17 00:00:00 2001 From: Zuby Afzal <65686164+keyprocedure@users.noreply.github.com> Date: Mon, 4 Aug 2025 16:03:06 -0700 Subject: [PATCH 05/10] Add broadcast_util dep for copy_ops_util.h --- kernels/portable/cpu/util/targets.bzl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl 
index 1806ebb0d5a..b61dab7eb07 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -149,6 +149,7 @@ def define_common_targets(): compiler_flags = ["-Wno-missing-prototypes"], deps = [ "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu/util:broadcast_util", ], visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/optimized/cpu/..."], ) @@ -348,7 +349,6 @@ def define_common_targets(): ], ) - runtime.cxx_library( name = "arange_util{}".format(suffix), srcs = ["arange_util.cpp"], From a4f98ac15fb5acdb812d6b977ccd63e3019005e6 Mon Sep 17 00:00:00 2001 From: Zuby Afzal <65686164+keyprocedure@users.noreply.github.com> Date: Mon, 4 Aug 2025 16:09:55 -0700 Subject: [PATCH 06/10] Remove Python side registration and tests for _clone_dim_order --- exir/passes/dim_order_ops_registry.py | 18 ------------ exir/tests/test_memory_format_ops_pass.py | 34 ----------------------- 2 files changed, 52 deletions(-) diff --git a/exir/passes/dim_order_ops_registry.py b/exir/passes/dim_order_ops_registry.py index e9f7fcf190c..f3fc009f109 100644 --- a/exir/passes/dim_order_ops_registry.py +++ b/exir/passes/dim_order_ops_registry.py @@ -28,14 +28,6 @@ "_empty_dim_order.out(int[] size, *, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!)" ) -lib.define( - "_clone_dim_order(Tensor self, *, bool non_blocking=False, int[]? dim_order=None) -> Tensor" -) - -lib.define( - "_clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]? dim_order=None, Tensor(a!) 
out) -> Tensor(a!)" -) - def _op_impl(target, *args, **kwargs): kwargs["memory_format"] = get_memory_format(kwargs.get("dim_order", None)) @@ -65,16 +57,6 @@ def _empty_dim_order_out_impl(*args, **kwargs): return _op_impl(torch.ops.aten.empty.out, *args, **kwargs) -@impl(lib, "_clone_dim_order", "CompositeImplicitAutograd") -def _clone_dim_order_impl(*args, **kwargs): - return _op_impl(torch.ops.aten.clone.default, *args, **kwargs) - - -@impl(lib, "_clone_dim_order.out", "CompositeImplicitAutograd") -def _clone_dim_order_out_impl(*args, **kwargs): - return _op_impl(torch.ops.aten.clone.out, *args, **kwargs) - - """ Defines a map of edge ops to the corresponding dim_order ops for quick lookup """ diff --git a/exir/tests/test_memory_format_ops_pass.py b/exir/tests/test_memory_format_ops_pass.py index 0172026be74..84cd0faa485 100644 --- a/exir/tests/test_memory_format_ops_pass.py +++ b/exir/tests/test_memory_format_ops_pass.py @@ -91,40 +91,6 @@ def test_op_empty_replacement_contiguous(self) -> None: ), ) - def test_op_clone_dim_order_preserves_channels_last(self): - x = torch.randn(2, 3, 4, 5).to(memory_format=torch.channels_last) - y = torch.ops.dim_order_ops._clone_dim_order.default(x) - - assert y.is_contiguous( - memory_format=torch.channels_last - ), "_clone_dim_order output is not in channels_last memory format." 
- assert torch.allclose(x, y) - - def test_op_clone_dim_order_to_contiguous(self): - x = torch.randn(2, 3, 4, 5).to(memory_format=torch.channels_last) - contiguous_dim_order = get_dim_order(torch.contiguous_format, x.dim()) - y = torch.ops.dim_order_ops._clone_dim_order.default( - x, dim_order=contiguous_dim_order - ) - - assert ( - y.is_contiguous() - ), "_clone_dim_order output is not in contiguous memory format" - assert torch.allclose(x, y) - - def test_op_clone_dim_order_out_to_channels_last(self): - x = torch.randn(2, 3, 4, 5).contiguous() - y = torch.empty_like(x, memory_format=torch.channels_last) - channels_last_dim_order = get_dim_order(torch.channels_last, y.dim()) - torch.ops.dim_order_ops._clone_dim_order.out( - x, dim_order=channels_last_dim_order, out=y - ) - - assert y.is_contiguous( - memory_format=torch.channels_last - ), "_clone_dim_order output is not in channels_last memory format" - assert torch.allclose(x, y) - def test_op_dim_order_update(self) -> None: MemoryFormatOpsPassTestUtils.memory_format_test_runner( self, From 9d45191950304d4a5a359d28dc72baa7dff535f6 Mon Sep 17 00:00:00 2001 From: Zuby Afzal <65686164+keyprocedure@users.noreply.github.com> Date: Tue, 5 Aug 2025 16:44:07 -0700 Subject: [PATCH 07/10] Add _clone_dim_order runtime test --- kernels/test/CMakeLists.txt | 1 + kernels/test/op__clone_dim_order_test.cpp | 379 ++++++++++++++++++++++ kernels/test/targets.bzl | 1 + 3 files changed, 381 insertions(+) create mode 100644 kernels/test/op__clone_dim_order_test.cpp diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt index f5997a1ee3f..f4e8d0ee311 100644 --- a/kernels/test/CMakeLists.txt +++ b/kernels/test/CMakeLists.txt @@ -108,6 +108,7 @@ add_custom_target( set(all_test_sources "BinaryLogicalOpTest.cpp" "op__to_dim_order_copy_test.cpp" + "op__clone_dim_order_test.cpp" "op_abs_test.cpp" "op_acos_test.cpp" "op_acosh_test.cpp" diff --git a/kernels/test/op__clone_dim_order_test.cpp 
b/kernels/test/op__clone_dim_order_test.cpp new file mode 100644 index 00000000000..1f2319d2f4d --- /dev/null +++ b/kernels/test/op__clone_dim_order_test.cpp @@ -0,0 +1,379 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include // Declares the operator. +#include +#include +#include +#include +#include + +#include + +using namespace ::testing; +using executorch::aten::ArrayRef; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using std::optional; +using torch::executor::testing::TensorFactory; + +class OpDimOrderCloneTest : public OperatorTest { + protected: + Tensor& op__clone_dim_order_out( + const Tensor& self, + bool non_blocking, + std::optional> dim_order, + Tensor& out) { + return torch::executor::dim_order_ops::_clone_dim_order_outf( + context_, self, non_blocking, dim_order, out); + } + + template + std::vector vector_type_cast(std::vector input) { + std::vector output(input.size()); + std::transform( + input.begin(), input.end(), output.begin(), [](INPUT_CTYPE x) { + return static_cast(x); + }); + return output; + } + + template + struct ToTestCase { + const std::vector sizes; + const std::vector data_in; + const std::vector data_out; + }; + + template + void test_runner_clone(std::vector> test_cases) { + TensorFactory tf_in; + TensorFactory tf_out; + + for (const auto& test_case : test_cases) { + auto data_in = vector_type_cast(test_case.data_in); + + Tensor input = tf_in.make(test_case.sizes, data_in); + Tensor output = tf_out.zeros_like(input); + + std::vector dim_order_vec; + for (int64_t i = 0; i < input.dim(); i++) { + dim_order_vec.push_back(i); + } + ArrayRef dim_order(dim_order_vec.data(), dim_order_vec.size()); + + Tensor ret = op__clone_dim_order_out( + /*self=*/input, + /*non_blocking=*/false, + 
dim_order, + output); + + Tensor expected = tf_out.make(test_case.sizes, data_in); + + // Verifies that the returned and output tensor from _clone_dim_order both + // match the original input (expected). + EXPECT_TENSOR_EQ(ret, output); + EXPECT_TENSOR_EQ(ret, expected); + } + } + + /* %python + import torch + torch.manual_seed(0) + x = torch.rand(2, 3) + res = x.clone(memory_format = torch.preserve_format) + op = "op__clone_dim_order_out" + opt_setup_params = """ + bool non_blocking = false; + optional memory_format; + """ + opt_extra_params = "non_blocking, memory_format," + out_args = "out_shape, dynamism" + dtype = "ScalarType::Float" + check = "EXPECT_TENSOR_EQ" */ + + // Helper for testing dynamic shape outputs. + void test_dynamic_shape( + const std::vector& out_shape, + enum torch::executor::TensorShapeDynamism dynamism) { + /* %python + %rewrite(unary_op) */ + + TensorFactory tf; + + Tensor x = tf.make( + {2, 3}, + {0.49625658988952637, + 0.7682217955589294, + 0.08847743272781372, + 0.13203048706054688, + 0.30742281675338745, + 0.6340786814689636}); + Tensor expected = tf.make( + {2, 3}, + {0.49625658988952637, + 0.7682217955589294, + 0.08847743272781372, + 0.13203048706054688, + 0.30742281675338745, + 0.6340786814689636}); + + bool non_blocking = false; + + Tensor out = tf.zeros(out_shape, dynamism); + + std::vector dim_order_vec; + for (int64_t i = 0; i < x.dim(); i++) { + dim_order_vec.push_back(i); + } + ArrayRef dim_order(dim_order_vec.data(), dim_order_vec.size()); + + Tensor ret = op__clone_dim_order_out( + /*self=*/x, non_blocking, dim_order, out); + + EXPECT_TENSOR_EQ(out, expected); + EXPECT_TENSOR_EQ(ret, expected); + } +}; + +// Clones tensors of all real dtypes. 
+TEST_F(OpDimOrderCloneTest, AllDtypesSupported) { + std::vector> test_cases = { + { + /*sizes=*/{2, 4}, + /*data_in=*/{2.11, 3.2, 2.3, 4.0, 1.1, 5.2, 1.1, 6.3}, + /*data_out=*/{}, // data_out shouldn't be used in test_runner_clone + }, + { + /*sizes=*/{3, 4, 0, 5}, + /*data_in=*/{}, + /*data_out=*/{}, + }, + { + /*sizes=*/{}, + /*data_in=*/{10.0}, + /*data_out=*/{}, // data_out shouldn't be used in test_runner_clone + }, + }; + +#define TEST_KERNEL(CTYPE, DTYPE) \ + test_runner_clone(test_cases); + + ET_FORALL_REAL_TYPES(TEST_KERNEL); + +#undef TEST_KERNEL +} + +// Cloning with mismatched input and output tensor shapes should fail. +TEST_F(OpDimOrderCloneTest, MismatchedSizesDie) { + if (torch::executor::testing::SupportedFeatures::get()->is_aten) { + GTEST_SKIP() << "Skipping: ATen kernel supports mismatched sizes."; + } + TensorFactory tf; + Tensor input = tf.make(/*sizes=*/{3, 1, 1, 2}, /*data=*/{1, 2, 3, 4, 5, 6}); + Tensor out = tf.zeros({3, 2, 1, 1}); + std::vector dim_order_vec; + for (int64_t i = 0; i < input.dim(); i++) { + dim_order_vec.push_back(i); + } + ArrayRef dim_order(dim_order_vec.data(), dim_order_vec.size()); + + ET_EXPECT_KERNEL_FAILURE( + context_, + op__clone_dim_order_out( + /*self=*/input, + /*non_blocking=*/false, + dim_order, + out)); +} + +// Cloning with a non-contiguous memory format should fail. +TEST_F(OpDimOrderCloneTest, MismatchedMemoryFormatDies) { + if (torch::executor::testing::SupportedFeatures::get()->is_aten) { + GTEST_SKIP() + << "Skipping: ATen kernel supports non-contiguous memory formats."; + } + TensorFactory tf_in; + TensorFactory tf_out; + Tensor input = + tf_in.make(/*sizes=*/{3, 1, 1, 2}, /*data=*/{1, 2, 3, 4, 5, 6}); + Tensor out = tf_out.zeros({3, 1, 1, 2}); + + std::vector dim_order_vec; + for (int64_t i = 0; i < input.dim(); i++) { + dim_order_vec.push_back(i); + } + + // Mutate dim_order_vec to create an illegal dim_order. 
+ dim_order_vec[1] = 3; + dim_order_vec[3] = 1; + ArrayRef dim_order(dim_order_vec.data(), dim_order_vec.size()); + + ET_EXPECT_KERNEL_FAILURE( + context_, + op__clone_dim_order_out( + /*self=*/input, + /*non_blocking=*/false, + dim_order, + out)); +} + +// Cloning with non-blocking=true should fail because portable kernels only +// support blocking. +TEST_F(OpDimOrderCloneTest, MismatchedBlockingDie) { + if (torch::executor::testing::SupportedFeatures::get()->is_aten) { + GTEST_SKIP() + << "Skipping: ATen kernel supports non-blocking data transfer."; + } + TensorFactory tf; + Tensor input = tf.make(/*sizes=*/{3, 1, 1, 2}, /*data=*/{1, 2, 3, 4, 5, 6}); + Tensor out = tf.zeros(/*sizes=*/{3, 1, 1, 2}); + + std::vector dim_order_vec; + for (int64_t i = 0; i < input.dim(); i++) { + dim_order_vec.push_back(i); + } + ArrayRef dim_order(dim_order_vec.data(), dim_order_vec.size()); + + ET_EXPECT_KERNEL_FAILURE( + context_, + op__clone_dim_order_out( + /*self=*/input, + /*non_blocking=*/true, + dim_order, + out)); +} + +TEST_F(OpDimOrderCloneTest, DynamicShapeUpperBoundSameAsExpected) { + test_dynamic_shape( + {2, 3}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); +} + +TEST_F(OpDimOrderCloneTest, DynamicShapeUpperBoundLargerThanExpected) { + test_dynamic_shape( + {10, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); +} + +TEST_F(OpDimOrderCloneTest, DynamicShapeUnbound) { + if (!torch::executor::testing::SupportedFeatures::get()->output_resize) { + GTEST_SKIP() << "Dynamic shape unbound not supported."; + } + test_dynamic_shape( + {1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND); +} + +TEST_F(OpDimOrderCloneTest, ContiguousToChannelsLast) { + TensorFactory tf; + + Tensor x = tf.make_with_dimorder( + {3, 5, 2, 2}, + {0.2432, 0.5248, 0.5361, 0.8513, 0.8184, 0.8206, 0.7357, 0.9655, 0.6138, + 0.1112, 0.2799, 0.1079, 0.9680, 0.2548, 0.0393, 0.6002, 0.2257, 0.8766, + 0.2715, 0.1595, 0.2029, 0.7026, 0.6982, 0.8529, 0.4405, 0.6560, 0.9217, + 0.6372, 
0.2446, 0.6590, 0.3866, 0.7185, 0.4439, 0.5346, 0.3179, 0.4492, + 0.3491, 0.6970, 0.8456, 0.2516, 0.2345, 0.2924, 0.7695, 0.0911, 0.8530, + 0.8560, 0.6909, 0.7719, 0.8923, 0.5546, 0.6978, 0.8151, 0.3007, 0.3961, + 0.8416, 0.4296, 0.7203, 0.8963, 0.3597, 0.5552}); + + Tensor out = tf.full_channels_last({3, 5, 2, 2}, 0.0); + Tensor expected = tf.make_with_dimorder( + {3, 5, 2, 2}, + {0.2432, 0.8184, 0.6138, 0.9680, 0.2257, 0.5248, 0.8206, 0.1112, 0.2548, + 0.8766, 0.5361, 0.7357, 0.2799, 0.0393, 0.2715, 0.8513, 0.9655, 0.1079, + 0.6002, 0.1595, 0.2029, 0.4405, 0.2446, 0.4439, 0.3491, 0.7026, 0.6560, + 0.6590, 0.5346, 0.6970, 0.6982, 0.9217, 0.3866, 0.3179, 0.8456, 0.8529, + 0.6372, 0.7185, 0.4492, 0.2516, 0.2345, 0.8530, 0.8923, 0.3007, 0.7203, + 0.2924, 0.8560, 0.5546, 0.3961, 0.8963, 0.7695, 0.6909, 0.6978, 0.8416, + 0.3597, 0.0911, 0.7719, 0.8151, 0.4296, 0.5552}, + /*dim_order=*/{0, 2, 3, 1}); + + std::vector dim_order_vec = {0, 2, 3, 1}; + executorch::aten::ArrayRef dim_order( + dim_order_vec.data(), dim_order_vec.size()); + Tensor ret = op__clone_dim_order_out( + /*self*/ x, /*non_blocking*/ false, /*dim_order*/ dim_order, out); + + EXPECT_TENSOR_EQ(out, expected); + EXPECT_TENSOR_EQ(ret, expected); +} + +TEST_F(OpDimOrderCloneTest, ChannelsLastToContiguous) { + TensorFactory tf; + + Tensor out = tf.full({3, 5, 2, 2}, 0.0); + Tensor x = tf.make_with_dimorder( + {3, 5, 2, 2}, + {0.2432, 0.8184, 0.6138, 0.9680, 0.2257, 0.5248, 0.8206, 0.1112, 0.2548, + 0.8766, 0.5361, 0.7357, 0.2799, 0.0393, 0.2715, 0.8513, 0.9655, 0.1079, + 0.6002, 0.1595, 0.2029, 0.4405, 0.2446, 0.4439, 0.3491, 0.7026, 0.6560, + 0.6590, 0.5346, 0.6970, 0.6982, 0.9217, 0.3866, 0.3179, 0.8456, 0.8529, + 0.6372, 0.7185, 0.4492, 0.2516, 0.2345, 0.8530, 0.8923, 0.3007, 0.7203, + 0.2924, 0.8560, 0.5546, 0.3961, 0.8963, 0.7695, 0.6909, 0.6978, 0.8416, + 0.3597, 0.0911, 0.7719, 0.8151, 0.4296, 0.5552}, + /*dim_order=*/{0, 2, 3, 1}); + + Tensor expected = tf.make_with_dimorder( + {3, 5, 2, 2}, + 
{0.2432, 0.5248, 0.5361, 0.8513, 0.8184, 0.8206, 0.7357, 0.9655, 0.6138, + 0.1112, 0.2799, 0.1079, 0.9680, 0.2548, 0.0393, 0.6002, 0.2257, 0.8766, + 0.2715, 0.1595, 0.2029, 0.7026, 0.6982, 0.8529, 0.4405, 0.6560, 0.9217, + 0.6372, 0.2446, 0.6590, 0.3866, 0.7185, 0.4439, 0.5346, 0.3179, 0.4492, + 0.3491, 0.6970, 0.8456, 0.2516, 0.2345, 0.2924, 0.7695, 0.0911, 0.8530, + 0.8560, 0.6909, 0.7719, 0.8923, 0.5546, 0.6978, 0.8151, 0.3007, 0.3961, + 0.8416, 0.4296, 0.7203, 0.8963, 0.3597, 0.5552}); + + std::vector dim_order_vec = {0, 1, 2, 3}; + executorch::aten::ArrayRef dim_order( + dim_order_vec.data(), dim_order_vec.size()); + Tensor ret = op__clone_dim_order_out( + /*self*/ x, /*non_blocking*/ false, /*dim_order*/ dim_order, out); + + EXPECT_TENSOR_EQ(out, expected); + EXPECT_TENSOR_EQ(ret, expected); +} + +TEST_F(OpDimOrderCloneTest, PreserveChannelsLast) { + TensorFactory tf; + + Tensor out = tf.full_channels_last({3, 5, 2, 2}, 0.0); + Tensor x = tf.make_with_dimorder( + {3, 5, 2, 2}, + {0.2432, 0.8184, 0.6138, 0.9680, 0.2257, 0.5248, 0.8206, 0.1112, 0.2548, + 0.8766, 0.5361, 0.7357, 0.2799, 0.0393, 0.2715, 0.8513, 0.9655, 0.1079, + 0.6002, 0.1595, 0.2029, 0.4405, 0.2446, 0.4439, 0.3491, 0.7026, 0.6560, + 0.6590, 0.5346, 0.6970, 0.6982, 0.9217, 0.3866, 0.3179, 0.8456, 0.8529, + 0.6372, 0.7185, 0.4492, 0.2516, 0.2345, 0.8530, 0.8923, 0.3007, 0.7203, + 0.2924, 0.8560, 0.5546, 0.3961, 0.8963, 0.7695, 0.6909, 0.6978, 0.8416, + 0.3597, 0.0911, 0.7719, 0.8151, 0.4296, 0.5552}, + /*dim_order=*/{0, 2, 3, 1}); + + Tensor expected = tf.make_with_dimorder( + {3, 5, 2, 2}, + {0.2432, 0.8184, 0.6138, 0.9680, 0.2257, 0.5248, 0.8206, 0.1112, 0.2548, + 0.8766, 0.5361, 0.7357, 0.2799, 0.0393, 0.2715, 0.8513, 0.9655, 0.1079, + 0.6002, 0.1595, 0.2029, 0.4405, 0.2446, 0.4439, 0.3491, 0.7026, 0.6560, + 0.6590, 0.5346, 0.6970, 0.6982, 0.9217, 0.3866, 0.3179, 0.8456, 0.8529, + 0.6372, 0.7185, 0.4492, 0.2516, 0.2345, 0.8530, 0.8923, 0.3007, 0.7203, + 0.2924, 0.8560, 0.5546, 0.3961, 0.8963, 
0.7695, 0.6909, 0.6978, 0.8416, + 0.3597, 0.0911, 0.7719, 0.8151, 0.4296, 0.5552}, + /*dim_order=*/{0, 2, 3, 1}); + + Tensor ret = op__clone_dim_order_out( + /*self*/ x, + /*non_blocking*/ false, + /*dim_order*/ executorch::aten::nullopt, + out); + + EXPECT_TENSOR_EQ(out, expected); + EXPECT_TENSOR_EQ(ret, expected); +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index 60dabac1844..8ab55c170fd 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -177,6 +177,7 @@ def define_common_targets(): _common_op_test("op__to_dim_order_copy_test", ["aten", "portable"]) _common_op_test("op__empty_dim_order_test", ["aten", "portable"]) + _common_op_test("op__clone_dim_order_test", ["portable"]) _common_op_test("op_abs_test", ["aten", "portable"]) _common_op_test("op_acos_test", ["aten", "portable"]) _common_op_test("op_acosh_test", ["aten", "portable"]) From e1865bf17ae0e9c7343ddb4c8803047ed1edfb26 Mon Sep 17 00:00:00 2001 From: Zuby Afzal <65686164+keyprocedure@users.noreply.github.com> Date: Wed, 6 Aug 2025 12:24:13 -0700 Subject: [PATCH 08/10] Clarify dim_order format and remove unrelated comments --- kernels/test/op__clone_dim_order_test.cpp | 26 ++++++----------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/kernels/test/op__clone_dim_order_test.cpp b/kernels/test/op__clone_dim_order_test.cpp index 1f2319d2f4d..d999897cdf3 100644 --- a/kernels/test/op__clone_dim_order_test.cpp +++ b/kernels/test/op__clone_dim_order_test.cpp @@ -87,28 +87,10 @@ class OpDimOrderCloneTest : public OperatorTest { } } - /* %python - import torch - torch.manual_seed(0) - x = torch.rand(2, 3) - res = x.clone(memory_format = torch.preserve_format) - op = "op__clone_dim_order_out" - opt_setup_params = """ - bool non_blocking = false; - optional memory_format; - """ - opt_extra_params = "non_blocking, memory_format," - out_args = "out_shape, dynamism" - dtype = "ScalarType::Float" - check = "EXPECT_TENSOR_EQ" */ - // Helper for testing 
dynamic shape outputs. void test_dynamic_shape( const std::vector& out_shape, enum torch::executor::TensorShapeDynamism dynamism) { - /* %python - %rewrite(unary_op) */ - TensorFactory tf; Tensor x = tf.make( @@ -197,7 +179,7 @@ TEST_F(OpDimOrderCloneTest, MismatchedSizesDie) { out)); } -// Cloning with a non-contiguous memory format should fail. +// Cloning with an unsupported memory format should fail. TEST_F(OpDimOrderCloneTest, MismatchedMemoryFormatDies) { if (torch::executor::testing::SupportedFeatures::get()->is_aten) { GTEST_SKIP() @@ -266,7 +248,7 @@ TEST_F(OpDimOrderCloneTest, DynamicShapeUpperBoundLargerThanExpected) { TEST_F(OpDimOrderCloneTest, DynamicShapeUnbound) { if (!torch::executor::testing::SupportedFeatures::get()->output_resize) { - GTEST_SKIP() << "Dynamic shape unbound not supported."; + GTEST_SKIP() << "Skipping: Dynamic shape unbound not supported."; } test_dynamic_shape( {1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND); @@ -275,6 +257,8 @@ TEST_F(OpDimOrderCloneTest, DynamicShapeUnbound) { TEST_F(OpDimOrderCloneTest, ContiguousToChannelsLast) { TensorFactory tf; + // x is in contiguous dim order {0, 1, 2, 3}. + // make_with_dimorder() defaults to contiguous when dim_order isn't specified. Tensor x = tf.make_with_dimorder( {3, 5, 2, 2}, {0.2432, 0.5248, 0.5361, 0.8513, 0.8184, 0.8206, 0.7357, 0.9655, 0.6138, @@ -311,6 +295,8 @@ TEST_F(OpDimOrderCloneTest, ChannelsLastToContiguous) { TensorFactory tf; Tensor out = tf.full({3, 5, 2, 2}, 0.0); + + // x is in channels_last dim order {0, 2, 3, 1}. 
Tensor x = tf.make_with_dimorder( {3, 5, 2, 2}, {0.2432, 0.8184, 0.6138, 0.9680, 0.2257, 0.5248, 0.8206, 0.1112, 0.2548, From 19d14e161bdd3b9c0b1fa232ac19484710595b2f Mon Sep 17 00:00:00 2001 From: Zuby Afzal <65686164+keyprocedure@users.noreply.github.com> Date: Wed, 6 Aug 2025 22:08:46 -0700 Subject: [PATCH 09/10] Move broadcast_util to exported_deps for copy_ops_util --- kernels/portable/cpu/util/targets.bzl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index b61dab7eb07..8194b37f319 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -147,9 +147,11 @@ def define_common_targets(): "copy_ops_util.h", ], compiler_flags = ["-Wno-missing-prototypes"], + exported_deps = [ + ":broadcast_util", + ], deps = [ "//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu/util:broadcast_util", ], visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/optimized/cpu/..."], ) From 0100fd41c67cb3fcd4c50c9626186e379d7dd27c Mon Sep 17 00:00:00 2001 From: Zuby Afzal <65686164+keyprocedure@users.noreply.github.com> Date: Fri, 8 Aug 2025 14:57:25 -0700 Subject: [PATCH 10/10] Add op__clone_dim_order to op_library --- .../executorch/kernels/portable/op_registration_util.bzl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl index 73dfafdc65d..3df05b3651a 100644 --- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -1329,6 +1329,13 @@ ATEN_OPS = ( "//executorch/kernels/portable/cpu/util:copy_ops_util", ], ), + op_target( + name = "op__clone_dim_order", + deps = [ + ":scalar_utils", + "//executorch/kernels/portable/cpu/util:copy_ops_util", + ], + ), ) # Operators 
that are not listed in `functions.yaml` (i.e., operators listed in