From df69dfd92b805bf3a4cc1c8af493f6e859dc3d48 Mon Sep 17 00:00:00 2001 From: Rushi-cad Date: Tue, 26 Nov 2024 19:02:00 -0800 Subject: [PATCH 1/3] Integrated temporary mem alloc functionality in place of malloc --- backends/cadence/aot/functions_hifi.yaml | 13 ++++++--- backends/cadence/hifi/kernels/kernels.cpp | 5 ++++ backends/cadence/hifi/kernels/kernels.h | 8 +++++- .../cadence/hifi/operators/CMakeLists.txt | 1 + .../cadence/hifi/operators/op_maximum.cpp | 2 ++ backends/cadence/hifi/operators/op_mean.cpp | 4 ++- backends/cadence/hifi/operators/op_pow.cpp | 14 +++++++--- backends/cadence/hifi/operators/op_where.cpp | 6 ++-- .../executor_runner/executor_runner.cpp | 28 ++++++++++++++++++- 9 files changed, 68 insertions(+), 13 deletions(-) diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index 0f3e582884c..ac981d321a6 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -61,11 +61,16 @@ kernels: - arg_meta: null kernel_name: cadence::impl::HiFi::full_out + +- op: gt.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::gt_scalar_out - op: maximum.out kernels: - arg_meta: null - kernel_name: impl::HiFi::maximum_out + kernel_name: cadence::impl::HiFi::maximum_out - op: mean.out kernels: @@ -90,17 +95,17 @@ - op: pow.Scalar_out kernels: - arg_meta: null - kernel_name: impl::HiFi::pow_Scalar_out + kernel_name: cadence::impl::HiFi::pow_Scalar_out - op: pow.Tensor_Scalar_out kernels: - arg_meta: null - kernel_name: impl::HiFi::pow_Tensor_Scalar_out + kernel_name: cadence::impl::HiFi::pow_Tensor_Scalar_out - op: pow.Tensor_Tensor_out kernels: - arg_meta: null - kernel_name: impl::HiFi::pow_Tensor_Tensor_out + kernel_name: cadence::impl::HiFi::pow_Tensor_Tensor_out - op: rsqrt.out kernels: diff --git a/backends/cadence/hifi/kernels/kernels.cpp b/backends/cadence/hifi/kernels/kernels.cpp index 10e5fb176e0..0934afde29f 100644 --- a/backends/cadence/hifi/kernels/kernels.cpp +++ b/backends/cadence/hifi/kernels/kernels.cpp @@ -20,6 +20,11 @@ memcpy(void* dst, const void* src, size_t num_bytes) { MEMCPY_8b(dst, src, num_bytes); } +void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size) { + Result temp_mem_res = ctx.allocate_temp(size); + return temp_mem_res.ok() ? temp_mem_res.get() : nullptr; +} + // Quantize a fp32 value to an int8_t/uint8_t value template __attribute__((always_inline)) T diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 9a4689c17c2..e72284b99de 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -14,8 +14,12 @@ /* For NNLIB APIs */ #include "xa_nnlib_kernels_api.h" -/* Potential NNLIB function/APIs */ +#include + +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::Result; +/* Potential NNLIB function/APIs */ extern "C" WORD32 xa_nn_broadcast_32_32( WORD32* __restrict__ p_out, const int* const out_shape, @@ -149,6 +153,8 @@ namespace impl { namespace HiFi { namespace kernels { +void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size); + void memcpy(void* dst, const void* src, size_t num_bytes); WORD32 matmul_asym8uxasym8u_asym8u( diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index c01dad5ce80..c66e55f0f42 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -38,6 +38,7 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gt.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp" diff --git a/backends/cadence/hifi/operators/op_maximum.cpp b/backends/cadence/hifi/operators/op_maximum.cpp index 97578765cff..f85d3470e93 100644 --- a/backends/cadence/hifi/operators/op_maximum.cpp +++ b/backends/cadence/hifi/operators/op_maximum.cpp @@ -23,6 +23,7 @@ using torch::executor::apply_binary_elementwise_fn; using torch::executor::Error; using torch::executor::resize_to_broadcast_target_size; +namespace cadence { namespace impl { namespace HiFi { namespace native { @@ -170,3 +171,4 @@ Tensor& maximum_out( } // namespace native } // namespace HiFi } // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_mean.cpp b/backends/cadence/hifi/operators/op_mean.cpp index 478e10da712..cdc844ec5c7 100644 --- a/backends/cadence/hifi/operators/op_mean.cpp +++ b/backends/cadence/hifi/operators/op_mean.cpp @@ -125,7 +125,9 @@ Tensor& mean_dim_out( int scratch_size = xa_nn_reduce_getsize_nhwc( -3, inp_shape, num_inp_dims, p_axis, num_axis_dims, 1); - void* __restrict__ p_scratch_in = (void* __restrict__)malloc(scratch_size); + void* __restrict__ p_scratch_in = + (void* __restrict__)kernels::allocate_temp_memory( + ctx, scratch_size * sizeof(int)); xa_nn_reduce_mean_4D_f32_f32( p_out, diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp index 04533b290bf..74c24afbc0e 100644 --- a/backends/cadence/hifi/operators/op_pow.cpp +++ b/backends/cadence/hifi/operators/op_pow.cpp @@ -26,6 +26,7 @@ using executorch::runtime::promoteTypes; using torch::executor::Error; using torch::executor::resize_to_broadcast_target_size; +namespace cadence { namespace impl { namespace HiFi { namespace native { @@ -119,9 +120,11 @@ Tensor& pow_Tensor_Tensor_out( if (optimized) { if (broadcast) { WORD32* __restrict__ ptr1 = - (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + (WORD32* __restrict__)kernels::allocate_temp_memory( + ctx, num_elm * sizeof(int)); WORD32* __restrict__ ptr2 = - (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + (WORD32* __restrict__)kernels::allocate_temp_memory( + ctx, num_elm * sizeof(int)); WORD32* __restrict__ pin1 = (WORD32* __restrict__)a.const_data_ptr(); @@ -154,7 +157,8 @@ Tensor& pow_Tensor_Tensor_out( free(ptr2); } else if (a_is_broadcasted && (!b_is_broadcasted)) { FLOAT32* __restrict__ ptr1 = - (FLOAT32* __restrict__)malloc((num_elm + 2) * sizeof(WORD32)); + (FLOAT32* __restrict__)kernels::allocate_temp_memory( + ctx, num_elm * sizeof(int)); FLOAT32* __restrict__ pin1 = (FLOAT32* __restrict__)a.const_data_ptr(); @@ -181,7 +185,8 @@ Tensor& pow_Tensor_Tensor_out( free(ptr1); } else if (b_is_broadcasted && (!a_is_broadcasted)) { WORD32* __restrict__ ptr1 = - (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + (WORD32* __restrict__)kernels::allocate_temp_memory( + ctx, num_elm * sizeof(int)); WORD32* __restrict__ pin1 = (WORD32* __restrict__)b.const_data_ptr(); @@ -349,3 +354,4 @@ Tensor& pow_Scalar_out( } // namespace native } // namespace HiFi } // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_where.cpp b/backends/cadence/hifi/operators/op_where.cpp index 06bd0bc3c9f..c4ad8177cfe 100644 --- a/backends/cadence/hifi/operators/op_where.cpp +++ b/backends/cadence/hifi/operators/op_where.cpp @@ -109,8 +109,10 @@ Tensor& where_out( if (con_shape[0] != out_shape[0] || con_shape[1] != out_shape[1] || con_shape[2] != out_shape[2] || con_shape[3] != out_shape[3]) { - void* p_scratch = - malloc(out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]); + void* p_scratch = (void*)kernels::allocate_temp_memory( + ctx, + (out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]) * + sizeof(int)); const unsigned char* p_brd_cond = (const unsigned char*)p_scratch; xa_nn_broadcast_8_8( (WORD8* __restrict__)p_brd_cond, diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index 93c150c0b90..a2476f9165a 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -30,8 +30,16 @@ #include #include +#if __XTENSA__ +#include +#include +#include +#endif + static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4 MB +static uint8_t temp_allocator_pool[1024U * 1024U]; + DEFINE_string( model_path, "model.pte", @@ -120,6 +128,10 @@ int main(int argc, char** argv) { MemoryAllocator method_allocator{ MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)}; + // Temporary memory required by kernels + MemoryAllocator temp_allocator{ + MemoryAllocator(sizeof(temp_allocator_pool), temp_allocator_pool)}; + // The memory-planned buffers will back the mutable tensors used by the // method. The sizes of these buffers were determined ahead of time during the // memory-planning pasees. @@ -144,7 +156,7 @@ int main(int argc, char** argv) { // Assemble all of the allocators into the MemoryManager that the Executor // will use. - MemoryManager memory_manager(&method_allocator, &planned_memory); + MemoryManager memory_manager(&method_allocator, &planned_memory, &temp_allocator); // // Load the method from the program, using the provided allocators. Running @@ -170,8 +182,22 @@ int main(int argc, char** argv) { (uint32_t)inputs.error()); ET_LOG(Info, "Inputs prepared."); +#if __XTENSA__ + struct tms start, stop; + xt_iss_client_command("all", "disable"); + xt_iss_client_command("all", "enable"); + times(&start); +#endif + // Run the model. Error status = method->execute(); + +#if __XTENSA__ + times(&stop); + xt_iss_client_command("all", "disable"); + ET_LOG(Info, "Execute cycles = %ld", (stop.tms_utime - start.tms_utime)); +#endif + ET_CHECK_MSG( status == Error::Ok, "Execution of method %s failed with status 0x%" PRIx32, From a23b646b2a75fbd5f7a475ff8607b0d20ed6a7a5 Mon Sep 17 00:00:00 2001 From: Rushi-cad Date: Tue, 26 Nov 2024 21:00:37 -0800 Subject: [PATCH 2/3] Namespace related changes --- backends/cadence/aot/functions_hifi.yaml | 6 +++--- backends/cadence/hifi/operators/op_add.cpp | 2 ++ backends/cadence/hifi/operators/op_minimum.cpp | 2 ++ backends/cadence/hifi/operators/op_rsqrt.cpp | 2 ++ 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index ac981d321a6..edb34924164 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -25,7 +25,7 @@ - op: add.out kernels: - arg_meta: null - kernel_name: impl::HiFi::add_out + kernel_name: cadence::impl::HiFi::add_out - op: bmm.out kernels: @@ -80,7 +80,7 @@ - op: minimum.out kernels: - arg_meta: null - kernel_name: impl::HiFi::minimum_out + kernel_name: cadence::impl::HiFi::minimum_out - op: mul.out kernels: @@ -110,7 +110,7 @@ - op: rsqrt.out kernels: - arg_meta: null - kernel_name: impl::HiFi::rsqrt_out + kernel_name: cadence::impl::HiFi::rsqrt_out - op: sigmoid.out kernels: diff --git a/backends/cadence/hifi/operators/op_add.cpp b/backends/cadence/hifi/operators/op_add.cpp index 10e06938f2e..43cb0d8cd62 100644 --- a/backends/cadence/hifi/operators/op_add.cpp +++ b/backends/cadence/hifi/operators/op_add.cpp @@ -22,6 +22,7 @@ using executorch::runtime::CppTypeToScalarType; using executorch::runtime::KernelRuntimeContext; using torch::executor::Error; +namespace cadence { namespace impl { namespace HiFi { namespace native { @@ -202,3 +203,4 @@ Tensor& add_out( } // namespace native } // namespace HiFi } // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_minimum.cpp b/backends/cadence/hifi/operators/op_minimum.cpp index fd9cfe4f95f..6f81ad5c3e3 100644 --- a/backends/cadence/hifi/operators/op_minimum.cpp +++ b/backends/cadence/hifi/operators/op_minimum.cpp @@ -23,6 +23,7 @@ using torch::executor::apply_binary_elementwise_fn; using torch::executor::Error; using torch::executor::resize_to_broadcast_target_size; +namespace cadence { namespace impl { namespace HiFi { namespace native { @@ -169,3 +170,4 @@ Tensor& minimum_out( } // namespace native } // namespace HiFi } // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_rsqrt.cpp b/backends/cadence/hifi/operators/op_rsqrt.cpp index c94800aef14..13d8133355c 100644 --- a/backends/cadence/hifi/operators/op_rsqrt.cpp +++ b/backends/cadence/hifi/operators/op_rsqrt.cpp @@ -15,6 +15,7 @@ using exec_aten::ScalarType; using exec_aten::Tensor; using executorch::aten::RuntimeContext; +namespace cadence { namespace impl { namespace HiFi { namespace native { @@ -51,3 +52,4 @@ Tensor& rsqrt_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { } // namespace native } // namespace HiFi } // namespace impl +} // namespace cadence \ No newline at end of file From 5d7aea8d1391c3d0b3512aad5222266e7d3a8e7b Mon Sep 17 00:00:00 2001 From: Rushi-cad Date: Tue, 26 Nov 2024 21:36:11 -0800 Subject: [PATCH 3/3] Cleanup the main application --- .../executor_runner/executor_runner.cpp | 22 ++----------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index a2476f9165a..514a82c0ff0 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -30,12 +30,6 @@ #include #include -#if __XTENSA__ -#include -#include -#include -#endif - static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4 MB static uint8_t temp_allocator_pool[1024U * 1024U]; @@ -156,7 +150,8 @@ int main(int argc, char** argv) { // Assemble all of the allocators into the MemoryManager that the Executor // will use. - MemoryManager memory_manager(&method_allocator, &planned_memory, &temp_allocator); + MemoryManager memory_manager( + &method_allocator, &planned_memory, &temp_allocator); // // Load the method from the program, using the provided allocators. Running @@ -182,22 +177,9 @@ int main(int argc, char** argv) { (uint32_t)inputs.error()); ET_LOG(Info, "Inputs prepared."); -#if __XTENSA__ - struct tms start, stop; - xt_iss_client_command("all", "disable"); - xt_iss_client_command("all", "enable"); - times(&start); -#endif - // Run the model. Error status = method->execute(); -#if __XTENSA__ - times(&stop); - xt_iss_client_command("all", "disable"); - ET_LOG(Info, "Execute cycles = %ld", (stop.tms_utime - start.tms_utime)); -#endif - ET_CHECK_MSG( status == Error::Ok, "Execution of method %s failed with status 0x%" PRIx32,