From 7f44e16937c8835800afa87f2772b9a507f3a3fa Mon Sep 17 00:00:00 2001
From: Rob Elliott <robert.elliott@arm.com>
Date: Wed, 27 Sep 2023 08:34:22 +0000
Subject: [PATCH 01/20] Add ethos-u-core-driver submodule

Signed-off-by: Rob Elliott <robert.elliott@arm.com>
---
 .gitmodules                                  | 3 +++
 backends/arm/third-party/ethos-u-core-driver | 1 +
 2 files changed, 4 insertions(+)
 create mode 160000 backends/arm/third-party/ethos-u-core-driver

diff --git a/.gitmodules b/.gitmodules
index 980a999eff0..aac8050326d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -31,3 +31,6 @@
 [submodule "backends/arm/third-party/serialization_lib"]
 	path = backends/arm/third-party/serialization_lib
 	url = https://git.mlplatform.org/tosa/serialization_lib.git
+[submodule "backends/arm/third-party/ethos-u-core-driver"]
+	path = backends/arm/third-party/ethos-u-core-driver
+	url = https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git
diff --git a/backends/arm/third-party/ethos-u-core-driver b/backends/arm/third-party/ethos-u-core-driver
new file mode 160000
index 00000000000..90f9df900ac
--- /dev/null
+++ b/backends/arm/third-party/ethos-u-core-driver
@@ -0,0 +1 @@
+Subproject commit 90f9df900acdc0718ecd2dfdc53780664758dec5

From c6755bf00500e2712558c8aaaf8e33d14d4c3edc Mon Sep 17 00:00:00 2001
From: Rob Elliott <robert.elliott@arm.com>
Date: Wed, 27 Sep 2023 11:45:33 +0000
Subject: [PATCH 02/20] Added shell of runtime Arm Backend for Ethos-U

* basic build system amendments
* toolchain file for baremetal platforms
* use of ethos-u-core-driver submodule
* some scripts to pull a compiler and test-build the backend

Signed-off-by: Rob Elliott <robert.elliott@arm.com>
---
 CMakeLists.txt                             |  8 ++
 backends/arm/CMakeLists.txt                | 25 ++++++
 backends/arm/cmake/Dependencies.cmake      | 12 +++
 backends/arm/cmake/arm-none-eabi-gcc.cmake | 90 ++++++++++++++++++++++
 backends/arm/cmake/build.sh                | 47 +++++++++++
 backends/arm/cmake/toolchain.sh            | 11 +++
 backends/arm/runtime/ArmBackendEthosU.cpp  | 58 ++++++++++++++
 schema/CMakeLists.txt                      |  2 +-
 8 files changed, 252 insertions(+), 1 deletion(-)
 create mode 100644 backends/arm/CMakeLists.txt
 create mode 100644 backends/arm/cmake/Dependencies.cmake
 create mode 100644 backends/arm/cmake/arm-none-eabi-gcc.cmake
 create mode 100755 backends/arm/cmake/build.sh
 create mode 100755 backends/arm/cmake/toolchain.sh
 create mode 100644 backends/arm/runtime/ArmBackendEthosU.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3883c991bf2..a61450f8895 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -93,6 +93,10 @@ if(BUILD_SELECTIVE_BUILD_TEST)
   option(SELECT_OPS_YAML "Register all the ops from a given yaml file" OFF)
 endif()
 
+# Build Arm Baremetal backend
+option(EXECUTORCH_BUILD_ARM_BAREMETAL
+       "Build the Arm Baremetal flow for Cortex-M and Ethos-U" OFF)
+
 # Build xnn_executor_runner which depends on XNNPACK
 option(EXECUTORCH_BUILD_XNNPACK
        "Build xnn_executor_runner which depends on XNNPACK" OFF)
@@ -282,6 +286,10 @@ if(EXECUTORCH_BUILD_XNNPACK)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack)
 endif()
 
+if(EXECUTORCH_BUILD_ARM_BAREMETAL)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm)
+endif()
+
 # Add selective build subdirectory
 if(BUILD_SELECTIVE_BUILD_TEST)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/examples/selective_build)
diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt
new file mode 100644
index 00000000000..2cc5cf94740
--- /dev/null
+++ b/backends/arm/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Copyright 2023 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+cmake_minimum_required(VERSION 3.19)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+
+include(${EXECUTORCH_ROOT}/build/Utils.cmake)
+
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+set(_common_compile_options -Wno-deprecated-declarations)
+
+include(cmake/Dependencies.cmake)
+
+set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp)
+list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")
+add_library(ethos_u STATIC ${_arm_baremetal_sources})
+target_include_directories(ethos_u PUBLIC ${_common_include_directories})
+target_include_directories(ethos_u PUBLIC ${DRIVER_ETHOSU_INCLUDE_DIR})
diff --git a/backends/arm/cmake/Dependencies.cmake b/backends/arm/cmake/Dependencies.cmake
new file mode 100644
index 00000000000..27a587176bb
--- /dev/null
+++ b/backends/arm/cmake/Dependencies.cmake
@@ -0,0 +1,12 @@
+# Copyright 2023 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
+
+# Ethos-U driver
+set(DRIVER_ETHOSU_SOURCE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver")
+set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include")
+add_subdirectory( ${DRIVER_ETHOSU_SOURCE_DIR} )
+include_directories( ${DRIVER_ETHOSU_INCLUDE_DIR} )
diff --git a/backends/arm/cmake/arm-none-eabi-gcc.cmake b/backends/arm/cmake/arm-none-eabi-gcc.cmake
new file mode 100644
index 00000000000..d70f79361cd
--- /dev/null
+++ b/backends/arm/cmake/arm-none-eabi-gcc.cmake
@@ -0,0 +1,90 @@
+# Copyright 2023 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set(TARGET_CPU "cortex-m4" CACHE STRING "Target CPU")
+string(TOLOWER ${TARGET_CPU} CMAKE_SYSTEM_PROCESSOR)
+
+set(CMAKE_SYSTEM_NAME Generic)
+set(CMAKE_C_COMPILER "arm-none-eabi-gcc")
+set(CMAKE_CXX_COMPILER "arm-none-eabi-g++")
+set(CMAKE_ASM_COMPILER "arm-none-eabi-gcc")
+set(CMAKE_LINKER "arm-none-eabi-ld")
+
+set(CMAKE_EXECUTABLE_SUFFIX ".elf")
+set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+# Select C/C++ version
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_CXX_STANDARD 14)
+
+set(GCC_CPU ${CMAKE_SYSTEM_PROCESSOR})
+string(REPLACE "cortex-m85" "cortex-m55" GCC_CPU ${GCC_CPU})
+
+# Compile options
+add_compile_options(
+    -mcpu=${GCC_CPU}
+    -mthumb
+    "$<$<CONFIG:DEBUG>:-gdwarf-3>"
+    "$<$<COMPILE_LANGUAGE:CXX>:-fno-unwind-tables;-fno-rtti;-fno-exceptions>"
+    -fdata-sections
+    -ffunction-sections)
+
+# Compile defines
+add_compile_definitions(
+    "$<$<NOT:$<CONFIG:DEBUG>>:NDEBUG>")
+
+# Link options
+add_link_options(
+    -mcpu=${GCC_CPU}
+    -mthumb
+    --specs=nosys.specs)
+
+# Set floating point unit
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+fp")
+    set(FLOAT hard)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+nofp")
+    set(FLOAT soft)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m33(\\+|$)" OR
+       CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m55(\\+|$)" OR
+       CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m85(\\+|$)")
+    set(FLOAT hard)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m4(\\+|$)" OR
+       CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m7(\\+|$)")
+    set(FLOAT hard)
+    set(FPU_CONFIG "fpv4-sp-d16")
+    add_compile_options(-mfpu=${FPU_CONFIG})
+    add_link_options(-mfpu=${FPU_CONFIG})
+else()
+    set(FLOAT soft)
+endif()
+
+if (FLOAT)
+    add_compile_options(-mfloat-abi=${FLOAT})
+    add_link_options(-mfloat-abi=${FLOAT})
+endif()
+
+add_link_options(LINKER:--nmagic,--gc-sections)
+
+# Compilation warnings
+add_compile_options(
+#    -Wall
+#    -Wextra
+
+#    -Wcast-align
+#    -Wdouble-promotion
+#    -Wformat
+#    -Wmissing-field-initializers
+#    -Wnull-dereference
+#    -Wredundant-decls
+#    -Wshadow
+#    -Wswitch
+#    -Wswitch-default
+#    -Wunused
+    -Wno-redundant-decls
+    -Wno-psabi
+)
diff --git a/backends/arm/cmake/build.sh b/backends/arm/cmake/build.sh
new file mode 100755
index 00000000000..490a358c9a1
--- /dev/null
+++ b/backends/arm/cmake/build.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Copyright 2023 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+set -e
+
+#
+# Setup toolchain
+#
+BASEDIR=`realpath $(dirname "$0")`
+echo "building using build.sh in $BASEDIR"
+
+GCCPATH=${BASEDIR}/arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi/bin/
+echo $GCCPATH
+if test -d "${GCCPATH}"; then
+	echo Using exising compiler ${GCCPATH}
+else
+	pushd ${BASEDIR}/
+	./toolchain.sh
+	popd
+fi
+export PATH=${PATH}:${GCCPATH}
+
+echo building with `arm-none-eabi-gcc -v 2>&1 | grep "^gcc"`
+
+
+#
+# Prepare and run clean build
+#
+rm -rf buck-out/ build/lib/ cmake-out/
+rm -rf cmake-corstone
+mkdir cmake-corstone
+cd cmake-corstone
+
+#cmake -DBUCK2=buck2 ..
+
+#cmake --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake ..
+cmake -DFLATC_EXECUTABLE=flatc \
+	  -DEXECUTORCH_BUILD_HOST_TARGETS=OFF \
+	  -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \
+	  --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake \
+	  ..
+# -DCMAKE_TOOLCHAIN_FILE=backends/arm/cmake/arm-none-eabi-gcc.cmake \
+
+cd ..
+cmake --build cmake-corstone -j1 --target ethos_u
diff --git a/backends/arm/cmake/toolchain.sh b/backends/arm/cmake/toolchain.sh
new file mode 100755
index 00000000000..7fd4abcc781
--- /dev/null
+++ b/backends/arm/cmake/toolchain.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Copyright 2023 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+set -e
+
+# Cross compiler for Arm baremetal (e.g. Corstone-300 FVP or silicon)
+curl -o gcc.tar.xz https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi.tar.xz
+tar xf gcc.tar.xz
+export PATH=${PATH}:`(cd arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi/bin/; pwd)`
diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp
new file mode 100644
index 00000000000..fcf10567b26
--- /dev/null
+++ b/backends/arm/runtime/ArmBackendEthosU.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright 2023 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+ * Arm backend for Ethos-U baremetal driver stack relies on ethos-u-core-driver
+ */
+
+#include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+
+#include <ethosu_driver.h>
+#include <pmu_ethosu.h>
+
+namespace torch {
+namespace executor {
+
+class ArmBackend final : public PyTorchBackendInterface {
+
+public:
+	~ArmBackend() = default;
+
+	virtual bool is_available() const override {
+		return 1;
+	}
+
+	Result<DelegateHandle*> init(
+		BackendInitContext& context,
+		FreeableBuffer* processed,
+		ArrayRef<CompileSpec> compile_specs) const override {
+		return Error::Ok;
+	}
+
+	Error execute(
+		BackendExecutionContext& context,
+		DelegateHandle* handle,
+		EValue** args) const override {
+		return Error::Ok;
+	}
+
+	void destroy(DelegateHandle* handle) const override {
+		return;
+	}
+
+};
+
+namespace {
+	auto backend = ArmBackend();
+	Backend backend_id{"ArmBackend", &backend};
+	static auto registered = register_backend(backend_id);
+} // namespace 
+
+} // namespace executor
+} // namespace torch
diff --git a/schema/CMakeLists.txt b/schema/CMakeLists.txt
index 0c7dc2cbec4..55c07fd5f7b 100644
--- a/schema/CMakeLists.txt
+++ b/schema/CMakeLists.txt
@@ -41,7 +41,7 @@ add_custom_command(
     -o "${_program_schema__include_dir}/executorch/schema"
     ${_program_schema__srcs}
   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-  DEPENDS ${FLATC_EXECUTABLE} ${_program_schema__srcs}
+  DEPENDS ${_program_schema__srcs}
   COMMENT "Generating program_schema headers"
   VERBATIM)
 

From 6421ead4363e9815703b6c4fdf5033cd712a8de4 Mon Sep 17 00:00:00 2001
From: Rob Elliott <robert.elliott@arm.com>
Date: Wed, 27 Sep 2023 14:33:14 +0000
Subject: [PATCH 03/20] First-pass ethos-u backend with an assumed flat program
 format in SRAM, this will be reworked as wrapped ethos buffers in .pte become
 available

Signed-off-by: Rob Elliott <robert.elliott@arm.com>
---
 backends/arm/runtime/ArmBackendEthosU.cpp |  76 +++++++++-
 backends/arm/runtime/command_stream.cpp   | 169 ++++++++++++++++++++++
 backends/arm/runtime/command_stream.hpp   | 120 +++++++++++++++
 3 files changed, 363 insertions(+), 2 deletions(-)
 create mode 100644 backends/arm/runtime/command_stream.cpp
 create mode 100644 backends/arm/runtime/command_stream.hpp

diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp
index fcf10567b26..186e3318f61 100644
--- a/backends/arm/runtime/ArmBackendEthosU.cpp
+++ b/backends/arm/runtime/ArmBackendEthosU.cpp
@@ -9,6 +9,8 @@
  * Arm backend for Ethos-U baremetal driver stack relies on ethos-u-core-driver
  */
 
+#include <memory>
+
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
@@ -16,6 +18,16 @@
 #include <ethosu_driver.h>
 #include <pmu_ethosu.h>
 
+#include "command_stream.hpp"
+using namespace EthosU::CommandStream;
+
+// ETHOS_U_ALIGN is the low-bit mask of the required 16-byte alignment; ethos_align() rounds ptr UP to it
+#define ETHOS_U_ALIGN 0xF
+char *ethos_align( char *ptr )
+{
+	return (char*)(~(uintptr_t)ETHOS_U_ALIGN & ((uintptr_t)ptr + ETHOS_U_ALIGN)); // add the full mask: mask-1 rounded 0x..1 down
+}
+
 namespace torch {
 namespace executor {
 
@@ -32,13 +44,73 @@ class ArmBackend final : public PyTorchBackendInterface {
 		BackendInitContext& context,
 		FreeableBuffer* processed,
 		ArrayRef<CompileSpec> compile_specs) const override {
-		return Error::Ok;
+
+		printf("ArmBackend::init 0x%X\n", processed->data());
+
+		char *data = (char*)processed->data();
+		size_t size = processed->size();
+		
+		//the model should have been placed in sram with
+		//__attribute__((section(".sram.data"), aligned(16)))
+		void *aligned = ethos_align(data);
+		if( data != ethos_align(data)) return Error::InvalidProgram;
+
+		// TODO: Verify address range is accessible to Ethos-U
+		// current expectation is the program is in SRAM
+		if(0) return Error::InvalidProgram;
+		
+		// Return the same buffer we were passed - this data will be
+		// executed directly
+		return processed;
 	}
 
 	Error execute(
 		BackendExecutionContext& context,
-		DelegateHandle* handle,
+		DelegateHandle* input_handle,
 		EValue** args) const override {
+
+		FreeableBuffer* processed = (FreeableBuffer*)input_handle;
+
+		printf("ArmBackend::execute 0x%X\n", processed->data());
+
+		// Command stream - we know at this point it's aligned
+		char *handle = (char*)processed->data();
+		int command_stream_length = ((int*)handle)[0];
+		char *command_stream = ethos_align(handle+sizeof(int));
+		
+		// Static tensors/weights/model data
+		handle = ethos_align( command_stream + command_stream_length );
+		int weight_data_length = ((int*)handle)[0];
+		char *weight_data = ethos_align(handle+sizeof(int));
+
+		// Activation data, input and output memory
+		handle = ethos_align( weight_data + weight_data_length );
+		int activation_data_length = ((int*)handle)[0];
+		char *activation_data = ethos_align(handle+sizeof(int));
+
+
+		// Invoke driver using the above pointers
+		CommandStream cs(
+			DataPointer(command_stream, command_stream_length),
+			BasePointers({
+					DataPointer(weight_data, weight_data_length),
+					DataPointer(activation_data, activation_data_length)
+				}),
+			PmuEvents({ETHOSU_PMU_CYCLE, ETHOSU_PMU_NPU_IDLE, ETHOSU_PMU_NPU_ACTIVE})
+			);
+
+		cs.getPmu().clear();
+		int res = cs.run(1);
+		if(res == 0)
+		{
+			uint64_t cycleCount = cs.getPmu().getCycleCount();
+			cs.getPmu().print();
+			printf("cycleCount=%llu, cycleCountPerJob=%llu\n", cycleCount, cycleCount);
+		} else {
+			printf("Error, failure executing job\n");
+			return Error::InvalidProgram;
+		}
+	
 		return Error::Ok;
 	}
 
diff --git a/backends/arm/runtime/command_stream.cpp b/backends/arm/runtime/command_stream.cpp
new file mode 100644
index 00000000000..d2e62ce629a
--- /dev/null
+++ b/backends/arm/runtime/command_stream.cpp
@@ -0,0 +1,169 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/****************************************************************************
+ * Includes
+ ****************************************************************************/
+
+#include "command_stream.hpp"
+
+#include <inttypes.h>
+#include <stdio.h>
+
+using namespace std;
+
+namespace EthosU {
+namespace CommandStream {
+
+/****************************************************************************
+ * DataPointer
+ ****************************************************************************/
+
+DataPointer::DataPointer() : data(nullptr), size(0) {}
+
+DataPointer::DataPointer(const char *_data, size_t _size) : data(_data), size(_size) {}
+
+bool DataPointer::operator!=(const DataPointer &other) {
+    if (size != other.size) {
+        return true;
+    }
+
+    for (size_t i = 0; i < size; i++) {
+        if (data[i] != other.data[i]) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/****************************************************************************
+ * PmuConfig
+ ****************************************************************************/
+
+Pmu::Pmu(ethosu_driver *_drv, const PmuEvents &_config) : drv(_drv), config(_config) {
+    // Enable PMU block
+    ETHOSU_PMU_Enable(drv);
+
+    // Enable cycle counter
+    ETHOSU_PMU_CNTR_Enable(drv, ETHOSU_PMU_CCNT_Msk);
+
+    // Configure event types
+    for (size_t i = 0; i < config.size(); i++) {
+        ETHOSU_PMU_Set_EVTYPER(drv, i, config[i]);
+        ETHOSU_PMU_CNTR_Enable(drv, 1u << i);
+    }
+}
+
+void Pmu::clear() {
+    ETHOSU_PMU_CYCCNT_Reset(drv);
+    ETHOSU_PMU_EVCNTR_ALL_Reset(drv);
+}
+
+void Pmu::print() {
+    printf("PMU={cycleCount=%llu, events=[%" PRIu32 ", %" PRIu32 ", %" PRIu32 ", %" PRIu32 "]}\n",
+           ETHOSU_PMU_Get_CCNTR(drv),
+           ETHOSU_PMU_Get_EVCNTR(drv, 0),
+           ETHOSU_PMU_Get_EVCNTR(drv, 1),
+           ETHOSU_PMU_Get_EVCNTR(drv, 2),
+           ETHOSU_PMU_Get_EVCNTR(drv, 3));
+}
+
+uint64_t Pmu::getCycleCount() const {
+    return ETHOSU_PMU_Get_CCNTR(drv);
+}
+
+uint32_t Pmu::getEventCount(size_t index) const {
+    return ETHOSU_PMU_Get_EVCNTR(drv, index);
+}
+
+/****************************************************************************
+ * CommandStream
+ ****************************************************************************/
+
+CommandStream::CommandStream(const DataPointer &_commandStream,
+                             const BasePointers &_basePointers,
+                             const PmuEvents &_pmuEvents) :
+    drv(ethosu_reserve_driver()),
+    commandStream(_commandStream), basePointers(_basePointers), pmu(drv, _pmuEvents) {}
+
+CommandStream::~CommandStream() {
+    ethosu_release_driver(drv);
+}
+
+int CommandStream::run(size_t repeat) {
+    // Base pointer array
+    uint64_t baseAddress[ETHOSU_BASEP_INDEXES];
+    size_t baseAddressSize[ETHOSU_BASEP_INDEXES];
+    for (size_t i = 0; i < ETHOSU_BASEP_INDEXES; i++) {
+        baseAddress[i]     = reinterpret_cast<uint64_t>(basePointers[i].data);
+        baseAddressSize[i] = reinterpret_cast<size_t>(basePointers[i].size);
+    }
+
+    while (repeat-- > 0) {
+        int error = ethosu_invoke_v3(
+            drv, commandStream.data, commandStream.size, baseAddress, baseAddressSize, ETHOSU_BASEP_INDEXES, nullptr);
+
+        if (error != 0) {
+            printf("Inference failed. error=%d\n", error);
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+int CommandStream::run_async() {
+    // Base pointer array
+    uint64_t baseAddress[ETHOSU_BASEP_INDEXES];
+    size_t baseAddressSize[ETHOSU_BASEP_INDEXES];
+
+    for (size_t i = 0; i < ETHOSU_BASEP_INDEXES; i++) {
+        baseAddress[i]     = reinterpret_cast<uint64_t>(basePointers[i].data);
+        baseAddressSize[i] = reinterpret_cast<size_t>(basePointers[i].size);
+    }
+
+    int error = ethosu_invoke_async(
+        drv, commandStream.data, commandStream.size, baseAddress, baseAddressSize, ETHOSU_BASEP_INDEXES, nullptr);
+
+    if (error != 0) {
+        printf("Inference invoke async failed. error=%d\n", error);
+        return 1;
+    }
+
+    return 0;
+}
+
+int CommandStream::wait_async(bool block) {
+    return ethosu_wait(drv, block);
+}
+
+DataPointer &CommandStream::getCommandStream() {
+    return commandStream;
+}
+
+BasePointers &CommandStream::getBasePointers() {
+    return basePointers;
+}
+
+Pmu &CommandStream::getPmu() {
+    return pmu;
+}
+
+}; // namespace CommandStream
+}; // namespace EthosU
diff --git a/backends/arm/runtime/command_stream.hpp b/backends/arm/runtime/command_stream.hpp
new file mode 100644
index 00000000000..7163b9d58ca
--- /dev/null
+++ b/backends/arm/runtime/command_stream.hpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef COMMAND_STREAM_HPP
+#define COMMAND_STREAM_HPP
+
+/****************************************************************************
+ * Includes
+ ****************************************************************************/
+
+#include <array>
+#include <ethosu_driver.h>
+#include <pmu_ethosu.h>
+#include <stddef.h>
+
+/****************************************************************************
+ * Defines
+ ****************************************************************************/
+
+#ifndef ETHOSU_BASEP_INDEXES
+#define ETHOSU_BASEP_INDEXES 8
+#endif
+
+/****************************************************************************
+ * Types
+ ****************************************************************************/
+
+namespace EthosU {
+namespace CommandStream {
+
+/****************************************************************************
+ * DataPointer
+ ****************************************************************************/
+
+struct DataPointer {
+    DataPointer();
+    DataPointer(const char *_data, size_t _size);
+
+    bool operator!=(const DataPointer &other);
+
+    const char *data;
+    size_t size;
+};
+
+/****************************************************************************
+ * Pmu
+ ****************************************************************************/
+
+using PmuEvents = std::array<ethosu_pmu_event_type, ETHOSU_PMU_NCOUNTERS>;
+
+class Pmu {
+public:
+    Pmu(ethosu_driver *_drv, const PmuEvents &_config = {});
+
+    void clear();
+    void print();
+
+    uint64_t getCycleCount() const;
+    uint32_t getEventCount(size_t index) const;
+
+private:
+    ethosu_driver *drv;
+    PmuEvents config;
+};
+
+/****************************************************************************
+ * CommandStream
+ ****************************************************************************/
+
+using BasePointers = std::array<DataPointer, ETHOSU_BASEP_INDEXES>;
+
+class CommandStream {
+public:
+    CommandStream(const DataPointer &_commandStream,
+                  const BasePointers &_pointers = {},
+                  const PmuEvents &_pmuEvents   = {});
+    virtual ~CommandStream();
+
+    int run(size_t repeat = 1);
+    int run_async();
+    int wait_async(bool block = true);
+
+    DataPointer &getCommandStream();
+    BasePointers &getBasePointers();
+    Pmu &getPmu();
+
+private:
+    ethosu_driver *drv;
+    DataPointer commandStream;
+    BasePointers basePointers;
+    Pmu pmu;
+};
+
+#define DRIVER_ACTION_MAGIC() 'C', 'O', 'P', '1',
+
+#define DRIVER_ACTION_COMMAND_STREAM(length) 0x02, (length >> 16) & 0xff, length & 0xff, (length >> 8) & 0xff,
+
+#define DRIVER_ACTION_NOP() 0x05, 0x00, 0x00, 0x00,
+// First byte is the high byte of mask: needs bitwise '&' (logical '&&' collapsed it to 0 or 1)
+#define NPU_OP_STOP(mask) ((mask) >> 8) & 0xff, (mask) & 0xff, 0x08, 0x00,
+
+}; // namespace CommandStream
+}; // namespace EthosU
+
+#endif /* COMMAND_STREAM_HPP */

From 1f62accbf566cbecbe5daf2ac61c42d089c59430 Mon Sep 17 00:00:00 2001
From: Rob Elliott <robert.elliott@arm.com>
Date: Wed, 27 Sep 2023 15:40:36 +0000
Subject: [PATCH 04/20] fixed builds of ethos-u-core-driver

* Added CMSIS submodule dependency
* Added command_stream.cpp into the build
* Added the target to the build script

Signed-off-by: Rob Elliott <robert.elliott@arm.com>
---
 .gitmodules                    | 3 +++
 backends/arm/CMakeLists.txt    | 2 +-
 backends/arm/cmake/build.sh    | 4 +++-
 backends/arm/third-party/cmsis | 1 +
 4 files changed, 8 insertions(+), 2 deletions(-)
 create mode 160000 backends/arm/third-party/cmsis

diff --git a/.gitmodules b/.gitmodules
index aac8050326d..3138391f7c0 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -34,3 +34,6 @@
 [submodule "backends/arm/third-party/ethos-u-core-driver"]
 	path = backends/arm/third-party/ethos-u-core-driver
 	url = https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git
+[submodule "backends/arm/third-party/cmsis"]
+	path = backends/arm/third-party/cmsis
+	url = https://github.com/ARM-software/CMSIS_5.git
diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt
index 2cc5cf94740..6d6cd1938b7 100644
--- a/backends/arm/CMakeLists.txt
+++ b/backends/arm/CMakeLists.txt
@@ -18,7 +18,7 @@ set(_common_compile_options -Wno-deprecated-declarations)
 
 include(cmake/Dependencies.cmake)
 
-set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp)
+set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp backends/arm/runtime/command_stream.cpp)
 list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")
 add_library(ethos_u STATIC ${_arm_baremetal_sources})
 target_include_directories(ethos_u PUBLIC ${_common_include_directories})
diff --git a/backends/arm/cmake/build.sh b/backends/arm/cmake/build.sh
index 490a358c9a1..353b90125c6 100755
--- a/backends/arm/cmake/build.sh
+++ b/backends/arm/cmake/build.sh
@@ -39,9 +39,11 @@ cd cmake-corstone
 cmake -DFLATC_EXECUTABLE=flatc \
 	  -DEXECUTORCH_BUILD_HOST_TARGETS=OFF \
 	  -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \
+	  -DTARGET_CPU=cortex-m55+nodsp+nofp \
+	  -DETHOSU_TARGET_NPU_CONFIG=ethos-u55-128 \
 	  --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake \
 	  ..
 # -DCMAKE_TOOLCHAIN_FILE=backends/arm/cmake/arm-none-eabi-gcc.cmake \
 
 cd ..
-cmake --build cmake-corstone -j1 --target ethos_u
+cmake --build cmake-corstone -j1 --target ethos_u ethosu_core_driver
diff --git a/backends/arm/third-party/cmsis b/backends/arm/third-party/cmsis
new file mode 160000
index 00000000000..a75f01746df
--- /dev/null
+++ b/backends/arm/third-party/cmsis
@@ -0,0 +1 @@
+Subproject commit a75f01746df18bb5b929dfb8dc6c9407fac3a0f3

From 83a5e32b09a858c02590b6e348d5c06e35d0b866 Mon Sep 17 00:00:00 2001
From: Rob Elliott <robert.elliott@arm.com>
Date: Thu, 28 Sep 2023 10:57:12 +0000
Subject: [PATCH 05/20] Emit Ethos-U55 chunked binaries from preprocess

 * vela binaries returned from preprocess
 * Included in PTE captured from arm_tosa_e2e
 * currently assumes vela is on path

Signed-off-by: Rob Elliott <robert.elliott@arm.com>
---
 backends/arm/arm_backend.py  | 52 +++++++++++++++++++++++++++++++-----
 examples/arm/arm_tosa_e2e.py |  2 +-
 2 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
index 1a6499cf07d..1ec28bdeb21 100644
--- a/backends/arm/arm_backend.py
+++ b/backends/arm/arm_backend.py
@@ -13,6 +13,7 @@
 import operator
 import os
 import tempfile
+import subprocess
 from typing import final, List
 
 import numpy as np
@@ -140,6 +141,47 @@ def dbg_tosa_dump(tosa_fb, path):
     f.write(js)
     f.close()
 
+# Output to Vela with current file-based compilation
+# WARNING: if this changes, the runtime reader also needs to change
+def vela_compile(tosa_fb):
+    """Serialize tosa_fb, compile it with the vela CLI, and pack the resulting NPZ regions into one byte stream."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        print(f"compiling to Vela in {tmpdir}")
+
+        tosaname = "out.tosa"
+        flatbuffer = tosa_fb.serialize()
+        with open(os.path.join(tmpdir,tosaname), "wb") as f:
+            f.write(flatbuffer)
+
+        # invoke vela
+        # TODO target ethos-u55-128
+        vela_command = ["vela", "--accelerator-config", "ethos-u55-128", tosaname]
+        subprocess.run(vela_command, cwd=tmpdir, check=True)
+
+        np_path = os.path.join(tmpdir,"output","out_sg0_vela.npz")
+        blocks = b''
+        with np.load(np_path, allow_pickle=False) as data:
+            # Emit the NPZ regions as:
+            #  - 16 byte block name null terminated string (padded to 16 if name shorter)
+            #  - 4 bytes of int32 block length and 12 bytes of 0's
+            #  - block data (padded to 16 byte alignment at end)
+            # Repeat for all blocks
+            for key in data.keys():
+                block_name = bytes(key,"utf8")[:15]
+                block_name = block_name + b'\x00'*(16-len(block_name))
+                block_data = data[key].tobytes()
+                # We need the actual unpadded block lengths for hw setup
+                block_length = len(block_data).to_bytes(16, 'little')
+                # pad up to the next multiple of 16 bytes; add nothing if already aligned
+                block_data = block_data + b'\x00'*(-len(block_data)%16)
+
+                block = block_name + block_length + block_data
+                blocks = blocks + block
+
+        # return 16 byte VELA bin header + blocks + footer
+        header = bytes("vela_bin_stream","utf-8") + b'\x00'
+        footer = bytes("vela_end_stream","utf-8") + b'\x00'
+        return header + blocks + footer
 
 def dbg_fail(node, tosa_fb, path):
     dbg_tosa_dump(tosa_fb, path)
@@ -205,10 +247,6 @@ def preprocess(  # noqa: C901
                 path = spec.value.decode()
                 debug_output = True
 
-        # in non debug builds we still pass files to vela
-        if path is None:
-            path = tempfile.mkdtemp(prefix="arm_tosa_")
-
         # Converted output for this subgraph, serializer needs path early as it emits
         # const data directly. Path created and data written only in debug builds.
         tosa_fb = ts.TosaSerializer(path)
@@ -680,5 +718,7 @@ def preprocess(  # noqa: C901
             dbg_tosa_dump(tosa_fb, path)
 
         # Serialize and return the tosa flatbuffer
-        fb = tosa_fb.serialize()
-        return PreprocessResult(processed_bytes=bytes(fb))
+        # fb = bytes(tosa_fb.serialize())
+        binary = vela_compile(tosa_fb)
+        
+        return PreprocessResult(processed_bytes=binary)
diff --git a/examples/arm/arm_tosa_e2e.py b/examples/arm/arm_tosa_e2e.py
index e320ca0cf4e..a9e07bed4c9 100644
--- a/examples/arm/arm_tosa_e2e.py
+++ b/examples/arm/arm_tosa_e2e.py
@@ -153,7 +153,7 @@ def tosa_run_test(op, profile=TosaProfile.MI):  # noqa: C901
 # Temp systest mode for running all models against both inference profiles
 if __name__ == "__main__":
     for op in TestList:
-        tosa_run_test(op, profile=TosaProfile.MI)
+        tosa_run_test(op, profile=TosaProfile.BI)
 
     # TODO: haven't added the quantized lowerings for BI, comment out for now
     # for op in TestList:

From 93cfdc260ba0a266ed136a36af31c6245edcc953 Mon Sep 17 00:00:00 2001
From: Rob Elliott <robert.elliott@arm.com>
Date: Thu, 28 Sep 2023 13:05:22 +0000
Subject: [PATCH 06/20] added executorch to the build targets

Signed-off-by: Rob Elliott <robert.elliott@arm.com>
---
 backends/arm/cmake/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/arm/cmake/build.sh b/backends/arm/cmake/build.sh
index 353b90125c6..fd02a4d9b55 100755
--- a/backends/arm/cmake/build.sh
+++ b/backends/arm/cmake/build.sh
@@ -46,4 +46,4 @@ cmake -DFLATC_EXECUTABLE=flatc \
 # -DCMAKE_TOOLCHAIN_FILE=backends/arm/cmake/arm-none-eabi-gcc.cmake \
 
 cd ..
-cmake --build cmake-corstone -j1 --target ethos_u ethosu_core_driver
+cmake --build cmake-corstone -j1 --target ethos_u ethosu_core_driver executorch

From a38f08012c1e10fa1ab29abd321a92c48a330eb9 Mon Sep 17 00:00:00 2001
From: Rob Elliott <robert.elliott@arm.com>
Date: Thu, 28 Sep 2023 17:22:49 +0000
Subject: [PATCH 07/20] Extended the delegate to read 'vela_bin_stream's

 * added a scratch block in the vela_bin to preallocate it
 * added a vela_bin reading routine into ArmBackendEthosU
 * set pointers passed to vela based on vela_bin

Signed-off-by: Rob Elliott <robert.elliott@arm.com>
---
 backends/arm/arm_backend.py               | 17 +++++
 backends/arm/cmake/build.sh               |  4 +-
 backends/arm/cmake/toolchain.sh           |  3 +-
 backends/arm/runtime/ArmBackendEthosU.cpp | 78 ++++++++++++++++++-----
 4 files changed, 83 insertions(+), 19 deletions(-)

diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
index 1ec28bdeb21..e1f07d0a266 100644
--- a/backends/arm/arm_backend.py
+++ b/backends/arm/arm_backend.py
@@ -178,6 +178,23 @@ def vela_compile(tosa_fb):
                 block = block_name + block_length + block_data
                 blocks = blocks + block
 
+            # Add a block for scratch, inputs and outputs
+            # scratch shape is a 1 element array giving us size in bytes
+            block_name = bytes("scratch_data","utf8")[:15]
+            block_name = block_name + b'\x00'*(16-len(block_name))
+            block_length = data["scratch_shape"][0].item()
+            print(f"scratch length = {block_length}")
+            block_length = block_length+(15-(block_length-1)%16)
+            block_data = b'\x00'*block_length
+            block_length = block_length.to_bytes(16, 'little')
+            print(f"lengths {len(block_name)} {len(block_length)} {len(block_data)}")
+            block = block_name + block_length + block_data
+            blocks = blocks + block
+            # TODO are these already in scratch shape? look to be
+            #input_shape * input_elem_size
+            #output_shape * output_elem_size
+            # input_offset and output_offset specify the location these arrays are written from base of scratch
+
         # return 16 byte VELA bin header + blocks + footer
         header = bytes("vela_bin_stream","utf-8") + b'\x00'
         footer = bytes("vela_end_stream","utf-8") + b'\x00'
diff --git a/backends/arm/cmake/build.sh b/backends/arm/cmake/build.sh
index fd02a4d9b55..d2bedd7769f 100755
--- a/backends/arm/cmake/build.sh
+++ b/backends/arm/cmake/build.sh
@@ -11,7 +11,10 @@ set -e
 BASEDIR=`realpath $(dirname "$0")`
 echo "building using build.sh in $BASEDIR"
 
-GCCPATH=${BASEDIR}/arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi/bin/
+# Host architecture for the toolchain directory name; `uname -m` rather than the non-portable `uname -i`, which can print "unknown".
+ARCH=$(uname -m)
+GCCPATH=${BASEDIR}/arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi/bin/
+
 echo $GCCPATH
 if test -d "${GCCPATH}"; then
 	echo Using exising compiler ${GCCPATH}
diff --git a/backends/arm/cmake/toolchain.sh b/backends/arm/cmake/toolchain.sh
index 7fd4abcc781..92188ee982d 100755
--- a/backends/arm/cmake/toolchain.sh
+++ b/backends/arm/cmake/toolchain.sh
@@ -6,6 +6,8 @@
 set -e
 
 # Cross compiler for Arm baremetal (e.g. Corestone-300 FVP or silcon)
-curl -o gcc.tar.xz https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi.tar.xz
+# Host architecture selects the toolchain archive; `uname -m` rather than the non-portable `uname -i`, which can print "unknown".
+ARCH=$(uname -m)
+curl -o gcc.tar.xz https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi.tar.xz
 tar xf gcc.tar.xz
-export PATH=${PATH}:`(cd arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi/bin/; pwd)`
+export PATH=${PATH}:`(cd arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi/bin/; pwd)`
diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp
index 186e3318f61..e7f791d2b9f 100644
--- a/backends/arm/runtime/ArmBackendEthosU.cpp
+++ b/backends/arm/runtime/ArmBackendEthosU.cpp
@@ -73,28 +73,24 @@ class ArmBackend final : public PyTorchBackendInterface {
 
 		printf("ArmBackend::execute 0x%X\n", processed->data());
 
-		// Command stream - we know at this point it's aligned
-		char *handle = (char*)processed->data();
-		int command_stream_length = ((int*)handle)[0];
-		char *command_stream = ethos_align(handle+sizeof(int));
-		
-		// Static tensors/weights/model data
-		handle = ethos_align( command_stream + command_stream_length );
-		int weight_data_length = ((int*)handle)[0];
-		char *weight_data = ethos_align(handle+sizeof(int));
-
-		// Activation data, input and output memory
-		handle = ethos_align( weight_data + weight_data_length );
-		int activation_data_length = ((int*)handle)[0];
-		char *activation_data = ethos_align(handle+sizeof(int));
+		vela_handles handles = { 0, 0, 0, 0, 0, 0};
 
+		// Command stream - we know at this point it's aligned
+		char *data = (char*)processed->data();
 
+		// Read key sections from the vela_bin_stream
+		this->vela_read( data, &handles );
+		
+		printf("Running program data:\n  cmd %p %d\n  weight %p %d\n  scratch %p %d\n",
+			   handles.cmd_data, handles.cmd_data_length,
+			   handles.weight_data, handles.weight_data_length,
+			   handles.scratch_data, handles.scratch_data_length );
 		// Invoke driver using the above pointers
 		CommandStream cs(
-			DataPointer(command_stream, command_stream_length),
+			DataPointer(handles.cmd_data, handles.cmd_data_length),
 			BasePointers({
-					DataPointer(weight_data, weight_data_length),
-					DataPointer(activation_data, activation_data_length)
+					DataPointer(handles.weight_data, handles.weight_data_length),
+					DataPointer(handles.scratch_data, handles.scratch_data_length)
 				}),
 			PmuEvents({ETHOSU_PMU_CYCLE, ETHOSU_PMU_NPU_IDLE, ETHOSU_PMU_NPU_ACTIVE})
 			);
@@ -118,6 +114,54 @@ class ArmBackend final : public PyTorchBackendInterface {
 		return;
 	}
 
+private:
+	typedef struct {
+		const char *cmd_data; int cmd_data_length;
+		const char *weight_data; int weight_data_length;
+		const char *scratch_data; int scratch_data_length;
+	} vela_handles;
+
+	// Parse a vela_bin_stream: 16-byte header, then blocks made of a 16-byte name, a
+	// 16-byte length field (read as a leading int) and a payload padded to 16 bytes,
+	// ended by a "vela_end_stream" footer. Returns 1 at the footer, 0 on bad header/length.
+	int vela_read(char* data, vela_handles *h ) const {
+		if( strncmp( data, "vela_bin_stream", 15 ) ) return 0;
+		data += 16; // step over the stream header to the first block name
+		while( 1 )
+		{
+			if( !strncmp( data, "vela_end_stream", 15 ) )
+			{
+				printf("footer found!\n");
+				return 1;
+			}
+			printf("reading block '%s':\n", data);
+			char *block_name = data;
+			int block_length = ((int*)(data + 16))[0];
+			if( block_length < 0 ) return 0; // corrupt length: refuse to walk backwards
+			int block_length_padded = block_length + (15-(block_length-1)%16);
+			printf("  length %d\n", block_length );
+			printf("  padded length %d\n", block_length_padded );
+			char *block_data = data + 32; // payload follows the name AND the length field
+			data = block_data + block_length_padded; // next block name (or footer)
+
+			if( !strncmp( block_name, "cmd_data", strlen("cmd_data")) ) {
+				printf("Capturing cmd_data\n");
+				h->cmd_data = block_data;
+				h->cmd_data_length = block_length;
+			}
+			if( !strncmp( block_name, "weight_data", strlen("weight_data")) ) {
+				printf("Capturing weight_data\n");
+				h->weight_data = block_data;
+				h->weight_data_length = block_length;
+			}
+			if( !strncmp( block_name, "scratch_data", strlen("scratch_data")) ) {
+				printf("Capturing scratch_data\n");
+				h->scratch_data = block_data;
+				h->scratch_data_length = block_length;
+			}
+		}
+	}
+
 };
 
 namespace {

From 8de6e9260d827c2b0effbae175601c668978f97b Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@meta.com>
Date: Thu, 28 Sep 2023 12:33:44 -0700
Subject: [PATCH 08/20] [ET][Portable] Add int types header

---
 kernels/portable/cpu/vec_ops.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernels/portable/cpu/vec_ops.h b/kernels/portable/cpu/vec_ops.h
index 0373196a4b6..5a297026050 100644
--- a/kernels/portable/cpu/vec_ops.h
+++ b/kernels/portable/cpu/vec_ops.h
@@ -13,6 +13,7 @@
 #include <cstring>
 #include <numeric>
 #include <type_traits>
+#include <cinttypes>
 
 /**
  * @file

From 644eafc9bb5d7a9e7bb53b3cb972a0efb637601a Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@meta.com>
Date: Thu, 28 Sep 2023 12:33:10 -0700
Subject: [PATCH 09/20] [Executorch] Simplify FunctionRef to make it more
 portable

Also helps with c++11 compliance if we have less c++20 code esp when we
don't use it.
---
 runtime/core/function_ref.h | 51 ++++---------------------------------
 1 file changed, 5 insertions(+), 46 deletions(-)

diff --git a/runtime/core/function_ref.h b/runtime/core/function_ref.h
index 92171134291..a07f6151f10 100644
--- a/runtime/core/function_ref.h
+++ b/runtime/core/function_ref.h
@@ -59,9 +59,7 @@ class FunctionRef;
 
 template <typename Ret, typename... Params>
 class FunctionRef<Ret(Params...)> {
-  Ret (*callback_)(const void* memory, Params... params) = nullptr;
   union Storage {
-    void* callable;
     Ret (*function)(Params...);
   } storage_;
 
@@ -70,57 +68,18 @@ class FunctionRef<Ret(Params...)> {
   explicit FunctionRef(std::nullptr_t) {}
 
   /**
-   * Case 1: A callable object passed by lvalue reference.
-   * Taking rvalue reference is error prone because the object will be always
-   * be destroyed immediately.
-   */
-  template <
-      typename Callable,
-      // This is not the copy-constructor.
-      typename std::enable_if<
-          !std::is_same<remove_cvref_t<Callable>, FunctionRef>::value,
-          int32_t>::type = 0,
-      // Avoid lvalue reference to non-capturing lambda.
-      typename std::enable_if<
-          !std::is_convertible<Callable, Ret (*)(Params...)>::value,
-          int32_t>::type = 0,
-      // Functor must be callable and return a suitable type.
-      // To make this container type safe, we need to ensure either:
-      // 1. The return type is void.
-      // 2. Or the resulting type from calling the callable is convertible to
-      // the declared return type.
-      typename std::enable_if<
-          std::is_void<Ret>::value ||
-              std::is_convertible<
-                  decltype(std::declval<Callable>()(std::declval<Params>()...)),
-                  Ret>::value,
-          int32_t>::type = 0>
-  explicit FunctionRef(Callable& callable)
-      : callback_([](const void* memory, Params... params) {
-          auto& storage = *static_cast<const Storage*>(memory);
-          auto& callable = *static_cast<Callable*>(storage.callable);
-          return static_cast<Ret>(callable(std::forward<Params>(params)...));
-        }) {
-    storage_.callable = &callable;
-  }
-
-  /**
-   * Case 2: A plain function pointer.
+   * Case 1: A plain function pointer.
    * Instead of storing an opaque pointer to underlying callable object,
    * store a function pointer directly.
    * Note that in the future a variant which coerces compatible function
    * pointers could be implemented by erasing the storage type.
    */
-  /* implicit */ FunctionRef(Ret (*ptr)(Params...))
-      : callback_([](const void* memory, Params... params) {
-          auto& storage = *static_cast<const Storage*>(memory);
-          return storage.function(std::forward<Params>(params)...);
-        }) {
+  /* implicit */ FunctionRef(Ret (*ptr)(Params...)) {
     storage_.function = ptr;
   }
 
   /**
-   * Case 3: Implicit conversion from lambda to FunctionRef.
+   * Case 2: Implicit conversion from lambda to FunctionRef.
    * A common use pattern is like:
    * void foo(FunctionRef<...>) {...}
    * foo([](...){...})
@@ -144,11 +103,11 @@ class FunctionRef<Ret(Params...)> {
       : FunctionRef(static_cast<Ret (*)(Params...)>(function)) {}
 
   Ret operator()(Params... params) const {
-    return callback_(&storage_, std::forward<Params>(params)...);
+    return storage_.function(std::forward<Params>(params)...);
   }
 
   explicit operator bool() const {
-    return callback_;
+    return storage_.function;
   }
 };
 

From b307c319a54e37f01dc5cc47e54906a5669b92d3 Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@meta.com>
Date: Thu, 28 Sep 2023 12:40:28 -0700
Subject: [PATCH 10/20] [WIP] headrify pte

---
 headrify.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 headrify.py

diff --git a/headrify.py b/headrify.py
new file mode 100644
index 00000000000..cdae780c31c
--- /dev/null
+++ b/headrify.py
@@ -0,0 +1,26 @@
+import binascii
+
+# Copied from the ml-embedded-evaluation-kit model generator:
+# https://git.mlplatform.org/ml/ethos-u/ml-embedded-evaluation-kit.git/tree/scripts/py/gen_model_cpp.py
+bytes_per_line = 32
+hex_digits_per_line = bytes_per_line * 2
+
+# Declaration prefix (section placement and alignment) for the generated C array.
+magic_attr = '__attribute__((section(".sram.data"), aligned(16))) char'
+# magic_attr = '__attribute__((section("network_model_sec"), aligned(16))) char'
+# magic_attr = '__attribute__((section("input_data_sec"), aligned(16))) char'
+filename="./add.pte"
+with open(filename, "rb") as fr, open(f"{filename}.h", "w") as fw:
+    data = fr.read()
+    hexstream = binascii.hexlify(data).decode('utf-8')
+    # Assemble the array definition as a list of fragments and join them once.
+    fragments = [magic_attr + ' add_pte[] = {']
+    for row_start in range(0, len(hexstream), hex_digits_per_line):
+        row = hexstream[row_start:row_start + hex_digits_per_line]
+        # Each output row is a newline followed by one "0xNN, " entry per byte.
+        fragments.append("\n" + "".join(f"0x{row[k:k+2]}, " for k in range(0, len(row), 2)))
+    fragments.append('};\n')
+    hexstring = "".join(fragments)
+    fw.write(hexstring)
+    print(f"Wrote {len(hexstring)} bytes, original {len(data)}")
+

From 52dc73cc8d075d1d52bb24bee3705cdd84b889f0 Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@meta.com>
Date: Thu, 28 Sep 2023 12:48:50 -0700
Subject: [PATCH 11/20] [NOT FOR LAND] Hacks for ET_LOG

---
 runtime/platform/target/Posix.cpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/runtime/platform/target/Posix.cpp b/runtime/platform/target/Posix.cpp
index bc0f1d9f312..9f53964278a 100644
--- a/runtime/platform/target/Posix.cpp
+++ b/runtime/platform/target/Posix.cpp
@@ -52,11 +52,9 @@
 #define _ASSERT_PAL_INITIALIZED()                                   \
   ({                                                                \
     if (!initialized) {                                             \
-      fprintf(                                                      \
-          ET_LOG_OUTPUT_FILE,                                       \
+      printf(                                                       \
           "ExecuTorch PAL must be initialized before call to %s()", \
           __ET_FUNCTION);                                           \
-      fflush(ET_LOG_OUTPUT_FILE);                                   \
       et_pal_abort();                                               \
     }                                                               \
   })
@@ -144,8 +142,7 @@ void et_pal_emit_log_message(
   //
   // Clients who want to change the format or add other fields can override this
   // weak implementation of et_pal_emit_log_message.
-  fprintf(
-      ET_LOG_OUTPUT_FILE,
+  printf(
       "%c %02u:%02u:%02u.%06lu executorch:%s:%zu] %s\n",
       level,
       hour,
@@ -155,5 +152,5 @@ void et_pal_emit_log_message(
       filename,
       line,
       message);
-  fflush(ET_LOG_OUTPUT_FILE);
+  // fflush(ET_LOG_OUTPUT_FILE);
 }

From b2a431fc3c200ff0aa531fe5b7fd69a3cdf836b0 Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@meta.com>
Date: Thu, 28 Sep 2023 12:32:02 -0700
Subject: [PATCH 12/20] [NOT FOR LAND] Hack op_add to reduce size

---
 kernels/portable/cpu/op_add.cpp | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp
index 1da9d0eaee5..1ec18b3775d 100644
--- a/kernels/portable/cpu/op_add.cpp
+++ b/kernels/portable/cpu/op_add.cpp
@@ -33,10 +33,15 @@ Tensor& add_out(
 
   ET_CHECK(canCast(common_type, out_type));
 
-  ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "add", CTYPE_A, [&]() {
-    ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "add", CTYPE_B, [&]() {
-      ET_SWITCH_REAL_TYPES_AND(Bool, common_type, ctx, "add", CTYPE_IN, [&]() {
-        ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "add", CTYPE_OUT, [&]() {
+//  ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "add", CTYPE_A, [&]() {
+//    ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "add", CTYPE_B, [&]() {
+//      ET_SWITCH_REAL_TYPES_AND(Bool, common_type, ctx, "add", CTYPE_IN, [&]() {
+//        ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "add", CTYPE_OUT, [&]() {
+
+          using CTYPE_A = float;
+          using CTYPE_B = float;
+          using CTYPE_IN = float;
+          using CTYPE_OUT = float;
           CTYPE_IN alpha_val;
           ET_EXTRACT_SCALAR(alpha, alpha_val);
 
@@ -51,10 +56,10 @@ Tensor& add_out(
               a,
               b,
               out);
-        });
-      });
-    });
-  });
+//        });
+//      });
+//    });
+//  });
 
   return out;
 }

From 77e8eb0ca970baa8f930009b8e6de5dada962ee6 Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@meta.com>
Date: Thu, 28 Sep 2023 12:38:14 -0700
Subject: [PATCH 13/20] [NOT FOR LAND] Hacks for add minimal example

---
 examples/export/export_example.py  |   3 +
 examples/models/toy_model/model.py |   3 -
 kernels/portable/functions.yaml    | 695 -----------------------------
 3 files changed, 3 insertions(+), 698 deletions(-)

diff --git a/examples/export/export_example.py b/examples/export/export_example.py
index 9c2a9d9362e..e26d929aeac 100644
--- a/examples/export/export_example.py
+++ b/examples/export/export_example.py
@@ -12,6 +12,7 @@
 from ..models import MODEL_NAME_TO_MODEL
 from ..models.model_factory import EagerModelFactory
 from .utils import export_to_exec_prog, save_pte_program
+from executorch.exir.print_program import pretty_print, print_program  # noqa
 
 
 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
@@ -40,4 +41,6 @@
     )
 
     prog = export_to_exec_prog(model, example_inputs)
+
+    pretty_print(prog.program.execution_plan)
     save_pte_program(prog.buffer, args.model_name)
diff --git a/examples/models/toy_model/model.py b/examples/models/toy_model/model.py
index 0f7131fe21c..1c8f9f3b590 100644
--- a/examples/models/toy_model/model.py
+++ b/examples/models/toy_model/model.py
@@ -45,9 +45,6 @@ def __init__(self):
 
     def forward(self, x, y):
         z = x + y
-        z = z + x
-        z = z + x
-        z = z + z
         return z
 
     def get_eager_model(self) -> torch.nn.Module:
diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml
index 949b771b9cc..6e31dbe4939 100644
--- a/kernels/portable/functions.yaml
+++ b/kernels/portable/functions.yaml
@@ -17,702 +17,7 @@
 # See the README.md file in this directory for a description of the syntax used
 # by this file.
 
-- op: _log_softmax.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::log_softmax_out
-
-- op: _native_batch_norm_legit_no_training.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::_native_batch_norm_legit_no_training_out
-
-- op: _softmax.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::softmax_out
-
-- op: _to_copy.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::to_copy_out
-
-- op: abs.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::abs_out
-
-- op: acos.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::acos_out
-
-- op: acosh.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::acosh_out
-
 - op: add.out
   kernels:
     - arg_meta: null
       kernel_name: torch::executor::add_out
-
-- op: add.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::add_scalar_out
-
-- op: addmm.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::addmm_out
-
-- op: alias_copy.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::alias_copy_out
-
-- op: amax.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::amax_out
-
-- op: amin.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::amin_out
-
-- op: any.all_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::any_all_out
-
-- op: arange.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::arange_out
-
-- op: arange.start_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::arange_start_out
-
-- op: argmax.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::argmax_out
-
-- op: argmin.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::argmin_out
-
-- op: as_strided_copy.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::as_strided_copy_out
-
-- op: asin.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::asin_out
-
-- op: asinh.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::asinh_out
-
-- op: atan.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::atan_out
-
-- op: atanh.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::atanh_out
-
-- op: avg_pool2d.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::avg_pool2d_out
-
-- op: bitwise_and.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::bitwise_and_Scalar_out
-
-- op: bitwise_and.Tensor_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::bitwise_and_Tensor_out
-
-- op: bitwise_not.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::bitwise_not_out
-
-- op: bitwise_or.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::bitwise_or_Scalar_out
-
-- op: bitwise_or.Tensor_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::bitwise_or_Tensor_out
-
-- op: bitwise_xor.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::bitwise_xor_Scalar_out
-
-- op: bitwise_xor.Tensor_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::bitwise_xor_Tensor_out
-
-- op: bmm.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::bmm_out
-
-- op: cat.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::cat_out
-
-- op: ceil.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::ceil_out
-
-- op: clamp.out
-  cpp_no_default_args: ['min']
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::clamp_out
-
-- op: clone.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::clone_out
-
-- op: constant_pad_nd.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::constant_pad_nd_out
-
-- op: convolution.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::convolution_out
-
-- op: copy.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::copy_out
-
-- op: cos.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::cos_out
-
-- op: cosh.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::cosh_out
-
-- op: cumsum.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::cumsum_out
-
-- op: detach_copy.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::detach_copy_out
-
-- op: div.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::div_out
-
-- op: div.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::div_scalar_out
-
-- op: div.out_mode
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::div_out_mode
-
-
-- op: embedding.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::embedding_out
-
-- op: empty.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::empty_out
-
-- op: eq.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::eq_scalar_out
-
-- op: erf.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::erf_out
-
-- op: exp.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::exp_out
-
-- op: expand_copy.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::expand_copy_out
-
-- op: fill.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::fill_scalar_out
-
-- op: fill.Tensor_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::fill_tensor_out
-
-- op: floor.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::floor_out
-
-- op: floor_divide.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::floor_divide_out
-
-- op: fmod.Tensor_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::fmod_Tensor_out
-
-- op: fmod.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::fmod_Scalar_out
-
-- op: full.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::full_out
-
-# TODO: Investigate why empty dispatch is required for building:
-# buck2 build //executorch/kernels/portable:generated_lib
-- op: full_like.out
-  dispatch: {}
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::full_like_out
-
-- op: ge.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::ge_scalar_out
-
-- op: ge.Tensor_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::ge_tensor_out
-
-- op: gelu.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::gelu_out
-
-- op: glu.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::glu_out
-
-- op: gt.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::gt_scalar_out
-
-- op: gt.Tensor_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::gt_tensor_out
-
-- op: hardtanh.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::hardtanh_out
-
-- op: index.Tensor_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::index_Tensor_out
-
-- op: index_put.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::index_put_out
-
-- op: index_select.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::index_select_out
-
-- op: isinf.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::isinf_out
-
-- op: isnan.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::isnan_out
-
-- op: le.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::le_scalar_out
-
-- op: le.Tensor_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::le_tensor_out
-
-- op: leaky_relu.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::leaky_relu_out
-
-- op: lift_fresh_copy.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::lift_fresh_copy_out
-
-- op: log.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::log_out
-
-- op: logical_and.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::logical_and_out
-
-- op: logical_not.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::logical_not_out
-
-- op: logical_or.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::logical_or_out
-
-- op: logical_xor.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::logical_xor_out
-
-- op: logit.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::logit_out
-
-- op: lt.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::lt_scalar_out
-
-- op: lt.Tensor_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::lt_tensor_out
-
-- op: masked_fill.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::masked_fill_scalar_out
-
-- op: max.dim_max
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::max_out
-
-- op: max_pool2d_with_indices.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::max_pool2d_with_indices_out
-
-- op: mean.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::mean_dim_out
-
-- op: min.dim_min
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::min_out
-
-- op: minimum.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::minimum_out
-
-- op: mm.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::mm_out
-
-- op: mul.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::mul_out
-
-- op: mul.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::mul_scalar_out
-
-- op: native_layer_norm.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::native_layer_norm_out
-
-- op: ne.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::ne_scalar_out
-
-- op: ne.Tensor_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::ne_tensor_out
-
-- op: neg.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::neg_out
-
-- op: nonzero.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::nonzero_out
-
-- op: ones.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::ones_out
-
-- op: permute_copy.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::permute_copy_out
-
-- op: pixel_shuffle.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::pixel_shuffle_out
-
-- op: pow.Tensor_Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::pow_Tensor_Scalar_out
-
-- op: pow.Tensor_Tensor_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::pow_Tensor_Tensor_out
-
-- op: reciprocal.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::reciprocal_out
-
-- op: relu.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::relu_out
-
-- op: remainder.Tensor_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::remainder_Tensor_out
-
-- op: remainder.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::remainder_Scalar_out
-
-- op: repeat.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::repeat_out
-
-- op: round.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::round_out
-
-- op: rsqrt.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::rsqrt_out
-
-- op: rsub.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::rsub_scalar_out
-
-- op: scalar_tensor.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::scalar_tensor_out
-
-- op: scatter_add.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::scatter_add_out
-
-- op: select_copy.int_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::select_copy_int_out
-
-- op: select_scatter.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::select_scatter_out
-
-- op: sigmoid.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::sigmoid_out
-
-- op: sign.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::sign_out
-
-- op: sin.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::sin_out
-
-- op: sinh.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::sinh_out
-
-- op: slice_copy.Tensor_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::slice_copy_Tensor_out
-
-- op: slice_scatter.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::slice_scatter_out
-
-- op: split_copy.Tensor_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::split_copy_Tensor_out
-
-- op: sqrt.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::sqrt_out
-
-- op: squeeze_copy.dim_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::squeeze_copy_dim_out
-
-- op: stack.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::stack_out
-
-- op: sub.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::sub_out
-
-- op: sub.Scalar_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::sub_scalar_out
-
-- op: sum.IntList_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::sum_dim_out
-
-- op: t_copy.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::t_copy_out
-
-- op: tan.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::tan_out
-
-- op: tanh.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::tanh_out
-
-- op: transpose_copy.int_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::transpose_copy_int_out
-
-- op: tril.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::tril_out
-
-- op: unbind_copy.int_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::unbind_copy_int_out
-
-- op: unsqueeze_copy.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::unsqueeze_copy_out
-
-- op: var.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::var_out
-
-- op: view_copy.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::view_copy_out
-
-- op: where.self_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::where_out
-
-- op: zeros.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::zeros_out

From 9b244b31afdabcddd7e5e9c2b767d7dc159504b3 Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@meta.com>
Date: Thu, 28 Sep 2023 13:08:41 -0700
Subject: [PATCH 14/20] [NOT FOR LAND] HACK for manual kernel registration

---
 codegen/templates/RegisterCodegenUnboxedKernels.cpp | 13 ++++++++-----
 manual.h                                            |  5 +++++
 2 files changed, 13 insertions(+), 5 deletions(-)
 create mode 100644 manual.h

diff --git a/codegen/templates/RegisterCodegenUnboxedKernels.cpp b/codegen/templates/RegisterCodegenUnboxedKernels.cpp
index a7790be7fed..86938d065b8 100644
--- a/codegen/templates/RegisterCodegenUnboxedKernels.cpp
+++ b/codegen/templates/RegisterCodegenUnboxedKernels.cpp
@@ -11,6 +11,8 @@
 #include <executorch/runtime/kernel/operator_registry.h>
 #include <executorch/runtime/platform/profiler.h>
 #include "${fn_header}" // Generated Function import headers
+#include <executorch/manual.h>
+
 // ${generated_comment}
 
 // NOTE [Sharded File]: This file is generated in a sharded fashion to speed up
@@ -24,8 +26,6 @@
 using KernelArrayRef = ::torch::executor::ArrayRef<::torch::executor::Kernel>;
 namespace torch {
 namespace executor {
-namespace function {
-namespace {
 
 static Kernel kernels_to_register[] = {
     ${unboxed_kernels} // Generated kernels
@@ -39,8 +39,11 @@ static KernelArrayRef kernel_array_ref(
 
 // Return value not used. Keep the static variable assignment to register
 // kernels in static initialization time.
-static auto success_with_kernel_reg = register_kernels(kernel_array_ref);
-} // namespace
-} // namespace function
+// static auto success_with_kernel_reg = register_kernels(kernel_array_ref);
+
+void manual_override() {
+    static auto success_with_kernel_reg = register_kernels(kernel_array_ref);
+}
+
 } // namespace executor
 } // namespace torch
diff --git a/manual.h b/manual.h
new file mode 100644
index 00000000000..eaee9a15407
--- /dev/null
+++ b/manual.h
@@ -0,0 +1,5 @@
+namespace torch {
+namespace executor {
+    void manual_override();
+    void digant_add_out(torch::executor::KernelRuntimeContext & context, EValue** stack);
+}}

From 1cabc63d8ce1e339dcd61bb5919386772b7e9477 Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@meta.com>
Date: Thu, 28 Sep 2023 22:32:47 -0700
Subject: [PATCH 15/20] [NOT FOR LAND] Allow enabling logging in Release mode

---
 CMakeLists.txt | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0c384800b52..1678bc2d8a3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -59,8 +59,18 @@ endif()
 # - targets in the current directory, before and after this command is invoked
 # - targets in sub-directories added after this command is invoked
 if(CMAKE_BUILD_TYPE STREQUAL "Release")
+  # Logging is compiled out of Release builds by default; this option lets a
+  # Release build opt back in (at the cost of pulling in the log strings).
+  option(EXECUTORCH_ENABLE_LOGGING_RELEASE_MODE
+    "Enable logging in release mode" OFF)
+
+  set(_ET_LOG_ENABLE 0)
+  if(EXECUTORCH_ENABLE_LOGGING_RELEASE_MODE)
+    set(_ET_LOG_ENABLE 1)
+  endif()
+
   # Avoid pulling in the logging strings, which can be large.
-  add_definitions(-DET_LOG_ENABLED=0)
+  add_definitions(-DET_LOG_ENABLED=${_ET_LOG_ENABLE})
   # Avoid pulling in the flatbuffer data verification
   # logic, which can add about 20kB.
   add_definitions(-DET_ENABLE_PROGRAM_VERIFICATION=0)

From 2f10ee3ebe522ec7a8e8b76a2135502dfd0d3031 Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@meta.com>
Date: Thu, 28 Sep 2023 20:24:49 -0700
Subject: [PATCH 16/20] [NOT FOR LAND][arm] setup for core_platform

---
 ...regress-cmake-version-from-3.21-3.20.patch |  25 ++
 ...disable-warnings-to-reduce-verbosity.patch |  52 ++++
 ...0003-HACK-Add-Executorch-add-example.patch | 224 ++++++++++++++++++
 examples/arm/cs300/setup.sh                   |  28 +++
 4 files changed, 329 insertions(+)
 create mode 100644 examples/arm/cs300/core_platform/patches/0001-HACK-regress-cmake-version-from-3.21-3.20.patch
 create mode 100644 examples/arm/cs300/core_platform/patches/0002-HACK-disable-warnings-to-reduce-verbosity.patch
 create mode 100644 examples/arm/cs300/core_platform/patches/0003-HACK-Add-Executorch-add-example.patch
 create mode 100755 examples/arm/cs300/setup.sh

diff --git a/examples/arm/cs300/core_platform/patches/0001-HACK-regress-cmake-version-from-3.21-3.20.patch b/examples/arm/cs300/core_platform/patches/0001-HACK-regress-cmake-version-from-3.21-3.20.patch
new file mode 100644
index 00000000000..efb02478229
--- /dev/null
+++ b/examples/arm/cs300/core_platform/patches/0001-HACK-regress-cmake-version-from-3.21-3.20.patch
@@ -0,0 +1,25 @@
+From a969839b90756b2458cb80ac5edb619e87210bea Mon Sep 17 00:00:00 2001
+From: Digant Desai <digantdesai@meta.com>
+Date: Thu, 28 Sep 2023 18:05:03 -0700
+Subject: [PATCH 1/3] [HACK] regress cmake version from 3.21 --> 3.20
+
+---
+ targets/corstone-300/CMakeLists.txt | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/targets/corstone-300/CMakeLists.txt b/targets/corstone-300/CMakeLists.txt
+index 62205bb..7dda8a1 100644
+--- a/targets/corstone-300/CMakeLists.txt
++++ b/targets/corstone-300/CMakeLists.txt
+@@ -42,7 +42,7 @@ set(MEMORY_ARENA "dram" CACHE STRING "Memory config for arena")
+ # Project
+ #############################################################################
+ 
+-cmake_minimum_required(VERSION 3.21)
++cmake_minimum_required(VERSION 3.20)
+ 
+ project(ethos-u-corstone-300 VERSION 0.0.1)
+ 
+-- 
+2.39.3
+
diff --git a/examples/arm/cs300/core_platform/patches/0002-HACK-disable-warnings-to-reduce-verbosity.patch b/examples/arm/cs300/core_platform/patches/0002-HACK-disable-warnings-to-reduce-verbosity.patch
new file mode 100644
index 00000000000..f2a6e17ccd8
--- /dev/null
+++ b/examples/arm/cs300/core_platform/patches/0002-HACK-disable-warnings-to-reduce-verbosity.patch
@@ -0,0 +1,52 @@
+From 3687c49c2ca85ca8a7d554b1206272870c565de3 Mon Sep 17 00:00:00 2001
+From: Digant Desai <digantdesai@meta.com>
+Date: Thu, 28 Sep 2023 18:05:30 -0700
+Subject: [PATCH 2/3] [HACK] disable warnings to reduce verbosity
+
+---
+ cmake/toolchain/arm-none-eabi-gcc.cmake | 28 ++++++++++++-------------
+ 1 file changed, 14 insertions(+), 14 deletions(-)
+
+diff --git a/cmake/toolchain/arm-none-eabi-gcc.cmake b/cmake/toolchain/arm-none-eabi-gcc.cmake
+index 093005e..0e6a2ed 100644
+--- a/cmake/toolchain/arm-none-eabi-gcc.cmake
++++ b/cmake/toolchain/arm-none-eabi-gcc.cmake
+@@ -85,21 +85,21 @@ add_link_options(LINKER:--nmagic,--gc-sections)
+ 
+ # Compilation warnings
+ add_compile_options(
+-    -Wall
+-    -Wextra
++    # -Wall
++    # -Wextra
+ 
+-    -Wcast-align
+-    -Wdouble-promotion
+-    -Wformat
+-    -Wmissing-field-initializers
+-    -Wnull-dereference
+-    -Wredundant-decls
+-    -Wshadow
+-    -Wswitch
+-    -Wswitch-default
+-    -Wunused
++    # -Wcast-align
++    # -Wdouble-promotion
++    # -Wformat
++    # -Wmissing-field-initializers
++    # -Wnull-dereference
++    # -Wredundant-decls
++    # -Wshadow
++    # -Wswitch
++    # -Wswitch-default
++    # -Wunused
+ 
+-    -Wno-redundant-decls
++    # -Wno-redundant-decls
+ 
+-    -Wno-psabi
++    # -Wno-psabi
+ )
+-- 
+2.39.3
+
diff --git a/examples/arm/cs300/core_platform/patches/0003-HACK-Add-Executorch-add-example.patch b/examples/arm/cs300/core_platform/patches/0003-HACK-Add-Executorch-add-example.patch
new file mode 100644
index 00000000000..9a0b0be554e
--- /dev/null
+++ b/examples/arm/cs300/core_platform/patches/0003-HACK-Add-Executorch-add-example.patch
@@ -0,0 +1,224 @@
+From b5369c873814d765276a746ce26d2be5724da8f8 Mon Sep 17 00:00:00 2001
+From: Digant Desai <digantdesai@meta.com>
+Date: Thu, 28 Sep 2023 19:07:51 -0700
+Subject: [PATCH 3/3] [HACK] Add Executorch add example
+
+---
+ applications/CMakeLists.txt                  |   2 +
+ applications/executorch_tests/CMakeLists.txt |  53 ++++++++
+ applications/executorch_tests/add.cpp        | 130 +++++++++++++++++++
+ 3 files changed, 185 insertions(+)
+ create mode 100644 applications/executorch_tests/CMakeLists.txt
+ create mode 100644 applications/executorch_tests/add.cpp
+
+diff --git a/applications/CMakeLists.txt b/applications/CMakeLists.txt
+index 1fa2b2e..68e5427 100644
+--- a/applications/CMakeLists.txt
++++ b/applications/CMakeLists.txt
+@@ -28,6 +28,8 @@ add_subdirectory(threadx_demo)
+ 
+ add_subdirectory(message_handler_openamp)
+ 
++add_subdirectory(executorch_tests)
++
+ if (CMAKE_CXX_COMPILER_ID STREQUAL "ARMClang")
+     # Only armclang supported for now
+     add_subdirectory(trustzone_inference)
+diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt
+new file mode 100644
+index 0000000..8a34c44
+--- /dev/null
++++ b/applications/executorch_tests/CMakeLists.txt
+@@ -0,0 +1,53 @@
++#
++# Copyright (c) 2021 Arm Limited. All rights reserved.
++#
++# SPDX-License-Identifier: Apache-2.0
++#
++# Licensed under the Apache License, Version 2.0 (the License); you may
++# not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++# www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an AS IS BASIS, WITHOUT
++# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++#
++
++if (NOT TARGET ethosu_core_driver)
++  return()
++endif()
++
++####
++#### Executorch demo app/test
++####
++
++set(ET_DIR_PATH "<..>/executorch" CACHE PATH "Path to Executorch dir")
++set(ET_BUILD_DIR_PATH "${ET_DIR_PATH}/cmake-out" CACHE PATH "Path to Executorch build dir")
++set(ET_INCLUDE_PATH "${ET_DIR_PATH}/.." CACHE PATH "Path to Executorch headers")
++
++get_filename_component(ET_BUILD_DIR_PATH ${ET_BUILD_DIR_PATH} REALPATH)
++get_filename_component(ET_DIR_PATH ${ET_DIR_PATH} REALPATH)
++get_filename_component(ET_INCLUDE_PATH ${ET_INCLUDE_PATH} REALPATH)
++
++message("**********************")
++message("Executorch dir      (ET_DIR_PATH)      : ${ET_DIR_PATH}")
++message("Executorch build dir(ET_BUILD_DIR_PATH): ${ET_BUILD_DIR_PATH}")
++message("Executorch headers  (ET_INCLUDE_PATH)  : ${ET_INCLUDE_PATH}")
++message("**********************")
++
++set(LIB_ET_RUNTIME "${ET_BUILD_DIR_PATH}/libexecutorch.a")
++set(LIB_ET_OP_REGISTRATION "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_ops_lib.a")
++set(LIB_ET_OP_KERNELS "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_kernels.a")
++
++ethosu_add_executable_test(executorch_add PRIVATE
++    SOURCES add.cpp
++    LIBRARIES ${LIB_ET_RUNTIME} ${LIB_ET_OP_REGISTRATION}
++    ${LIB_ET_OP_KERNELS})
++
++target_include_directories(executorch_add PRIVATE
++${ET_INCLUDE_PATH})
++
++# TODO Memory setup
+diff --git a/applications/executorch_tests/add.cpp b/applications/executorch_tests/add.cpp
+new file mode 100644
+index 0000000..115af66
+--- /dev/null
++++ b/applications/executorch_tests/add.cpp
+@@ -0,0 +1,130 @@
++/*
++ * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
++ *
++ * SPDX-License-Identifier: Apache-2.0
++ *
++ * Licensed under the Apache License, Version 2.0 (the License); you may
++ * not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
++ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ */
++
++/****************************************************************************
++ * Includes
++ ****************************************************************************/
++
++#include <stdio.h>
++
++#include <executorch/runtime/platform/runtime.h>
++#include <executorch/runtime/executor/program.h>
++#include <executorch/extension/data_loader/buffer_data_loader.h>
++#include <executorch/runtime/platform/log.h>
++#include <executorch/util/util.h>
++
++#include <executorch/add.pte.h>
++
++#include <executorch/manual.h>
++
++#include <vector>
++#include <memory>
++
++using namespace std;
++
++__attribute__((section(".sram.data"), aligned(16))) uint8_t method_allocator_pool[4 * 1024U];
++
++/****************************************************************************
++ * Functions
++ ****************************************************************************/
++
++int main() {
++     /*
++      * This is a simple Executorch app which runs `add.pte`.
++      */
++
++    torch::executor::runtime_init();
++
++    torch::executor::manual_override(); // Hack: This will be updated soon.
++
++    using torch::executor::Result;
++    using torch::executor::Error;
++
++    auto loader = torch::executor::util::BufferDataLoader(add_pte, sizeof(add_pte));
++
++    Result<torch::executor::Program> program = torch::executor::Program::load(&loader);
++    if(!program.ok()) {
++       ET_LOG(Info,"ET: Program loading failed @ 0x%p: 0x%" PRIx32, add_pte, program.error());
++    }
++
++    ET_LOG(Info,"ET: Model buffer loaded, has %lu methods", program->num_methods());
++
++    const char* method_name = nullptr;
++    {
++      const auto method_name_result = program->get_method_name(0);
++      ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
++      method_name = *method_name_result;
++    }
++    ET_LOG(Info,"ET: Running method %s", method_name);
++
++    Result<torch::executor::MethodMeta> method_meta = program->method_meta(method_name);
++    if (!method_meta.ok()) {
++        ET_LOG(Info,"ET: Failed to get method_meta for %s: 0x%x",
++                method_name, (unsigned int)method_meta.error());
++    }
++
++    torch::executor::MemoryAllocator method_allocator{
++        torch::executor::MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)};
++
++    std::vector<std::unique_ptr<uint8_t[]>> planned_buffers; // Owns the memory
++    std::vector<torch::executor::Span<uint8_t>> planned_spans; // Passed to the allocator
++    size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();
++
++    for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
++        size_t buffer_size = static_cast<size_t>(method_meta->memory_planned_buffer_size(id).get());
++        ET_LOG(Info,"ET: Setting up planned buffer %zu, size %zu.", id, buffer_size);
++
++        planned_buffers.push_back(std::make_unique<uint8_t[]>(buffer_size));
++        planned_spans.push_back({planned_buffers.back().get(), buffer_size});
++    }
++
++    torch::executor::HierarchicalAllocator planned_memory(
++      {planned_spans.data(), planned_spans.size()});
++
++    torch::executor::MemoryManager memory_manager(&method_allocator, &planned_memory);
++
++    Result<torch::executor::Method> method = program->load_method(method_name, &memory_manager);
++    if(!method.ok()) {
++        ET_LOG(Info,"ET: Loading of method %s failed with status 0x%" PRIx32, method_name, method.error());
++    }
++    ET_LOG(Info,"ET: Method loaded.");
++
++    ET_LOG(Info,"ET: Preparing inputs...");
++    auto inputs = torch::executor::util::PrepareInputTensors(*method);
++    ET_LOG(Info,"ET: Input prepared.");
++
++    ET_LOG(Info,"ET: Starting the model execution...");
++    Error status = method->execute();
++    if(status != Error::Ok){
++        ET_LOG(Info,"ET: Execution of method %s failed with status 0x%" PRIx32, method_name, status);
++    } else {
++        ET_LOG(Info,"ET: Model executed successfully.");
++    }
++
++    // Print the outputs.
++    std::vector<torch::executor::EValue> outputs(method->outputs_size());
++    ET_LOG(Info, "%zu outputs: ", outputs.size());
++    status = method->get_outputs(outputs.data(), outputs.size());
++    ET_CHECK(status == Error::Ok);
++    for (int i = 0; i < outputs.size(); ++i) {
++       for (int j = 0; j < outputs[i].toTensor().numel(); ++j) {
++          printf("Output[%d][%d]: %f\n", i, j, outputs[i].toTensor().const_data_ptr<float>()[j]);
++       }
++    }
++    return 0;
++}
+-- 
+2.39.3
+
diff --git a/examples/arm/cs300/setup.sh b/examples/arm/cs300/setup.sh
new file mode 100755
index 00000000000..63fbd36b3bc
--- /dev/null
+++ b/examples/arm/cs300/setup.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+set -eu
+# Resets the Ethos-U checkouts to pinned revisions and applies local patches.
+ethos_u_dir=${1:-/tmp/ethos-u}
+script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+# Reads globals: name (repo sub-directory) and base_rev (revision to reset to).
+function patch_repo() {
+    echo -e "\nPreparing ${name}..."
+    cd "${ethos_u_dir}/${name}"
+    # Discards local changes and earlier patch applications in the checkout.
+    git reset --hard "${base_rev}"
+    # Patches are optional: skip 'git am' when the directory is absent or empty.
+    patch_dir="${script_dir}/${name}/patches"
+    [[ -d "${patch_dir}" && -n "$(ls -A "${patch_dir}")" ]] && \
+        git am -3 "${patch_dir}"/*.patch
+
+    echo -e "Patched ${name} @ $(git describe --all --long 2> /dev/null) in ${ethos_u_dir}/${name} dir.\n"
+}
+
+name="core_platform"
+base_rev=204210b1074071532627da9dc69950d058a809f4
+patch_repo
+
+name="core_software"
+base_rev=74c514a5b50a19197a64a86095bc0429188adcbe
+patch_repo
+
+exit $?

From 298fb222438984c59265f75975d7bb345f7eb0c3 Mon Sep 17 00:00:00 2001
From: Rob Elliott <robert.elliott@arm.com>
Date: Fri, 29 Sep 2023 13:49:52 +0000
Subject: [PATCH 17/20] Fixes to make simple_add run on hardware

 * fix to encode and decode of vela_bin_stream block sizes
 * hardcoded input/output population to check operation behaviour
 * use manual.h to init and register backend

Signed-off-by: Rob Elliott <robert.elliott@arm.com>
---
 backends/arm/arm_backend.py               |  2 +-
 backends/arm/cmake/build.sh               |  6 ++-
 backends/arm/runtime/ArmBackendEthosU.cpp | 51 +++++++++++++++++------
 manual.h                                  |  1 +
 4 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
index e1f07d0a266..82b24f4b9b6 100644
--- a/backends/arm/arm_backend.py
+++ b/backends/arm/arm_backend.py
@@ -173,7 +173,7 @@ def vela_compile(tosa_fb):
                 # We need the acual unpadded block lengths for hw setup
                 block_length = len(block_data).to_bytes(16, 'little')
                 # pad block data to multiple of 16 bytes
-                block_data = block_data + b'\x00'*(16-len(block_data)%16)
+                block_data = block_data + b'\x00'*(15-(len(block_data)-1)%16)
 
                 block = block_name + block_length + block_data
                 blocks = blocks + block
diff --git a/backends/arm/cmake/build.sh b/backends/arm/cmake/build.sh
index d2bedd7769f..0dbb8cf2177 100755
--- a/backends/arm/cmake/build.sh
+++ b/backends/arm/cmake/build.sh
@@ -39,13 +39,15 @@ cd cmake-corstone
 
 #cmake --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake ..
 cmake -DFLATC_EXECUTABLE=flatc \
+	  -DEXECUTORCH_BUILD_XNNPACK=OFF \
 	  -DEXECUTORCH_BUILD_HOST_TARGETS=OFF \
 	  -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \
 	  -DCMAKE_SYSTEM_PROCESSOR=cortex-m55+nodsp+nofp \
 	  -DETHOSU_TARGET_NPU_CONFIG=ethos-u55-128 \
 	  --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake \
+	  -DCMAKE_BUILD_TYPE=Release \
+	  -DEXECUTORCH_ENABLE_LOGGING_RELEASE_MODE=ON \
 	  ..
-# -DCMAKE_TOOLCHAIN_FILE=backends/arm/cmake/arm-none-eabi-gcc.cmake \
 
 cd ..
-cmake --build cmake-corstone -j1 --target ethos_u ethosu_core_driver executorch
+cmake --build cmake-corstone -j9 --target ethos_u ethosu_core_driver executorch portable_ops_lib portable_kernels
diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp
index e7f791d2b9f..c0c5db7df0a 100644
--- a/backends/arm/runtime/ArmBackendEthosU.cpp
+++ b/backends/arm/runtime/ArmBackendEthosU.cpp
@@ -34,6 +34,10 @@ namespace executor {
 class ArmBackend final : public PyTorchBackendInterface {
 
 public:
+	ArmBackend() {
+		printf("Constructing ARM Backend\n");
+	}
+	
 	~ArmBackend() = default;
 
 	virtual bool is_available() const override {
@@ -73,18 +77,29 @@ class ArmBackend final : public PyTorchBackendInterface {
 
 		printf("ArmBackend::execute 0x%X\n", processed->data());
 
-		vela_handles handles = { 0, 0, 0, 0, 0, 0};
+		vela_handles handles = { 0, 0, 0, 0, 0, 0 };
 
 		// Command stream - we know at this point it's aligned
 		char *data = (char*)processed->data();
 
 		// Read key sections from the vela_bin_stream
 		this->vela_read( data, &handles );
-		
+
 		printf("Running program data:\n  cmd %p %d\n  weight %p %d\n  scratch %p %d\n",
 			   handles.cmd_data, handles.cmd_data_length,
 			   handles.weight_data, handles.weight_data_length,
 			   handles.scratch_data, handles.scratch_data_length );
+
+		// TMP emit scratch
+		printf("Scratch before:\n");
+		for( int i=0; i<handles.scratch_data_length; i++ )
+		{
+			if( i%4 == 0 ) ((char*)handles.scratch_data)[i] = 1;
+			printf("%02x ", ((char*)handles.scratch_data)[i]);
+			if( !((i+1)%4) ) printf("\n");
+		}
+		printf("\n");
+		
 		// Invoke driver using the above pointers
 		CommandStream cs(
 			DataPointer(handles.cmd_data, handles.cmd_data_length),
@@ -106,7 +121,16 @@ class ArmBackend final : public PyTorchBackendInterface {
 			printf("Error, failure executing job\n");
 			return Error::InvalidProgram;
 		}
-	
+
+        // TMP emit scratch
+        printf("Scratch after:\n");
+        for( int i=0; i<handles.scratch_data_length; i++ )
+        {
+            printf("%02x ", ((char*)handles.scratch_data)[i]);
+            if( !((i+1)%4) ) printf("\n");
+        }
+        printf("\n");
+		
 		return Error::Ok;
 	}
 
@@ -123,9 +147,9 @@ class ArmBackend final : public PyTorchBackendInterface {
 
 	int vela_read(char* data, vela_handles *h ) const {
 		if( strncmp( data, "vela_bin_stream", 15 ) ) return 0;
+		data += 16;
 		while( 1 )
 		{
-			data += 16;
 			if( !strncmp( data, "vela_end_stream", 15 ) )
 			{
 				printf("footer found!\n");
@@ -135,15 +159,17 @@ class ArmBackend final : public PyTorchBackendInterface {
 			char *block_name = data;
 			data += 16;
 			int block_length = ((int*)data)[0];
-			int block_length_padded = block_length + (15-(block_length-1)%16);
-			printf("  length %d\n", block_length );
-			printf("  padded length %d\n", block_length_padded );
+			int block_length_padded = ((block_length-1)|15)+1;
+			printf("  a length %d\n", block_length );
+			printf("  a padded length %d\n" );
+			data += 16;
 			char *block_data = data;
 			data += block_length_padded;
 
 			if( !strncmp( block_name, "cmd_data", strlen("cmd_data")) )
 			{
-				printf("Capturing cmd_data\n");
+				printf("Capturing cmd_data %p %c%c%c%c\n", block_data,
+					   block_data[0], block_data[1], block_data[2], block_data[3]);
 				h->cmd_data = block_data;
 				h->cmd_data_length = block_length;
 			}
@@ -161,14 +187,13 @@ class ArmBackend final : public PyTorchBackendInterface {
 			}
 		}
 	}
-
 };
 
-namespace {
 	auto backend = ArmBackend();
-	Backend backend_id{"ArmBackend", &backend};
-	static auto registered = register_backend(backend_id);
-} // namespace 
+	void arm_backend_register() {
+		Backend backend_id{"ArmBackend", &backend};
+		static auto registered = register_backend(backend_id);
+	}
 
 } // namespace executor
 } // namespace torch
diff --git a/manual.h b/manual.h
index eaee9a15407..3719a142718 100644
--- a/manual.h
+++ b/manual.h
@@ -2,4 +2,5 @@ namespace torch {
 namespace executor {
     void manual_override();
     void digant_add_out(torch::executor::KernelRuntimeContext & context, EValue** stack);
+    void arm_backend_register();
 }}

From 08d71d9510ccf08d56df49d687d4aa5ce0b3dc5c Mon Sep 17 00:00:00 2001
From: Rob Elliott <robert.elliott@arm.com>
Date: Mon, 2 Oct 2023 09:39:26 +0000
Subject: [PATCH 18/20] tidied binary reading and moved to ET_LOG

Signed-off-by: Rob Elliott <robert.elliott@arm.com>
---
 backends/arm/runtime/ArmBackendEthosU.cpp | 155 +++++++++++++---------
 1 file changed, 91 insertions(+), 64 deletions(-)

diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp
index c0c5db7df0a..10ba1dbfd58 100644
--- a/backends/arm/runtime/ArmBackendEthosU.cpp
+++ b/backends/arm/runtime/ArmBackendEthosU.cpp
@@ -6,7 +6,8 @@
  */
 
 /*
- * Arm backend for Ethos-U baremetal driver stack relies on ethos-u-core-driver
+ * Arm backend for Ethos-U baremetal driver stack, this relies on the
+ * ethos-u-core-driver for hardware interaction.
  */
 
 #include <memory>
@@ -21,21 +22,19 @@
 #include "command_stream.hpp"
 using namespace EthosU::CommandStream;
 
-// Required byte alignment of all input pointers
-#define ETHOS_U_ALIGN 0xF
-char *ethos_align( char *ptr )
-{
-	return (char*)((uintptr_t)~ETHOS_U_ALIGN & (uintptr_t)(ptr + (ETHOS_U_ALIGN-1)));
-}
-
 namespace torch {
 namespace executor {
 
+// TODO we should be in 0x31, not this lower 1MB sRAM
+// SRAM (rwx) : ORIGIN = 0x31000000, LENGTH = 0x00200000
+#define CS300_SRAM_LOW ((void*)0x11000000)
+#define CS300_SRAM_HIGH ((void*)0x110FFFFF)
+
 class ArmBackend final : public PyTorchBackendInterface {
 
 public:
 	ArmBackend() {
-		printf("Constructing ARM Backend\n");
+		ET_LOG(Debug, "Constructing ARM Backend");
 	}
 	
 	~ArmBackend() = default;
@@ -49,19 +48,37 @@ class ArmBackend final : public PyTorchBackendInterface {
 		FreeableBuffer* processed,
 		ArrayRef<CompileSpec> compile_specs) const override {
 
-		printf("ArmBackend::init 0x%X\n", processed->data());
+        ET_LOG(Info, "ArmBackend::init %p", processed->data() );
 
 		char *data = (char*)processed->data();
 		size_t size = processed->size();
-		
-		//the model should have been placed in sram with
-		//__attribute__((section(".sram.data"), aligned(16)))
-		void *aligned = ethos_align(data);
-		if( data != ethos_align(data)) return Error::InvalidProgram;
-
-		// TODO: Verify address range is accessible to Ethos-U
-		// current expectation is the program is in SRAM
-		if(0) return Error::InvalidProgram;
+		char *foot = data + size - 16;
+
+		// Header and footer both being 16-byte aligned suggests a valid
+		// structure, so we won't walk off the end of the chunks and segfault
+		if( !((int)data == next_mul_16((int)data)) )
+		{
+			ET_LOG(Error, "ArmBackend::init header unaligned");
+			return Error::InvalidProgram;
+		}
+		if( !((int)foot == next_mul_16((int)foot)) )
+		{
+			ET_LOG(Error, "ArmBackend::init footer unaligned");
+			return Error::InvalidProgram;
+		}
+		if( !(0 == strncmp( data, "vela_bin_stream", 15 )) )
+		{
+			ET_LOG(Error, "ArmBackend::init header magic not found");
+			return Error::InvalidProgram;
+		}
+		if( !(0 == strncmp( foot, "vela_end_stream", 15 )) )
+		{
+			ET_LOG(Error, "ArmBackend::init footer magic not found");
+			return Error::InvalidProgram;
+		}
+		// TODO: the SRAM range test below has an empty body (note the trailing
+		// ';'), so it is not enforced yet; the program is expected to be in SRAM
+		if( !(data > CS300_SRAM_LOW || foot < CS300_SRAM_HIGH) );
 		
 		// Return the same buffer we were passed - this data will be
 		// executed directly
@@ -75,7 +92,7 @@ class ArmBackend final : public PyTorchBackendInterface {
 
 		FreeableBuffer* processed = (FreeableBuffer*)input_handle;
 
-		printf("ArmBackend::execute 0x%X\n", processed->data());
+		ET_LOG(Info, "ArmBackend::execute %p", processed->data() );
 
 		vela_handles handles = { 0, 0, 0, 0, 0, 0 };
 
@@ -83,16 +100,20 @@ class ArmBackend final : public PyTorchBackendInterface {
 		char *data = (char*)processed->data();
 
 		// Read key sections from the vela_bin_stream
-		this->vela_read( data, &handles );
+		if( !this->vela_read( data, &handles, processed->size() ) )
+		{
+			ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout" );
+			return Error::InvalidProgram;
+		}
 
-		printf("Running program data:\n  cmd %p %d\n  weight %p %d\n  scratch %p %d\n",
-			   handles.cmd_data, handles.cmd_data_length,
-			   handles.weight_data, handles.weight_data_length,
-			   handles.scratch_data, handles.scratch_data_length );
+		ET_LOG(Debug, "ArmBackend::execute: Running program data:\n  cmd %p %d\n  weight %p %d\n  scratch %p %d\n",
+			   handles.cmd_data, handles.cmd_data_size,
+			   handles.weight_data, handles.weight_data_size,
+			   handles.scratch_data, handles.scratch_data_size );
 
 		// TMP emit scratch
 		printf("Scratch before:\n");
-		for( int i=0; i<handles.scratch_data_length; i++ )
+		for( int i=0; i<handles.scratch_data_size; i++ )
 		{
 			if( i%4 == 0 ) ((char*)handles.scratch_data)[i] = 1;
 			printf("%02x ", ((char*)handles.scratch_data)[i]);
@@ -102,10 +123,10 @@ class ArmBackend final : public PyTorchBackendInterface {
 		
 		// Invoke driver using the above pointers
 		CommandStream cs(
-			DataPointer(handles.cmd_data, handles.cmd_data_length),
+			DataPointer(handles.cmd_data, handles.cmd_data_size),
 			BasePointers({
-					DataPointer(handles.weight_data, handles.weight_data_length),
-					DataPointer(handles.scratch_data, handles.scratch_data_length)
+					DataPointer(handles.weight_data, handles.weight_data_size),
+					DataPointer(handles.scratch_data, handles.scratch_data_size)
 				}),
 			PmuEvents({ETHOSU_PMU_CYCLE, ETHOSU_PMU_NPU_IDLE, ETHOSU_PMU_NPU_ACTIVE})
 			);
@@ -124,7 +145,7 @@ class ArmBackend final : public PyTorchBackendInterface {
 
         // TMP emit scratch
         printf("Scratch after:\n");
-        for( int i=0; i<handles.scratch_data_length; i++ )
+        for( int i=0; i<handles.scratch_data_size; i++ )
         {
             printf("%02x ", ((char*)handles.scratch_data)[i]);
             if( !((i+1)%4) ) printf("\n");
@@ -140,53 +161,59 @@ class ArmBackend final : public PyTorchBackendInterface {
 
 private:
 	typedef struct {
-		const char *cmd_data; int cmd_data_length;
-		const char *weight_data; int weight_data_length;
-		const char *scratch_data; int scratch_data_length;
+		const char *cmd_data; int cmd_data_size;
+		const char *weight_data; int weight_data_size;
+		const char *scratch_data; int scratch_data_size;
 	} vela_handles;
 
-	int vela_read(char* data, vela_handles *h ) const {
-		if( strncmp( data, "vela_bin_stream", 15 ) ) return 0;
+	typedef struct {
+		char name[16];
+		int size; char _pad[12];
+		char data[];
+	} vela_bin_block;
+
+	static int next_mul_16( int n ) {
+		return ((n-1)|15)+1;
+	}
+	
+	int vela_read(char* data, vela_handles *h, int size ) const {
+		// Walk the 'size'-byte vela_bin_stream at 'data' and fill 'h'. Returns 1 on
+		// success, 0 on a bad magic or a block that would run past the buffer.
+		const char *end = data + size;
+
+		// Read header string
+		if( size < 16 || strncmp( data, "vela_bin_stream", 15 ) ) return 0;
 		data += 16;
+
+		// Expect one or more 'vela_bin_block's
 		while( 1 )
 		{
-			if( !strncmp( data, "vela_end_stream", 15 ) )
-			{
-				printf("footer found!\n");
-				return 1;
-			}
-			printf("reading block '%s':\n", data);
-			char *block_name = data;
-			data += 16;
-			int block_length = ((int*)data)[0];
-			int block_length_padded = ((block_length-1)|15)+1;
-			printf("  a length %d\n", block_length );
-			printf("  a padded length %d\n" );
-			data += 16;
-			char *block_data = data;
-			data += block_length_padded;
-
-			if( !strncmp( block_name, "cmd_data", strlen("cmd_data")) )
+			vela_bin_block *b = (vela_bin_block*)data;
+			if( end - data < 16 ) return 0;	// truncated: no room left for a block name
+			// Exit with success on finding end of stream (tested before b->size is read)
+			if( !strncmp( b->name, "vela_end_stream", 15 ) ) return 1;
+			if( end - data < 32 || b->size < 0 || b->size > end - data - 32 ) return 0;
+			data += 16 + 16 + next_mul_16(b->size);
+			if( !strncmp( b->name, "cmd_data", strlen("cmd_data")) )
 			{
-				printf("Capturing cmd_data %p %c%c%c%c\n", block_data,
-					   block_data[0], block_data[1], block_data[2], block_data[3]);
-				h->cmd_data = block_data;
-				h->cmd_data_length = block_length;
+				// This magic header confirms a valid command stream in binary
+				if( strncmp( b->data, "COP1", 4 ) ) return 0;
+				h->cmd_data = b->data;
+				h->cmd_data_size = b->size;
 			}
-			if( !strncmp( block_name, "weight_data", strlen("weight_data")) )
+			if( !strncmp( b->name, "weight_data", strlen("weight_data")) )
 			{
-				printf("Capturing weight_data\n");
-				h->weight_data = block_data;
-				h->weight_data_length = block_length;
+				h->weight_data = b->data;
+				h->weight_data_size = b->size;
 			}
-			if( !strncmp( block_name, "scratch_data", strlen("scratch_data")) )
+			if( !strncmp( b->name, "scratch_data", strlen("scratch_data")) )
 			{
-				printf("Capturing scratch_data\n");
-				h->scratch_data = block_data;
-				h->scratch_data_length = block_length;
+				h->scratch_data = b->data;
+				h->scratch_data_size = b->size;
 			}
 		}
 	}
+
 };
 
 	auto backend = ArmBackend();

From 189b04c8e99b19ab9b0457db8747de65aca032a6 Mon Sep 17 00:00:00 2001
From: Rob Elliott <robert.elliott@arm.com>
Date: Mon, 2 Oct 2023 11:57:17 +0000
Subject: [PATCH 19/20] Simplified EthosU invocation code

 * Removed dependencies on anything but driver
 * moved to minimal invocation pattern

Signed-off-by: Rob Elliott <robert.elliott@arm.com>
---
 backends/arm/runtime/ArmBackendEthosU.cpp |  48 +++---
 backends/arm/runtime/command_stream.cpp   | 169 ----------------------
 backends/arm/runtime/command_stream.hpp   | 120 ---------------
 3 files changed, 21 insertions(+), 316 deletions(-)
 delete mode 100644 backends/arm/runtime/command_stream.cpp
 delete mode 100644 backends/arm/runtime/command_stream.hpp

diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp
index 10ba1dbfd58..abf0d8e63d6 100644
--- a/backends/arm/runtime/ArmBackendEthosU.cpp
+++ b/backends/arm/runtime/ArmBackendEthosU.cpp
@@ -19,9 +19,6 @@
 #include <ethosu_driver.h>
 #include <pmu_ethosu.h>
 
-#include "command_stream.hpp"
-using namespace EthosU::CommandStream;
-
 namespace torch {
 namespace executor {
 
@@ -120,29 +117,26 @@ class ArmBackend final : public PyTorchBackendInterface {
 			if( !((i+1)%4) ) printf("\n");
 		}
 		printf("\n");
-		
-		// Invoke driver using the above pointers
-		CommandStream cs(
-			DataPointer(handles.cmd_data, handles.cmd_data_size),
-			BasePointers({
-					DataPointer(handles.weight_data, handles.weight_data_size),
-					DataPointer(handles.scratch_data, handles.scratch_data_size)
-				}),
-			PmuEvents({ETHOSU_PMU_CYCLE, ETHOSU_PMU_NPU_IDLE, ETHOSU_PMU_NPU_ACTIVE})
-			);
-
-		cs.getPmu().clear();
-		int res = cs.run(1);
-		if(res == 0)
+
+		// Allocate driver handle and synchronously invoke driver
+		ethosu_driver *drv = ethosu_reserve_driver();
+
+		// Base pointers handed to the driver: index 0 = weight data, index 1 = scratch data
+		uint64_t bases[2] = {(uint64_t)handles.weight_data, (uint64_t)handles.scratch_data};
+		size_t bases_size[2] = {handles.weight_data_size, handles.scratch_data_size};
+		int result = ethosu_invoke_v3(drv,
+									  (void*)handles.cmd_data,
+									  handles.cmd_data_size,
+									  bases, bases_size, 2,
+									  nullptr);
+		// Release on every path: the reservation made above is otherwise never returned
+		ethosu_release_driver(drv);
+		if(result != 0)
 		{
-			uint64_t cycleCount = cs.getPmu().getCycleCount();
-			cs.getPmu().print();
-			printf("cycleCount=%llu, cycleCountPerJob=%llu\n", cycleCount, cycleCount);
-		} else {
-			printf("Error, failure executing job\n");
+			ET_LOG(Error, "ArmBackend::execute: Ethos-U invocation failed error (%d)", result);
 			return Error::InvalidProgram;
-		}
-
+		}											  
+		
         // TMP emit scratch
         printf("Scratch after:\n");
         for( int i=0; i<handles.scratch_data_size; i++ )
@@ -161,9 +155,9 @@ class ArmBackend final : public PyTorchBackendInterface {
 
 private:
 	typedef struct {
-		const char *cmd_data; int cmd_data_size;
-		const char *weight_data; int weight_data_size;
-		const char *scratch_data; int scratch_data_size;
+		const char *cmd_data; size_t cmd_data_size;
+		const char *weight_data; size_t weight_data_size;
+		const char *scratch_data; size_t scratch_data_size;
 	} vela_handles;
 
 	typedef struct {
diff --git a/backends/arm/runtime/command_stream.cpp b/backends/arm/runtime/command_stream.cpp
deleted file mode 100644
index d2e62ce629a..00000000000
--- a/backends/arm/runtime/command_stream.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/****************************************************************************
- * Includes
- ****************************************************************************/
-
-#include "command_stream.hpp"
-
-#include <inttypes.h>
-#include <stdio.h>
-
-using namespace std;
-
-namespace EthosU {
-namespace CommandStream {
-
-/****************************************************************************
- * DataPointer
- ****************************************************************************/
-
-DataPointer::DataPointer() : data(nullptr), size(0) {}
-
-DataPointer::DataPointer(const char *_data, size_t _size) : data(_data), size(_size) {}
-
-bool DataPointer::operator!=(const DataPointer &other) {
-    if (size != other.size) {
-        return true;
-    }
-
-    for (size_t i = 0; i < size; i++) {
-        if (data[i] != other.data[i]) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
-/****************************************************************************
- * PmuConfig
- ****************************************************************************/
-
-Pmu::Pmu(ethosu_driver *_drv, const PmuEvents &_config) : drv(_drv), config(_config) {
-    // Enable PMU block
-    ETHOSU_PMU_Enable(drv);
-
-    // Enable cycle counter
-    ETHOSU_PMU_CNTR_Enable(drv, ETHOSU_PMU_CCNT_Msk);
-
-    // Configure event types
-    for (size_t i = 0; i < config.size(); i++) {
-        ETHOSU_PMU_Set_EVTYPER(drv, i, config[i]);
-        ETHOSU_PMU_CNTR_Enable(drv, 1u << i);
-    }
-}
-
-void Pmu::clear() {
-    ETHOSU_PMU_CYCCNT_Reset(drv);
-    ETHOSU_PMU_EVCNTR_ALL_Reset(drv);
-}
-
-void Pmu::print() {
-    printf("PMU={cycleCount=%llu, events=[%" PRIu32 ", %" PRIu32 ", %" PRIu32 ", %" PRIu32 "]}\n",
-           ETHOSU_PMU_Get_CCNTR(drv),
-           ETHOSU_PMU_Get_EVCNTR(drv, 0),
-           ETHOSU_PMU_Get_EVCNTR(drv, 1),
-           ETHOSU_PMU_Get_EVCNTR(drv, 2),
-           ETHOSU_PMU_Get_EVCNTR(drv, 3));
-}
-
-uint64_t Pmu::getCycleCount() const {
-    return ETHOSU_PMU_Get_CCNTR(drv);
-}
-
-uint32_t Pmu::getEventCount(size_t index) const {
-    return ETHOSU_PMU_Get_EVCNTR(drv, index);
-}
-
-/****************************************************************************
- * CommandStream
- ****************************************************************************/
-
-CommandStream::CommandStream(const DataPointer &_commandStream,
-                             const BasePointers &_basePointers,
-                             const PmuEvents &_pmuEvents) :
-    drv(ethosu_reserve_driver()),
-    commandStream(_commandStream), basePointers(_basePointers), pmu(drv, _pmuEvents) {}
-
-CommandStream::~CommandStream() {
-    ethosu_release_driver(drv);
-}
-
-int CommandStream::run(size_t repeat) {
-    // Base pointer array
-    uint64_t baseAddress[ETHOSU_BASEP_INDEXES];
-    size_t baseAddressSize[ETHOSU_BASEP_INDEXES];
-    for (size_t i = 0; i < ETHOSU_BASEP_INDEXES; i++) {
-        baseAddress[i]     = reinterpret_cast<uint64_t>(basePointers[i].data);
-        baseAddressSize[i] = reinterpret_cast<size_t>(basePointers[i].size);
-    }
-
-    while (repeat-- > 0) {
-        int error = ethosu_invoke_v3(
-            drv, commandStream.data, commandStream.size, baseAddress, baseAddressSize, ETHOSU_BASEP_INDEXES, nullptr);
-
-        if (error != 0) {
-            printf("Inference failed. error=%d\n", error);
-            return 1;
-        }
-    }
-
-    return 0;
-}
-
-int CommandStream::run_async() {
-    // Base pointer array
-    uint64_t baseAddress[ETHOSU_BASEP_INDEXES];
-    size_t baseAddressSize[ETHOSU_BASEP_INDEXES];
-
-    for (size_t i = 0; i < ETHOSU_BASEP_INDEXES; i++) {
-        baseAddress[i]     = reinterpret_cast<uint64_t>(basePointers[i].data);
-        baseAddressSize[i] = reinterpret_cast<size_t>(basePointers[i].size);
-    }
-
-    int error = ethosu_invoke_async(
-        drv, commandStream.data, commandStream.size, baseAddress, baseAddressSize, ETHOSU_BASEP_INDEXES, nullptr);
-
-    if (error != 0) {
-        printf("Inference invoke async failed. error=%d\n", error);
-        return 1;
-    }
-
-    return 0;
-}
-
-int CommandStream::wait_async(bool block) {
-    return ethosu_wait(drv, block);
-}
-
-DataPointer &CommandStream::getCommandStream() {
-    return commandStream;
-}
-
-BasePointers &CommandStream::getBasePointers() {
-    return basePointers;
-}
-
-Pmu &CommandStream::getPmu() {
-    return pmu;
-}
-
-}; // namespace CommandStream
-}; // namespace EthosU
diff --git a/backends/arm/runtime/command_stream.hpp b/backends/arm/runtime/command_stream.hpp
deleted file mode 100644
index 7163b9d58ca..00000000000
--- a/backends/arm/runtime/command_stream.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef COMMAND_STREAM_HPP
-#define COMMAND_STREAM_HPP
-
-/****************************************************************************
- * Includes
- ****************************************************************************/
-
-#include <array>
-#include <ethosu_driver.h>
-#include <pmu_ethosu.h>
-#include <stddef.h>
-
-/****************************************************************************
- * Defines
- ****************************************************************************/
-
-#ifndef ETHOSU_BASEP_INDEXES
-#define ETHOSU_BASEP_INDEXES 8
-#endif
-
-/****************************************************************************
- * Types
- ****************************************************************************/
-
-namespace EthosU {
-namespace CommandStream {
-
-/****************************************************************************
- * DataPointer
- ****************************************************************************/
-
-struct DataPointer {
-    DataPointer();
-    DataPointer(const char *_data, size_t _size);
-
-    bool operator!=(const DataPointer &other);
-
-    const char *data;
-    size_t size;
-};
-
-/****************************************************************************
- * Pmu
- ****************************************************************************/
-
-using PmuEvents = std::array<ethosu_pmu_event_type, ETHOSU_PMU_NCOUNTERS>;
-
-class Pmu {
-public:
-    Pmu(ethosu_driver *_drv, const PmuEvents &_config = {});
-
-    void clear();
-    void print();
-
-    uint64_t getCycleCount() const;
-    uint32_t getEventCount(size_t index) const;
-
-private:
-    ethosu_driver *drv;
-    PmuEvents config;
-};
-
-/****************************************************************************
- * CommandStream
- ****************************************************************************/
-
-using BasePointers = std::array<DataPointer, ETHOSU_BASEP_INDEXES>;
-
-class CommandStream {
-public:
-    CommandStream(const DataPointer &_commandStream,
-                  const BasePointers &_pointers = {},
-                  const PmuEvents &_pmuEvents   = {});
-    virtual ~CommandStream();
-
-    int run(size_t repeat = 1);
-    int run_async();
-    int wait_async(bool block = true);
-
-    DataPointer &getCommandStream();
-    BasePointers &getBasePointers();
-    Pmu &getPmu();
-
-private:
-    ethosu_driver *drv;
-    DataPointer commandStream;
-    BasePointers basePointers;
-    Pmu pmu;
-};
-
-#define DRIVER_ACTION_MAGIC() 'C', 'O', 'P', '1',
-
-#define DRIVER_ACTION_COMMAND_STREAM(length) 0x02, (length >> 16) & 0xff, length & 0xff, (length >> 8) & 0xff,
-
-#define DRIVER_ACTION_NOP() 0x05, 0x00, 0x00, 0x00,
-
-#define NPU_OP_STOP(mask) (mask >> 8) && 0xff, mask & 0xff, 0x08, 0x00,
-
-}; // namespace CommandStream
-}; // namespace EthosU
-
-#endif /* COMMAND_STREAM_HPP */

From 70071205bf8f433268e45c87e6d334ec7011da43 Mon Sep 17 00:00:00 2001
From: Rob Elliott <robert.elliott@arm.com>
Date: Mon, 2 Oct 2023 14:14:12 +0000
Subject: [PATCH 20/20] Basic ethos output copy to EValue

 * currently assumes function signature
 * read relevant argument data from vela_bin

Signed-off-by: Rob Elliott <robert.elliott@arm.com>
---
 backends/arm/CMakeLists.txt               |  2 +-
 backends/arm/runtime/ArmBackendEthosU.cpp | 47 +++++++++++++++++++++--
 2 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt
index 6d6cd1938b7..2cc5cf94740 100644
--- a/backends/arm/CMakeLists.txt
+++ b/backends/arm/CMakeLists.txt
@@ -18,7 +18,7 @@ set(_common_compile_options -Wno-deprecated-declarations)
 
 include(cmake/Dependencies.cmake)
 
-set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp backends/arm/runtime/command_stream.cpp)
+set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp)
 list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")
 add_library(ethos_u STATIC ${_arm_baremetal_sources})
 target_include_directories(ethos_u PUBLIC ${_common_include_directories})
diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp
index abf0d8e63d6..3dc52645089 100644
--- a/backends/arm/runtime/ArmBackendEthosU.cpp
+++ b/backends/arm/runtime/ArmBackendEthosU.cpp
@@ -117,7 +117,7 @@ class ArmBackend final : public PyTorchBackendInterface {
 			if( !((i+1)%4) ) printf("\n");
 		}
 		printf("\n");
-
+		
 		// Allocate driver handle and synchronously invoke driver
 		ethosu_driver *drv = ethosu_reserve_driver();
 
@@ -135,9 +135,9 @@ class ArmBackend final : public PyTorchBackendInterface {
 		{
 			ET_LOG(Error, "ArmBackend::execute: Ethos-U invocation failed error (%d)", result);
 			return Error::InvalidProgram;
-		}											  
+		}
 		
-        // TMP emit scratch
+		// TMP emit scratch
         printf("Scratch after:\n");
         for( int i=0; i<handles.scratch_data_size; i++ )
         {
@@ -146,6 +146,17 @@ class ArmBackend final : public PyTorchBackendInterface {
         }
         printf("\n");
 		
+		// Process results into EValue storage
+		// TODO: optimise into direct write for compatible layouts
+		// TODO: get num in/out and layout?
+		int *output_address = (int*)(handles.scratch_data + handles.output_offset);
+		auto tensor = args[1]->toTensor();
+		for(int j=0; j<tensor.numel(); j++)
+		{
+			
+			tensor.mutable_data_ptr<int>()[j] = output_address[j];
+		}
+				
 		return Error::Ok;
 	}
 
@@ -158,6 +169,8 @@ class ArmBackend final : public PyTorchBackendInterface {
 		const char *cmd_data; size_t cmd_data_size;
 		const char *weight_data; size_t weight_data_size;
 		const char *scratch_data; size_t scratch_data_size;
+		size_t input_offset; size_t input_data_shape[3];
+		size_t output_offset; size_t output_data_shape[3];
 	} vela_handles;
 
 	typedef struct {
@@ -205,6 +218,34 @@ class ArmBackend final : public PyTorchBackendInterface {
 				h->scratch_data = b->data;
 				h->scratch_data_size = b->size;
 			}
+
+			// capture inputs and outputs
+			if( !strncmp( b->name, "scratch_data", strlen("scratch_data")) )
+			{
+				h->scratch_data = b->data;
+				h->scratch_data_size = b->size;
+			}
+			if( !strncmp( b->name, "input_offset", strlen("input_offset")) )
+			{
+				h->input_offset = ((int*)b->data)[0];
+			}
+			if( !strncmp( b->name, "output_offset", strlen("output_offset")) )
+			{
+				h->output_offset = ((int*)b->data)[0];
+			}
+			// Shape blocks are read as three ints; each goes into its own slot
+			if( !strncmp( b->name, "input_shape", strlen("input_shape")) )
+			{
+				h->input_data_shape[0] = ((int*)b->data)[0];
+				h->input_data_shape[1] = ((int*)b->data)[1];
+				h->input_data_shape[2] = ((int*)b->data)[2];
+			}
+			if( !strncmp( b->name, "output_shape", strlen("output_shape")) )
+			{
+				h->output_data_shape[0] = ((int*)b->data)[0];
+				h->output_data_shape[1] = ((int*)b->data)[1];
+				h->output_data_shape[2] = ((int*)b->data)[2];
+			}
 		}
 	}