From 7f44e16937c8835800afa87f2772b9a507f3a3fa Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 27 Sep 2023 08:34:22 +0000 Subject: [PATCH 01/20] Add ethos-u-core-driver submodule Signed-off-by: Rob Elliott --- .gitmodules | 3 +++ backends/arm/third-party/ethos-u-core-driver | 1 + 2 files changed, 4 insertions(+) create mode 160000 backends/arm/third-party/ethos-u-core-driver diff --git a/.gitmodules b/.gitmodules index 980a999eff0..aac8050326d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -31,3 +31,6 @@ [submodule "backends/arm/third-party/serialization_lib"] path = backends/arm/third-party/serialization_lib url = https://git.mlplatform.org/tosa/serialization_lib.git +[submodule "backends/arm/third-party/ethos-u-core-driver"] + path = backends/arm/third-party/ethos-u-core-driver + url = https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git diff --git a/backends/arm/third-party/ethos-u-core-driver b/backends/arm/third-party/ethos-u-core-driver new file mode 160000 index 00000000000..90f9df900ac --- /dev/null +++ b/backends/arm/third-party/ethos-u-core-driver @@ -0,0 +1 @@ +Subproject commit 90f9df900acdc0718ecd2dfdc53780664758dec5 From c6755bf00500e2712558c8aaaf8e33d14d4c3edc Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 27 Sep 2023 11:45:33 +0000 Subject: [PATCH 02/20] Added shell of runtime Arm Backend for Ethos-U * basic build system amendments * toolchain file for baremetal platforms * use of ethos-u-core-driver submodule * some scripts to pull a compiler and test-build the backend Signed-off-by: Rob Elliott --- CMakeLists.txt | 8 ++ backends/arm/CMakeLists.txt | 25 ++++++ backends/arm/cmake/Dependencies.cmake | 12 +++ backends/arm/cmake/arm-none-eabi-gcc.cmake | 90 ++++++++++++++++++++++ backends/arm/cmake/build.sh | 47 +++++++++++ backends/arm/cmake/toolchain.sh | 11 +++ backends/arm/runtime/ArmBackendEthosU.cpp | 58 ++++++++++++++ schema/CMakeLists.txt | 2 +- 8 files changed, 252 insertions(+), 1 deletion(-) create mode 100644 
backends/arm/CMakeLists.txt create mode 100644 backends/arm/cmake/Dependencies.cmake create mode 100644 backends/arm/cmake/arm-none-eabi-gcc.cmake create mode 100755 backends/arm/cmake/build.sh create mode 100755 backends/arm/cmake/toolchain.sh create mode 100644 backends/arm/runtime/ArmBackendEthosU.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 3883c991bf2..a61450f8895 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,6 +93,10 @@ if(BUILD_SELECTIVE_BUILD_TEST) option(SELECT_OPS_YAML "Register all the ops from a given yaml file" OFF) endif() +# Build Arm Baremetal backend +option(EXECUTORCH_BUILD_ARM_BAREMETAL + "Build the Arm Baremetal flow for Cortex-M and Ethos-U" OFF) + # Build xnn_executor_runner which depends on XNNPACK option(EXECUTORCH_BUILD_XNNPACK "Build xnn_executor_runner which depends on XNNPACK" OFF) @@ -282,6 +286,10 @@ if(EXECUTORCH_BUILD_XNNPACK) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack) endif() +if(EXECUTORCH_BUILD_ARM_BAREMETAL) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm) +endif() + # Add selective build subdirectory if(BUILD_SELECTIVE_BUILD_TEST) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/examples/selective_build) diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt new file mode 100644 index 00000000000..2cc5cf94740 --- /dev/null +++ b/backends/arm/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +cmake_minimum_required(VERSION 3.19) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +include(${EXECUTORCH_ROOT}/build/Utils.cmake) + +set(_common_include_directories ${EXECUTORCH_ROOT}/..) 
+set(_common_compile_options -Wno-deprecated-declarations) + +include(cmake/Dependencies.cmake) + +set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp) +list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/") +add_library(ethos_u STATIC ${_arm_baremetal_sources}) +target_include_directories(ethos_u PUBLIC ${_common_include_directories}) +target_include_directories(ethos_u PUBLIC ${DRIVER_ETHOSU_INCLUDE_DIR}) diff --git a/backends/arm/cmake/Dependencies.cmake b/backends/arm/cmake/Dependencies.cmake new file mode 100644 index 00000000000..27a587176bb --- /dev/null +++ b/backends/arm/cmake/Dependencies.cmake @@ -0,0 +1,12 @@ +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party") + +# Ethos-U driver: built from the submodule; DRIVER_ETHOSU_INCLUDE_DIR is consumed by the ethos_u target +set(DRIVER_ETHOSU_SOURCE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver") +set(DRIVER_ETHOSU_INCLUDE_DIR "${DRIVER_ETHOSU_SOURCE_DIR}/include") +add_subdirectory("${DRIVER_ETHOSU_SOURCE_DIR}") +# Include path is attached to ethos_u via target_include_directories() in backends/arm/CMakeLists.txt diff --git a/backends/arm/cmake/arm-none-eabi-gcc.cmake b/backends/arm/cmake/arm-none-eabi-gcc.cmake new file mode 100644 index 00000000000..d70f79361cd --- /dev/null +++ b/backends/arm/cmake/arm-none-eabi-gcc.cmake @@ -0,0 +1,90 @@ +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set(TARGET_CPU "cortex-m4" CACHE STRING "Target CPU") +string(TOLOWER "${TARGET_CPU}" CMAKE_SYSTEM_PROCESSOR) + +set(CMAKE_SYSTEM_NAME Generic) +set(CMAKE_C_COMPILER "arm-none-eabi-gcc") +set(CMAKE_CXX_COMPILER "arm-none-eabi-g++") +set(CMAKE_ASM_COMPILER "arm-none-eabi-gcc") +set(CMAKE_LINKER "arm-none-eabi-ld") + +set(CMAKE_EXECUTABLE_SUFFIX ".elf") +set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +# Select C/C++ version +set(CMAKE_C_STANDARD 11) +set(CMAKE_CXX_STANDARD 14) + +set(GCC_CPU "${CMAKE_SYSTEM_PROCESSOR}") +string(REPLACE "cortex-m85" "cortex-m55" GCC_CPU "${GCC_CPU}") + +# Compile options +add_compile_options( + -mcpu=${GCC_CPU} + -mthumb + "$<$:-gdwarf-3>" + "$<$:-fno-unwind-tables;-fno-rtti;-fno-exceptions>" + -fdata-sections + -ffunction-sections) + +# Compile defines +add_compile_definitions( + "$<$>:NDEBUG>") + +# Link options +add_link_options( + -mcpu=${GCC_CPU} + -mthumb + --specs=nosys.specs) + +# Set floating point unit +if(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+fp") + set(FLOAT hard) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+nofp") + set(FLOAT soft) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m33(\\+|$)" OR + CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m55(\\+|$)" OR + CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m85(\\+|$)") + set(FLOAT hard) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m4(\\+|$)" OR + CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m7(\\+|$)") + set(FLOAT hard) + set(FPU_CONFIG "fpv4-sp-d16") + add_compile_options(-mfpu=${FPU_CONFIG}) + add_link_options(-mfpu=${FPU_CONFIG}) +else() + set(FLOAT soft) +endif() + +if (FLOAT) + add_compile_options(-mfloat-abi=${FLOAT}) + add_link_options(-mfloat-abi=${FLOAT}) +endif() + +add_link_options(LINKER:--nmagic,--gc-sections) + +# Compilation warnings +add_compile_options( +# -Wall +# -Wextra + +# -Wcast-align +# -Wdouble-promotion +# -Wformat +# 
-Wmissing-field-initializers +# -Wnull-dereference +# -Wredundant-decls +# -Wshadow +# -Wswitch +# -Wswitch-default +# -Wunused + -Wno-redundant-decls + -Wno-psabi +) diff --git a/backends/arm/cmake/build.sh b/backends/arm/cmake/build.sh new file mode 100755 index 00000000000..490a358c9a1 --- /dev/null +++ b/backends/arm/cmake/build.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -e + +# +# Setup toolchain +# +BASEDIR=$(realpath "$(dirname "$0")") +echo "building using build.sh in $BASEDIR" + +GCCPATH=${BASEDIR}/arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi/bin/ +echo "${GCCPATH}" +if test -d "${GCCPATH}"; then + echo "Using existing compiler ${GCCPATH}" +else + pushd "${BASEDIR}/" + ./toolchain.sh + popd +fi +export PATH=${PATH}:${GCCPATH} + +echo building with `arm-none-eabi-gcc -v 2>&1 | grep "^gcc"` + + +# +# Prepare and run clean build +# +rm -rf buck-out/ build/lib/ cmake-out/ +rm -rf cmake-corstone +mkdir cmake-corstone +cd cmake-corstone + +#cmake -DBUCK2=buck2 .. + +#cmake --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake .. +cmake -DFLATC_EXECUTABLE=flatc \ + -DEXECUTORCH_BUILD_HOST_TARGETS=OFF \ + -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ + --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake \ + .. +# -DCMAKE_TOOLCHAIN_FILE=backends/arm/cmake/arm-none-eabi-gcc.cmake \ + +cd .. +cmake --build cmake-corstone -j1 --target ethos_u diff --git a/backends/arm/cmake/toolchain.sh b/backends/arm/cmake/toolchain.sh new file mode 100755 index 00000000000..7fd4abcc781 --- /dev/null +++ b/backends/arm/cmake/toolchain.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -e + +# Cross compiler for Arm baremetal (e.g. 
Corstone-300 FVP or silicon) +curl -fL -o gcc.tar.xz https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi.tar.xz +tar xf gcc.tar.xz +export PATH=${PATH}:`(cd arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi/bin/; pwd)` diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp new file mode 100644 index 00000000000..fcf10567b26 --- /dev/null +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -0,0 +1,58 @@ +/* + * Copyright 2023 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Arm backend for Ethos-U baremetal driver stack relies on ethos-u-core-driver + */ + +#include +#include +#include + +#include +#include + +namespace torch { +namespace executor { + +class ArmBackend final : public PyTorchBackendInterface { + +public: + ~ArmBackend() = default; + + virtual bool is_available() const override { + return true; + } + + Result init( + BackendInitContext& context, + FreeableBuffer* processed, + ArrayRef compile_specs) const override { + return Error::Ok; + } + + Error execute( + BackendExecutionContext& context, + DelegateHandle* handle, + EValue** args) const override { + return Error::Ok; + } + + void destroy(DelegateHandle* handle) const override { + return; + } + +}; + +namespace { + auto backend = ArmBackend(); + Backend backend_id{"ArmBackend", &backend}; + static auto registered = register_backend(backend_id); +} // namespace + +} // namespace executor +} // namespace torch diff --git a/schema/CMakeLists.txt b/schema/CMakeLists.txt index 0c7dc2cbec4..55c07fd5f7b 100644 --- a/schema/CMakeLists.txt +++ b/schema/CMakeLists.txt @@ -41,7 +41,7 @@ add_custom_command( -o "${_program_schema__include_dir}/executorch/schema" ${_program_schema__srcs} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS 
${FLATC_EXECUTABLE} ${_program_schema__srcs} + DEPENDS ${_program_schema__srcs} COMMENT "Generating program_schema headers" VERBATIM) From 6421ead4363e9815703b6c4fdf5033cd712a8de4 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 27 Sep 2023 14:33:14 +0000 Subject: [PATCH 03/20] First-pass ethos-u backend with an assumed flat program format in SRAM, this will be reworked as wrapped ethos buffers in .pte become available Signed-off-by: Rob Elliott --- backends/arm/runtime/ArmBackendEthosU.cpp | 76 +++++++++- backends/arm/runtime/command_stream.cpp | 169 ++++++++++++++++++++++ backends/arm/runtime/command_stream.hpp | 120 +++++++++++++++ 3 files changed, 363 insertions(+), 2 deletions(-) create mode 100644 backends/arm/runtime/command_stream.cpp create mode 100644 backends/arm/runtime/command_stream.hpp diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index fcf10567b26..186e3318f61 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -9,6 +9,8 @@ * Arm backend for Ethos-U baremetal driver stack relies on ethos-u-core-driver */ +#include + #include #include #include @@ -16,6 +18,16 @@ #include #include +#include "command_stream.hpp" +using namespace EthosU::CommandStream; + +// Low-bit mask for the 16-byte alignment required of all input pointers; ethos_align() rounds a pointer UP to it +#define ETHOS_U_ALIGN 0xF +char *ethos_align( char *ptr ) +{ + return (char*)((uintptr_t)~ETHOS_U_ALIGN & (uintptr_t)(ptr + ETHOS_U_ALIGN)); +} + namespace torch { namespace executor { @@ -32,13 +44,73 @@ class ArmBackend final : public PyTorchBackendInterface { BackendInitContext& context, FreeableBuffer* processed, ArrayRef compile_specs) const override { - return Error::Ok; + + printf("ArmBackend::init %p\n", processed->data()); + + char *data = (char*)processed->data(); + size_t size = processed->size(); + + //the model should have been placed in sram with + //__attribute__((section(".sram.data"), aligned(16))) + void *aligned = 
ethos_align(data); + if( data != ethos_align(data)) return Error::InvalidProgram; + + // TODO: Verify address range is accessible to Ethos-U + // current expectation is the program is in SRAM + if(0) return Error::InvalidProgram; + + // Return the same buffer we were passed - this data will be + // executed directly + return processed; } Error execute( BackendExecutionContext& context, - DelegateHandle* handle, + DelegateHandle* input_handle, EValue** args) const override { + + FreeableBuffer* processed = (FreeableBuffer*)input_handle; + + printf("ArmBackend::execute %p\n", processed->data()); + + // Command stream - we know at this point it's aligned + char *handle = (char*)processed->data(); + int command_stream_length = ((int*)handle)[0]; + char *command_stream = ethos_align(handle+sizeof(int)); + + // Static tensors/weights/model data + handle = ethos_align( command_stream + command_stream_length ); + int weight_data_length = ((int*)handle)[0]; + char *weight_data = ethos_align(handle+sizeof(int)); + + // Activation data, input and output memory + handle = ethos_align( weight_data + weight_data_length ); + int activation_data_length = ((int*)handle)[0]; + char *activation_data = ethos_align(handle+sizeof(int)); + + + // Invoke driver using the above pointers + CommandStream cs( + DataPointer(command_stream, command_stream_length), + BasePointers({ + DataPointer(weight_data, weight_data_length), + DataPointer(activation_data, activation_data_length) + }), + PmuEvents({ETHOSU_PMU_CYCLE, ETHOSU_PMU_NPU_IDLE, ETHOSU_PMU_NPU_ACTIVE}) + ); + + cs.getPmu().clear(); + int res = cs.run(1); + if(res == 0) + { + uint64_t cycleCount = cs.getPmu().getCycleCount(); + cs.getPmu().print(); + printf("cycleCount=%llu, cycleCountPerJob=%llu\n", (unsigned long long)cycleCount, (unsigned long long)cycleCount); + } else { + printf("Error, failure executing job\n"); + return Error::InvalidProgram; + } + return Error::Ok; } diff --git a/backends/arm/runtime/command_stream.cpp b/backends/arm/runtime/command_stream.cpp new 
file mode 100644 index 00000000000..d2e62ce629a --- /dev/null +++ b/backends/arm/runtime/command_stream.cpp @@ -0,0 +1,169 @@ +/* + * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/**************************************************************************** + * Includes + ****************************************************************************/ + +#include "command_stream.hpp" + +#include +#include + +using namespace std; + +namespace EthosU { +namespace CommandStream { + +/**************************************************************************** + * DataPointer + ****************************************************************************/ + +DataPointer::DataPointer() : data(nullptr), size(0) {} + +DataPointer::DataPointer(const char *_data, size_t _size) : data(_data), size(_size) {} + +bool DataPointer::operator!=(const DataPointer &other) { + if (size != other.size) { + return true; + } + + for (size_t i = 0; i < size; i++) { + if (data[i] != other.data[i]) { + return true; + } + } + + return false; +} + +/**************************************************************************** + * PmuConfig + ****************************************************************************/ + +Pmu::Pmu(ethosu_driver *_drv, const PmuEvents &_config) : drv(_drv), config(_config) { + // Enable PMU block + ETHOSU_PMU_Enable(drv); + + // Enable 
cycle counter + ETHOSU_PMU_CNTR_Enable(drv, ETHOSU_PMU_CCNT_Msk); + + // Configure event types + for (size_t i = 0; i < config.size(); i++) { + ETHOSU_PMU_Set_EVTYPER(drv, i, config[i]); + ETHOSU_PMU_CNTR_Enable(drv, 1u << i); + } +} + +void Pmu::clear() { + ETHOSU_PMU_CYCCNT_Reset(drv); + ETHOSU_PMU_EVCNTR_ALL_Reset(drv); +} + +void Pmu::print() { + printf("PMU={cycleCount=%llu, events=[%" PRIu32 ", %" PRIu32 ", %" PRIu32 ", %" PRIu32 "]}\n", + ETHOSU_PMU_Get_CCNTR(drv), + ETHOSU_PMU_Get_EVCNTR(drv, 0), + ETHOSU_PMU_Get_EVCNTR(drv, 1), + ETHOSU_PMU_Get_EVCNTR(drv, 2), + ETHOSU_PMU_Get_EVCNTR(drv, 3)); +} + +uint64_t Pmu::getCycleCount() const { + return ETHOSU_PMU_Get_CCNTR(drv); +} + +uint32_t Pmu::getEventCount(size_t index) const { + return ETHOSU_PMU_Get_EVCNTR(drv, index); +} + +/**************************************************************************** + * CommandStream + ****************************************************************************/ + +CommandStream::CommandStream(const DataPointer &_commandStream, + const BasePointers &_basePointers, + const PmuEvents &_pmuEvents) : + drv(ethosu_reserve_driver()), + commandStream(_commandStream), basePointers(_basePointers), pmu(drv, _pmuEvents) {} + +CommandStream::~CommandStream() { + ethosu_release_driver(drv); +} + +int CommandStream::run(size_t repeat) { + // Base pointer array + uint64_t baseAddress[ETHOSU_BASEP_INDEXES]; + size_t baseAddressSize[ETHOSU_BASEP_INDEXES]; + for (size_t i = 0; i < ETHOSU_BASEP_INDEXES; i++) { + baseAddress[i] = reinterpret_cast(basePointers[i].data); + baseAddressSize[i] = reinterpret_cast(basePointers[i].size); + } + + while (repeat-- > 0) { + int error = ethosu_invoke_v3( + drv, commandStream.data, commandStream.size, baseAddress, baseAddressSize, ETHOSU_BASEP_INDEXES, nullptr); + + if (error != 0) { + printf("Inference failed. 
error=%d\n", error); + return 1; + } + } + + return 0; +} + +int CommandStream::run_async() { + // Base pointer array + uint64_t baseAddress[ETHOSU_BASEP_INDEXES]; + size_t baseAddressSize[ETHOSU_BASEP_INDEXES]; + + for (size_t i = 0; i < ETHOSU_BASEP_INDEXES; i++) { + baseAddress[i] = reinterpret_cast(basePointers[i].data); + baseAddressSize[i] = reinterpret_cast(basePointers[i].size); + } + + int error = ethosu_invoke_async( + drv, commandStream.data, commandStream.size, baseAddress, baseAddressSize, ETHOSU_BASEP_INDEXES, nullptr); + + if (error != 0) { + printf("Inference invoke async failed. error=%d\n", error); + return 1; + } + + return 0; +} + +int CommandStream::wait_async(bool block) { + return ethosu_wait(drv, block); +} + +DataPointer &CommandStream::getCommandStream() { + return commandStream; +} + +BasePointers &CommandStream::getBasePointers() { + return basePointers; +} + +Pmu &CommandStream::getPmu() { + return pmu; +} + +}; // namespace CommandStream +}; // namespace EthosU diff --git a/backends/arm/runtime/command_stream.hpp b/backends/arm/runtime/command_stream.hpp new file mode 100644 index 00000000000..7163b9d58ca --- /dev/null +++ b/backends/arm/runtime/command_stream.hpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef COMMAND_STREAM_HPP +#define COMMAND_STREAM_HPP + +/**************************************************************************** + * Includes + ****************************************************************************/ + +#include +#include +#include +#include + +/**************************************************************************** + * Defines + ****************************************************************************/ + +#ifndef ETHOSU_BASEP_INDEXES +#define ETHOSU_BASEP_INDEXES 8 +#endif + +/**************************************************************************** + * Types + ****************************************************************************/ + +namespace EthosU { +namespace CommandStream { + +/**************************************************************************** + * DataPointer + ****************************************************************************/ + +struct DataPointer { + DataPointer(); + DataPointer(const char *_data, size_t _size); + + bool operator!=(const DataPointer &other); + + const char *data; + size_t size; +}; + +/**************************************************************************** + * Pmu + ****************************************************************************/ + +using PmuEvents = std::array; + +class Pmu { +public: + Pmu(ethosu_driver *_drv, const PmuEvents &_config = {}); + + void clear(); + void print(); + + uint64_t getCycleCount() const; + uint32_t getEventCount(size_t index) const; + +private: + ethosu_driver *drv; + PmuEvents config; +}; + +/**************************************************************************** + * CommandStream + ****************************************************************************/ + +using BasePointers = std::array; + +class CommandStream { +public: + CommandStream(const DataPointer &_commandStream, + const BasePointers &_pointers = {}, + const PmuEvents &_pmuEvents = {}); + virtual ~CommandStream(); + + int run(size_t repeat = 1); + int 
run_async(); + int wait_async(bool block = true); + + DataPointer &getCommandStream(); + BasePointers &getBasePointers(); + Pmu &getPmu(); + +private: + ethosu_driver *drv; + DataPointer commandStream; + BasePointers basePointers; + Pmu pmu; +}; + +#define DRIVER_ACTION_MAGIC() 'C', 'O', 'P', '1', + +#define DRIVER_ACTION_COMMAND_STREAM(length) 0x02, ((length) >> 16) & 0xff, (length) & 0xff, ((length) >> 8) & 0xff, + +#define DRIVER_ACTION_NOP() 0x05, 0x00, 0x00, 0x00, + +#define NPU_OP_STOP(mask) ((mask) >> 8) & 0xff, (mask) & 0xff, 0x08, 0x00, + +}; // namespace CommandStream +}; // namespace EthosU + +#endif /* COMMAND_STREAM_HPP */ From 1f62accbf566cbecbe5daf2ac61c42d089c59430 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Wed, 27 Sep 2023 15:40:36 +0000 Subject: [PATCH 04/20] fixed builds of ethos-u-core-driver * Added cmsis submodule dependency * Added command_stream.cpp into the build * Added the target to the build script Signed-off-by: Rob Elliott --- .gitmodules | 3 +++ backends/arm/CMakeLists.txt | 2 +- backends/arm/cmake/build.sh | 4 +++- backends/arm/third-party/cmsis | 1 + 4 files changed, 8 insertions(+), 2 deletions(-) create mode 160000 backends/arm/third-party/cmsis diff --git a/.gitmodules b/.gitmodules index aac8050326d..3138391f7c0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -34,3 +34,6 @@ [submodule "backends/arm/third-party/ethos-u-core-driver"] path = backends/arm/third-party/ethos-u-core-driver url = https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git +[submodule "backends/arm/third-party/cmsis"] + path = backends/arm/third-party/cmsis + url = https://github.com/ARM-software/CMSIS_5.git diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt index 2cc5cf94740..6d6cd1938b7 100644 --- a/backends/arm/CMakeLists.txt +++ b/backends/arm/CMakeLists.txt @@ -18,7 +18,7 @@ set(_common_compile_options -Wno-deprecated-declarations) include(cmake/Dependencies.cmake) -set(_arm_baremetal_sources 
backends/arm/runtime/ArmBackendEthosU.cpp) +set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp backends/arm/runtime/command_stream.cpp) list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/") add_library(ethos_u STATIC ${_arm_baremetal_sources}) target_include_directories(ethos_u PUBLIC ${_common_include_directories}) diff --git a/backends/arm/cmake/build.sh b/backends/arm/cmake/build.sh index 490a358c9a1..353b90125c6 100755 --- a/backends/arm/cmake/build.sh +++ b/backends/arm/cmake/build.sh @@ -39,9 +39,11 @@ cd cmake-corstone cmake -DFLATC_EXECUTABLE=flatc \ -DEXECUTORCH_BUILD_HOST_TARGETS=OFF \ -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ + -DTARGET_CPU=cortex-m55+nodsp+nofp \ + -DETHOSU_TARGET_NPU_CONFIG=ethos-u55-128 \ --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake \ .. # -DCMAKE_TOOLCHAIN_FILE=backends/arm/cmake/arm-none-eabi-gcc.cmake \ cd .. -cmake --build cmake-corstone -j1 --target ethos_u +cmake --build cmake-corstone -j1 --target ethos_u ethosu_core_driver diff --git a/backends/arm/third-party/cmsis b/backends/arm/third-party/cmsis new file mode 160000 index 00000000000..a75f01746df --- /dev/null +++ b/backends/arm/third-party/cmsis @@ -0,0 +1 @@ +Subproject commit a75f01746df18bb5b929dfb8dc6c9407fac3a0f3 From 83a5e32b09a858c02590b6e348d5c06e35d0b866 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 28 Sep 2023 10:57:12 +0000 Subject: [PATCH 05/20] Emit Ethos-U55 chunked binaries from preprocess * vela binaries returned from preprocess * Included in PTE captured from arm_tosa_e2e * currently assumes vela is on path Signed-off-by: Rob Elliott --- backends/arm/arm_backend.py | 52 +++++++++++++++++++++++++++++++----- examples/arm/arm_tosa_e2e.py | 2 +- 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 1a6499cf07d..1ec28bdeb21 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -13,6 +13,7 @@ import operator 
import os import tempfile +import subprocess from typing import final, List import numpy as np @@ -140,6 +141,47 @@ def dbg_tosa_dump(tosa_fb, path): f.write(js) f.close() +# Output to Vela with current file-based compilation +# WARNING: if this changes, the runtime reader also needs to change +def vela_compile(tosa_fb): + with tempfile.TemporaryDirectory() as tmpdir: + print(f"compiling to Vela in {tmpdir}") + + tosaname = "out.tosa" + flatbuffer = tosa_fb.serialize() + f = open(os.path.join(tmpdir,tosaname), "wb") + f.write(flatbuffer) + f.close() + + # invoke vela + # TODO target ethos-u55-128 + vela_command = f"cd {tmpdir}; vela --accelerator-config ethos-u55-128 {tosaname}" + subprocess.run([vela_command], shell=True, check=True) + + np_path = os.path.join(tmpdir,"output","out_sg0_vela.npz") + blocks = b'' + with np.load(np_path, allow_pickle=False) as data: + # Emit the NPZ regions as: + # - 16 byte block name null terminated string (padded to 16 if name shorter) + # - 4 byes of int32 block length and 12 bytes of 0's + # - block data (padded to 16 byte alignment at end) + # Repeat for all blocks + for key in data.keys(): + block_name = bytes(key,"utf8")[:15] + block_name = block_name + b'\x00'*(16-len(block_name)) + block_data = data[key].tobytes() + # We need the acual unpadded block lengths for hw setup + block_length = len(block_data).to_bytes(16, 'little') + # pad block data to multiple of 16 bytes + block_data = block_data + b'\x00'*(16-len(block_data)%16) + + block = block_name + block_length + block_data + blocks = blocks + block + + # return 16 byte VELA bin header + blocks + footer + header = bytes("vela_bin_stream","utf-8") + b'\x00' + footer = bytes("vela_end_stream","utf-8") + b'\x00' + return header + blocks + footer def dbg_fail(node, tosa_fb, path): dbg_tosa_dump(tosa_fb, path) @@ -205,10 +247,6 @@ def preprocess( # noqa: C901 path = spec.value.decode() debug_output = True - # in non debug builds we still pass files to vela - if path is None: - 
path = tempfile.mkdtemp(prefix="arm_tosa_") - # Converted output for this subgraph, serializer needs path early as it emits # const data directly. Path created and data written only in debug builds. tosa_fb = ts.TosaSerializer(path) @@ -680,5 +718,7 @@ def preprocess( # noqa: C901 dbg_tosa_dump(tosa_fb, path) # Serialize and return the tosa flatbuffer - fb = tosa_fb.serialize() - return PreprocessResult(processed_bytes=bytes(fb)) + # fb = bytes(tosa_fb.serialize()) + binary = vela_compile(tosa_fb) + + return PreprocessResult(processed_bytes=binary) diff --git a/examples/arm/arm_tosa_e2e.py b/examples/arm/arm_tosa_e2e.py index e320ca0cf4e..a9e07bed4c9 100644 --- a/examples/arm/arm_tosa_e2e.py +++ b/examples/arm/arm_tosa_e2e.py @@ -153,7 +153,7 @@ def tosa_run_test(op, profile=TosaProfile.MI): # noqa: C901 # Temp systest mode for running all models against both inference profiles if __name__ == "__main__": for op in TestList: - tosa_run_test(op, profile=TosaProfile.MI) + tosa_run_test(op, profile=TosaProfile.BI) # TODO: haven't added the quantized lowerings for BI, comment out for now # for op in TestList: From 93cfdc260ba0a266ed136a36af31c6245edcc953 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 28 Sep 2023 13:05:22 +0000 Subject: [PATCH 06/20] added executorch to the build targets Signed-off-by: Rob Elliott --- backends/arm/cmake/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/arm/cmake/build.sh b/backends/arm/cmake/build.sh index 353b90125c6..fd02a4d9b55 100755 --- a/backends/arm/cmake/build.sh +++ b/backends/arm/cmake/build.sh @@ -46,4 +46,4 @@ cmake -DFLATC_EXECUTABLE=flatc \ # -DCMAKE_TOOLCHAIN_FILE=backends/arm/cmake/arm-none-eabi-gcc.cmake \ cd .. 
-cmake --build cmake-corstone -j1 --target ethos_u ethosu_core_driver +cmake --build cmake-corstone -j1 --target ethos_u ethosu_core_driver executorch From a38f08012c1e10fa1ab29abd321a92c48a330eb9 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 28 Sep 2023 17:22:49 +0000 Subject: [PATCH 07/20] Extended the delegate to read 'vela_bin_stream's * added a scratch block in the vela_bin to preallocate it * added a vela_bin reading routine into ArmBackendEthosU * set pointers passed to vela based on vela_bin Signed-off-by: Rob Elliott --- backends/arm/arm_backend.py | 17 +++++ backends/arm/cmake/build.sh | 4 +- backends/arm/cmake/toolchain.sh | 3 +- backends/arm/runtime/ArmBackendEthosU.cpp | 78 ++++++++++++++++++----- 4 files changed, 83 insertions(+), 19 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 1ec28bdeb21..e1f07d0a266 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -178,6 +178,23 @@ def vela_compile(tosa_fb): block = block_name + block_length + block_data blocks = blocks + block + # Add a block for scratch, inputs and outputs + # scratch shape is a 1 element array giving us size in bytes + block_name = bytes("scratch_data","utf8")[:15] + block_name = block_name + b'\x00'*(16-len(block_name)) + block_length = data["scratch_shape"][0].item() + print(f"scratch length = {block_length}") + block_length = block_length+(15-(block_length-1)%16) + block_data = b'\x00'*block_length + block_length = block_length.to_bytes(16, 'little') + print(f"lengths {len(block_name)} {len(block_length)} {len(block_data)}") + block = block_name + block_length + block_data + blocks = blocks + block + # TODO are these already in scratch shape? 
look to be + #input_shape * input_elem_size + #output_shape * output_elem_size + # input_offset and output_offset specify the location these arrays are written from base of scratch + # return 16 byte VELA bin header + blocks + footer header = bytes("vela_bin_stream","utf-8") + b'\x00' footer = bytes("vela_end_stream","utf-8") + b'\x00' diff --git a/backends/arm/cmake/build.sh b/backends/arm/cmake/build.sh index fd02a4d9b55..d2bedd7769f 100755 --- a/backends/arm/cmake/build.sh +++ b/backends/arm/cmake/build.sh @@ -11,7 +11,9 @@ set -e BASEDIR=`realpath $(dirname "$0")` echo "building using build.sh in $BASEDIR" -GCCPATH=${BASEDIR}/arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi/bin/ +ARCH=$(uname -i) +GCCPATH=${BASEDIR}/arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi/bin/ + echo $GCCPATH if test -d "${GCCPATH}"; then echo Using exising compiler ${GCCPATH} diff --git a/backends/arm/cmake/toolchain.sh b/backends/arm/cmake/toolchain.sh index 7fd4abcc781..92188ee982d 100755 --- a/backends/arm/cmake/toolchain.sh +++ b/backends/arm/cmake/toolchain.sh @@ -6,6 +6,7 @@ set -e # Cross compiler for Arm baremetal (e.g. 
Corestone-300 FVP or silcon) -curl -o gcc.tar.xz https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi.tar.xz +ARCH=$(uname -i) +curl -o gcc.tar.xz https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi.tar.xz tar xf gcc.tar.xz export PATH=${PATH}:`(cd arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi/bin/; pwd)` diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index 186e3318f61..e7f791d2b9f 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -73,28 +73,24 @@ class ArmBackend final : public PyTorchBackendInterface { printf("ArmBackend::execute 0x%X\n", processed->data()); - // Command stream - we know at this point it's aligned - char *handle = (char*)processed->data(); - int command_stream_length = ((int*)handle)[0]; - char *command_stream = ethos_align(handle+sizeof(int)); - - // Static tensors/weights/model data - handle = ethos_align( command_stream + command_stream_length ); - int weight_data_length = ((int*)handle)[0]; - char *weight_data = ethos_align(handle+sizeof(int)); - - // Activation data, input and output memory - handle = ethos_align( weight_data + weight_data_length ); - int activation_data_length = ((int*)handle)[0]; - char *activation_data = ethos_align(handle+sizeof(int)); + vela_handles handles = { 0, 0, 0, 0, 0, 0}; + // Command stream - we know at this point it's aligned + char *data = (char*)processed->data(); + // Read key sections from the vela_bin_stream + this->vela_read( data, &handles ); + + printf("Running program data:\n cmd %p %d\n weight %p %d\n scratch %p %d\n", + handles.cmd_data, handles.cmd_data_length, + handles.weight_data, handles.weight_data_length, + handles.scratch_data, handles.scratch_data_length ); // Invoke driver using the above pointers CommandStream cs( - 
DataPointer(command_stream, command_stream_length), + DataPointer(handles.cmd_data, handles.cmd_data_length), BasePointers({ - DataPointer(weight_data, weight_data_length), - DataPointer(activation_data, activation_data_length) + DataPointer(handles.weight_data, handles.weight_data_length), + DataPointer(handles.scratch_data, handles.scratch_data_length) }), PmuEvents({ETHOSU_PMU_CYCLE, ETHOSU_PMU_NPU_IDLE, ETHOSU_PMU_NPU_ACTIVE}) ); @@ -118,6 +114,54 @@ class ArmBackend final : public PyTorchBackendInterface { return; } +private: + typedef struct { + const char *cmd_data; int cmd_data_length; + const char *weight_data; int weight_data_length; + const char *scratch_data; int scratch_data_length; + } vela_handles; + + int vela_read(char* data, vela_handles *h ) const { + if( strncmp( data, "vela_bin_stream", 15 ) ) return 0; + while( 1 ) + { + data += 16; + if( !strncmp( data, "vela_end_stream", 15 ) ) + { + printf("footer found!\n"); + return 1; + } + printf("reading block '%s':\n", data); + char *block_name = data; + data += 16; + int block_length = ((int*)data)[0]; + int block_length_padded = block_length + (15-(block_length-1)%16); + printf(" length %d\n", block_length ); + printf(" padded length %d\n", block_length_padded ); + char *block_data = data; + data += block_length_padded; + + if( !strncmp( block_name, "cmd_data", strlen("cmd_data")) ) + { + printf("Capturing cmd_data\n"); + h->cmd_data = block_data; + h->cmd_data_length = block_length; + } + if( !strncmp( block_name, "weight_data", strlen("weight_data")) ) + { + printf("Capturing weight_data\n"); + h->weight_data = block_data; + h->weight_data_length = block_length; + } + if( !strncmp( block_name, "scratch_data", strlen("scratch_data")) ) + { + printf("Capturing scratch_data\n"); + h->scratch_data = block_data; + h->scratch_data_length = block_length; + } + } + } + }; namespace { From 8de6e9260d827c2b0effbae175601c668978f97b Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Thu, 28 Sep 2023 12:33:44 
-0700 Subject: [PATCH 08/20] [ET][Portable] Add int types header --- kernels/portable/cpu/vec_ops.h | 1 + 1 file changed, 1 insertion(+) diff --git a/kernels/portable/cpu/vec_ops.h b/kernels/portable/cpu/vec_ops.h index 0373196a4b6..5a297026050 100644 --- a/kernels/portable/cpu/vec_ops.h +++ b/kernels/portable/cpu/vec_ops.h @@ -13,6 +13,7 @@ #include #include #include +#include /** * @file From 644eafc9bb5d7a9e7bb53b3cb972a0efb637601a Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Thu, 28 Sep 2023 12:33:10 -0700 Subject: [PATCH 09/20] [Executorch] Simplify FunctionRef to make it more portable Also helps with c++11 compliance if we have less c++20 code esp when we don't use it. --- runtime/core/function_ref.h | 51 ++++--------------------------------- 1 file changed, 5 insertions(+), 46 deletions(-) diff --git a/runtime/core/function_ref.h b/runtime/core/function_ref.h index 92171134291..a07f6151f10 100644 --- a/runtime/core/function_ref.h +++ b/runtime/core/function_ref.h @@ -59,9 +59,7 @@ class FunctionRef; template class FunctionRef { - Ret (*callback_)(const void* memory, Params... params) = nullptr; union Storage { - void* callable; Ret (*function)(Params...); } storage_; @@ -70,57 +68,18 @@ class FunctionRef { explicit FunctionRef(std::nullptr_t) {} /** - * Case 1: A callable object passed by lvalue reference. - * Taking rvalue reference is error prone because the object will be always - * be destroyed immediately. - */ - template < - typename Callable, - // This is not the copy-constructor. - typename std::enable_if< - !std::is_same, FunctionRef>::value, - int32_t>::type = 0, - // Avoid lvalue reference to non-capturing lambda. - typename std::enable_if< - !std::is_convertible::value, - int32_t>::type = 0, - // Functor must be callable and return a suitable type. - // To make this container type safe, we need to ensure either: - // 1. The return type is void. - // 2. 
Or the resulting type from calling the callable is convertible to - // the declared return type. - typename std::enable_if< - std::is_void::value || - std::is_convertible< - decltype(std::declval()(std::declval()...)), - Ret>::value, - int32_t>::type = 0> - explicit FunctionRef(Callable& callable) - : callback_([](const void* memory, Params... params) { - auto& storage = *static_cast(memory); - auto& callable = *static_cast(storage.callable); - return static_cast(callable(std::forward(params)...)); - }) { - storage_.callable = &callable; - } - - /** - * Case 2: A plain function pointer. + * Case 1: A plain function pointer. * Instead of storing an opaque pointer to underlying callable object, * store a function pointer directly. * Note that in the future a variant which coerces compatible function * pointers could be implemented by erasing the storage type. */ - /* implicit */ FunctionRef(Ret (*ptr)(Params...)) - : callback_([](const void* memory, Params... params) { - auto& storage = *static_cast(memory); - return storage.function(std::forward(params)...); - }) { + /* implicit */ FunctionRef(Ret (*ptr)(Params...)) { storage_.function = ptr; } /** - * Case 3: Implicit conversion from lambda to FunctionRef. + * Case 2: Implicit conversion from lambda to FunctionRef. * A common use pattern is like: * void foo(FunctionRef<...>) {...} * foo([](...){...}) @@ -144,11 +103,11 @@ class FunctionRef { : FunctionRef(static_cast(function)) {} Ret operator()(Params... 
params) const { - return callback_(&storage_, std::forward(params)...); + return storage_.function(std::forward(params)...); } explicit operator bool() const { - return callback_; + return storage_.function; } }; From b307c319a54e37f01dc5cc47e54906a5669b92d3 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Thu, 28 Sep 2023 12:40:28 -0700 Subject: [PATCH 10/20] [WIP] headrify pte --- headrify.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 headrify.py diff --git a/headrify.py b/headrify.py new file mode 100644 index 00000000000..cdae780c31c --- /dev/null +++ b/headrify.py @@ -0,0 +1,26 @@ +import binascii +bytes_per_line = 32 +hex_digits_per_line = bytes_per_line * 2 + +# copied from +# https://git.mlplatform.org/ml/ethos-u/ml-embedded-evaluation-kit.git/tree/scripts/py/gen_model_cpp.py + +magic_attr = '__attribute__((section(".sram.data"), aligned(16))) char' +# magic_attr = '__attribute__((section("network_model_sec"), aligned(16))) char' +# magic_attr = '__attribute__((section("input_data_sec"), aligned(16))) char' +filename="./add.pte" +with open(filename, "rb") as fr, open(f"{filename}.h", "w") as fw: + data = fr.read() + hexstream = binascii.hexlify(data).decode('utf-8') + + hexstring = magic_attr + ' add_pte[] = {' + + for i in range(0, len(hexstream), 2): + if 0 == (i % hex_digits_per_line): + hexstring += "\n" + hexstring += '0x' + hexstream[i:i+2] + ", " + + hexstring += '};\n' + fw.write(hexstring) + print(f"Wrote {len(hexstring)} bytes, original {len(data)}") + From 52dc73cc8d075d1d52bb24bee3705cdd84b889f0 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Thu, 28 Sep 2023 12:48:50 -0700 Subject: [PATCH 11/20] [NOT FOR LAND] Hacks for ET_LOG --- runtime/platform/target/Posix.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/runtime/platform/target/Posix.cpp b/runtime/platform/target/Posix.cpp index bc0f1d9f312..9f53964278a 100644 --- a/runtime/platform/target/Posix.cpp +++ 
b/runtime/platform/target/Posix.cpp @@ -52,11 +52,9 @@ #define _ASSERT_PAL_INITIALIZED() \ ({ \ if (!initialized) { \ - fprintf( \ - ET_LOG_OUTPUT_FILE, \ + printf( \ "ExecuTorch PAL must be initialized before call to %s()", \ __ET_FUNCTION); \ - fflush(ET_LOG_OUTPUT_FILE); \ et_pal_abort(); \ } \ }) @@ -144,8 +142,7 @@ void et_pal_emit_log_message( // // Clients who want to change the format or add other fields can override this // weak implementation of et_pal_emit_log_message. - fprintf( - ET_LOG_OUTPUT_FILE, + printf( "%c %02u:%02u:%02u.%06lu executorch:%s:%zu] %s\n", level, hour, @@ -155,5 +152,5 @@ void et_pal_emit_log_message( filename, line, message); - fflush(ET_LOG_OUTPUT_FILE); + // fflush(ET_LOG_OUTPUT_FILE); } From b2a431fc3c200ff0aa531fe5b7fd69a3cdf836b0 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Thu, 28 Sep 2023 12:32:02 -0700 Subject: [PATCH 12/20] [NOT FOR LAND] Hack op_add to reduce size --- kernels/portable/cpu/op_add.cpp | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index 1da9d0eaee5..1ec18b3775d 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -33,10 +33,15 @@ Tensor& add_out( ET_CHECK(canCast(common_type, out_type)); - ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "add", CTYPE_A, [&]() { - ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "add", CTYPE_B, [&]() { - ET_SWITCH_REAL_TYPES_AND(Bool, common_type, ctx, "add", CTYPE_IN, [&]() { - ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "add", CTYPE_OUT, [&]() { +// ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "add", CTYPE_A, [&]() { +// ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "add", CTYPE_B, [&]() { +// ET_SWITCH_REAL_TYPES_AND(Bool, common_type, ctx, "add", CTYPE_IN, [&]() { +// ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "add", CTYPE_OUT, [&]() { + + using CTYPE_A = float; + using CTYPE_B = float; + using CTYPE_IN = float; + using CTYPE_OUT = 
float; CTYPE_IN alpha_val; ET_EXTRACT_SCALAR(alpha, alpha_val); @@ -51,10 +56,10 @@ Tensor& add_out( a, b, out); - }); - }); - }); - }); +// }); +// }); +// }); +// }); return out; } From 77e8eb0ca970baa8f930009b8e6de5dada962ee6 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Thu, 28 Sep 2023 12:38:14 -0700 Subject: [PATCH 13/20] [NOT FOR LAND] Hacks for add minimal example --- examples/export/export_example.py | 3 + examples/models/toy_model/model.py | 3 - kernels/portable/functions.yaml | 695 ----------------------------- 3 files changed, 3 insertions(+), 698 deletions(-) diff --git a/examples/export/export_example.py b/examples/export/export_example.py index 9c2a9d9362e..e26d929aeac 100644 --- a/examples/export/export_example.py +++ b/examples/export/export_example.py @@ -12,6 +12,7 @@ from ..models import MODEL_NAME_TO_MODEL from ..models.model_factory import EagerModelFactory from .utils import export_to_exec_prog, save_pte_program +from executorch.exir.print_program import pretty_print, print_program # noqa FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" @@ -40,4 +41,6 @@ ) prog = export_to_exec_prog(model, example_inputs) + + pretty_print(prog.program.execution_plan) save_pte_program(prog.buffer, args.model_name) diff --git a/examples/models/toy_model/model.py b/examples/models/toy_model/model.py index 0f7131fe21c..1c8f9f3b590 100644 --- a/examples/models/toy_model/model.py +++ b/examples/models/toy_model/model.py @@ -45,9 +45,6 @@ def __init__(self): def forward(self, x, y): z = x + y - z = z + x - z = z + x - z = z + z return z def get_eager_model(self) -> torch.nn.Module: diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 949b771b9cc..6e31dbe4939 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -17,702 +17,7 @@ # See the README.md file in this directory for a description of the syntax used # by this file. 
-- op: _log_softmax.out - kernels: - - arg_meta: null - kernel_name: torch::executor::log_softmax_out - -- op: _native_batch_norm_legit_no_training.out - kernels: - - arg_meta: null - kernel_name: torch::executor::_native_batch_norm_legit_no_training_out - -- op: _softmax.out - kernels: - - arg_meta: null - kernel_name: torch::executor::softmax_out - -- op: _to_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::to_copy_out - -- op: abs.out - kernels: - - arg_meta: null - kernel_name: torch::executor::abs_out - -- op: acos.out - kernels: - - arg_meta: null - kernel_name: torch::executor::acos_out - -- op: acosh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::acosh_out - - op: add.out kernels: - arg_meta: null kernel_name: torch::executor::add_out - -- op: add.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::add_scalar_out - -- op: addmm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::addmm_out - -- op: alias_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::alias_copy_out - -- op: amax.out - kernels: - - arg_meta: null - kernel_name: torch::executor::amax_out - -- op: amin.out - kernels: - - arg_meta: null - kernel_name: torch::executor::amin_out - -- op: any.all_out - kernels: - - arg_meta: null - kernel_name: torch::executor::any_all_out - -- op: arange.out - kernels: - - arg_meta: null - kernel_name: torch::executor::arange_out - -- op: arange.start_out - kernels: - - arg_meta: null - kernel_name: torch::executor::arange_start_out - -- op: argmax.out - kernels: - - arg_meta: null - kernel_name: torch::executor::argmax_out - -- op: argmin.out - kernels: - - arg_meta: null - kernel_name: torch::executor::argmin_out - -- op: as_strided_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::as_strided_copy_out - -- op: asin.out - kernels: - - arg_meta: null - kernel_name: torch::executor::asin_out - -- op: asinh.out - kernels: - - arg_meta: null - 
kernel_name: torch::executor::asinh_out - -- op: atan.out - kernels: - - arg_meta: null - kernel_name: torch::executor::atan_out - -- op: atanh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::atanh_out - -- op: avg_pool2d.out - kernels: - - arg_meta: null - kernel_name: torch::executor::avg_pool2d_out - -- op: bitwise_and.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_and_Scalar_out - -- op: bitwise_and.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_and_Tensor_out - -- op: bitwise_not.out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_not_out - -- op: bitwise_or.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_or_Scalar_out - -- op: bitwise_or.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_or_Tensor_out - -- op: bitwise_xor.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_xor_Scalar_out - -- op: bitwise_xor.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_xor_Tensor_out - -- op: bmm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::bmm_out - -- op: cat.out - kernels: - - arg_meta: null - kernel_name: torch::executor::cat_out - -- op: ceil.out - kernels: - - arg_meta: null - kernel_name: torch::executor::ceil_out - -- op: clamp.out - cpp_no_default_args: ['min'] - kernels: - - arg_meta: null - kernel_name: torch::executor::clamp_out - -- op: clone.out - kernels: - - arg_meta: null - kernel_name: torch::executor::clone_out - -- op: constant_pad_nd.out - kernels: - - arg_meta: null - kernel_name: torch::executor::constant_pad_nd_out - -- op: convolution.out - kernels: - - arg_meta: null - kernel_name: torch::executor::convolution_out - -- op: copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::copy_out - -- op: cos.out - kernels: - - arg_meta: null - kernel_name: torch::executor::cos_out - -- op: 
cosh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::cosh_out - -- op: cumsum.out - kernels: - - arg_meta: null - kernel_name: torch::executor::cumsum_out - -- op: detach_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::detach_copy_out - -- op: div.out - kernels: - - arg_meta: null - kernel_name: torch::executor::div_out - -- op: div.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::div_scalar_out - -- op: div.out_mode - kernels: - - arg_meta: null - kernel_name: torch::executor::div_out_mode - - -- op: embedding.out - kernels: - - arg_meta: null - kernel_name: torch::executor::embedding_out - -- op: empty.out - kernels: - - arg_meta: null - kernel_name: torch::executor::empty_out - -- op: eq.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::eq_scalar_out - -- op: erf.out - kernels: - - arg_meta: null - kernel_name: torch::executor::erf_out - -- op: exp.out - kernels: - - arg_meta: null - kernel_name: torch::executor::exp_out - -- op: expand_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::expand_copy_out - -- op: fill.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::fill_scalar_out - -- op: fill.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::fill_tensor_out - -- op: floor.out - kernels: - - arg_meta: null - kernel_name: torch::executor::floor_out - -- op: floor_divide.out - kernels: - - arg_meta: null - kernel_name: torch::executor::floor_divide_out - -- op: fmod.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::fmod_Tensor_out - -- op: fmod.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::fmod_Scalar_out - -- op: full.out - kernels: - - arg_meta: null - kernel_name: torch::executor::full_out - -# TODO: Investigate why empty dispatch is required for building: -# buck2 build //executorch/kernels/portable:generated_lib -- op: full_like.out - dispatch: {} - kernels: 
- - arg_meta: null - kernel_name: torch::executor::full_like_out - -- op: ge.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::ge_scalar_out - -- op: ge.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::ge_tensor_out - -- op: gelu.out - kernels: - - arg_meta: null - kernel_name: torch::executor::gelu_out - -- op: glu.out - kernels: - - arg_meta: null - kernel_name: torch::executor::glu_out - -- op: gt.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::gt_scalar_out - -- op: gt.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::gt_tensor_out - -- op: hardtanh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::hardtanh_out - -- op: index.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::index_Tensor_out - -- op: index_put.out - kernels: - - arg_meta: null - kernel_name: torch::executor::index_put_out - -- op: index_select.out - kernels: - - arg_meta: null - kernel_name: torch::executor::index_select_out - -- op: isinf.out - kernels: - - arg_meta: null - kernel_name: torch::executor::isinf_out - -- op: isnan.out - kernels: - - arg_meta: null - kernel_name: torch::executor::isnan_out - -- op: le.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::le_scalar_out - -- op: le.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::le_tensor_out - -- op: leaky_relu.out - kernels: - - arg_meta: null - kernel_name: torch::executor::leaky_relu_out - -- op: lift_fresh_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::lift_fresh_copy_out - -- op: log.out - kernels: - - arg_meta: null - kernel_name: torch::executor::log_out - -- op: logical_and.out - kernels: - - arg_meta: null - kernel_name: torch::executor::logical_and_out - -- op: logical_not.out - kernels: - - arg_meta: null - kernel_name: torch::executor::logical_not_out - -- op: logical_or.out - kernels: - - arg_meta: null - kernel_name: 
torch::executor::logical_or_out - -- op: logical_xor.out - kernels: - - arg_meta: null - kernel_name: torch::executor::logical_xor_out - -- op: logit.out - kernels: - - arg_meta: null - kernel_name: torch::executor::logit_out - -- op: lt.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::lt_scalar_out - -- op: lt.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::lt_tensor_out - -- op: masked_fill.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::masked_fill_scalar_out - -- op: max.dim_max - kernels: - - arg_meta: null - kernel_name: torch::executor::max_out - -- op: max_pool2d_with_indices.out - kernels: - - arg_meta: null - kernel_name: torch::executor::max_pool2d_with_indices_out - -- op: mean.out - kernels: - - arg_meta: null - kernel_name: torch::executor::mean_dim_out - -- op: min.dim_min - kernels: - - arg_meta: null - kernel_name: torch::executor::min_out - -- op: minimum.out - kernels: - - arg_meta: null - kernel_name: torch::executor::minimum_out - -- op: mm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::mm_out - -- op: mul.out - kernels: - - arg_meta: null - kernel_name: torch::executor::mul_out - -- op: mul.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::mul_scalar_out - -- op: native_layer_norm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::native_layer_norm_out - -- op: ne.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::ne_scalar_out - -- op: ne.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::ne_tensor_out - -- op: neg.out - kernels: - - arg_meta: null - kernel_name: torch::executor::neg_out - -- op: nonzero.out - kernels: - - arg_meta: null - kernel_name: torch::executor::nonzero_out - -- op: ones.out - kernels: - - arg_meta: null - kernel_name: torch::executor::ones_out - -- op: permute_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::permute_copy_out 
- -- op: pixel_shuffle.out - kernels: - - arg_meta: null - kernel_name: torch::executor::pixel_shuffle_out - -- op: pow.Tensor_Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::pow_Tensor_Scalar_out - -- op: pow.Tensor_Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::pow_Tensor_Tensor_out - -- op: reciprocal.out - kernels: - - arg_meta: null - kernel_name: torch::executor::reciprocal_out - -- op: relu.out - kernels: - - arg_meta: null - kernel_name: torch::executor::relu_out - -- op: remainder.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::remainder_Tensor_out - -- op: remainder.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::remainder_Scalar_out - -- op: repeat.out - kernels: - - arg_meta: null - kernel_name: torch::executor::repeat_out - -- op: round.out - kernels: - - arg_meta: null - kernel_name: torch::executor::round_out - -- op: rsqrt.out - kernels: - - arg_meta: null - kernel_name: torch::executor::rsqrt_out - -- op: rsub.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::rsub_scalar_out - -- op: scalar_tensor.out - kernels: - - arg_meta: null - kernel_name: torch::executor::scalar_tensor_out - -- op: scatter_add.out - kernels: - - arg_meta: null - kernel_name: torch::executor::scatter_add_out - -- op: select_copy.int_out - kernels: - - arg_meta: null - kernel_name: torch::executor::select_copy_int_out - -- op: select_scatter.out - kernels: - - arg_meta: null - kernel_name: torch::executor::select_scatter_out - -- op: sigmoid.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sigmoid_out - -- op: sign.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sign_out - -- op: sin.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sin_out - -- op: sinh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sinh_out - -- op: slice_copy.Tensor_out - kernels: - - arg_meta: null - kernel_name: 
torch::executor::slice_copy_Tensor_out - -- op: slice_scatter.out - kernels: - - arg_meta: null - kernel_name: torch::executor::slice_scatter_out - -- op: split_copy.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::split_copy_Tensor_out - -- op: sqrt.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sqrt_out - -- op: squeeze_copy.dim_out - kernels: - - arg_meta: null - kernel_name: torch::executor::squeeze_copy_dim_out - -- op: stack.out - kernels: - - arg_meta: null - kernel_name: torch::executor::stack_out - -- op: sub.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sub_out - -- op: sub.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::sub_scalar_out - -- op: sum.IntList_out - kernels: - - arg_meta: null - kernel_name: torch::executor::sum_dim_out - -- op: t_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::t_copy_out - -- op: tan.out - kernels: - - arg_meta: null - kernel_name: torch::executor::tan_out - -- op: tanh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::tanh_out - -- op: transpose_copy.int_out - kernels: - - arg_meta: null - kernel_name: torch::executor::transpose_copy_int_out - -- op: tril.out - kernels: - - arg_meta: null - kernel_name: torch::executor::tril_out - -- op: unbind_copy.int_out - kernels: - - arg_meta: null - kernel_name: torch::executor::unbind_copy_int_out - -- op: unsqueeze_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::unsqueeze_copy_out - -- op: var.out - kernels: - - arg_meta: null - kernel_name: torch::executor::var_out - -- op: view_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::view_copy_out - -- op: where.self_out - kernels: - - arg_meta: null - kernel_name: torch::executor::where_out - -- op: zeros.out - kernels: - - arg_meta: null - kernel_name: torch::executor::zeros_out From 9b244b31afdabcddd7e5e9c2b767d7dc159504b3 Mon Sep 17 00:00:00 2001 From: Digant Desai 
Date: Thu, 28 Sep 2023 13:08:41 -0700 Subject: [PATCH 14/20] [NOT FOR LAND] HACK for manual kernel registration --- codegen/templates/RegisterCodegenUnboxedKernels.cpp | 13 ++++++++----- manual.h | 5 +++++ 2 files changed, 13 insertions(+), 5 deletions(-) create mode 100644 manual.h diff --git a/codegen/templates/RegisterCodegenUnboxedKernels.cpp b/codegen/templates/RegisterCodegenUnboxedKernels.cpp index a7790be7fed..86938d065b8 100644 --- a/codegen/templates/RegisterCodegenUnboxedKernels.cpp +++ b/codegen/templates/RegisterCodegenUnboxedKernels.cpp @@ -11,6 +11,8 @@ #include #include #include "${fn_header}" // Generated Function import headers +#include + // ${generated_comment} // NOTE [Sharded File]: This file is generated in a sharded fashion to speed up @@ -24,8 +26,6 @@ using KernelArrayRef = ::torch::executor::ArrayRef<::torch::executor::Kernel>; namespace torch { namespace executor { -namespace function { -namespace { static Kernel kernels_to_register[] = { ${unboxed_kernels} // Generated kernels @@ -39,8 +39,11 @@ static KernelArrayRef kernel_array_ref( // Return value not used. Keep the static variable assignment to register // kernels in static initialization time. 
-static auto success_with_kernel_reg = register_kernels(kernel_array_ref); -} // namespace -} // namespace function +// static auto success_with_kernel_reg = register_kernels(kernel_array_ref); + +void manual_override() { + static auto success_with_kernel_reg = register_kernels(kernel_array_ref); +} + } // namespace executor } // namespace torch diff --git a/manual.h b/manual.h new file mode 100644 index 00000000000..eaee9a15407 --- /dev/null +++ b/manual.h @@ -0,0 +1,5 @@ +namespace torch { +namespace executor { + void manual_override(); + void digant_add_out(torch::executor::KernelRuntimeContext & context, EValue** stack); +}} From 1cabc63d8ce1e339dcd61bb5919386772b7e9477 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Thu, 28 Sep 2023 22:32:47 -0700 Subject: [PATCH 15/20] [NOT FOR LAND] Allow enabling logging in Release mode --- CMakeLists.txt | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c384800b52..1678bc2d8a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,8 +59,18 @@ endif() # - targets in the current directory, before and after this command is invoked # - targets in sub-directories added after this command is invoked if(CMAKE_BUILD_TYPE STREQUAL "Release") + # To enable logging in Release mode + option( + EXECUTORCH_ENABLE_LOGGING_RELEASE_MODE + "Enable logging in release mode" OFF) + + set(_ET_LOG_ENABLE 0) + if (${EXECUTORCH_ENABLE_LOGGING_RELEASE_MODE}) + set(_ET_LOG_ENABLE 1) + endif() + # Avoid pulling in the logging strings, which can be large. - add_definitions(-DET_LOG_ENABLED=0) + add_definitions(-DET_LOG_ENABLED=${_ET_LOG_ENABLE}) # Avoid pulling in the flatbuffer data verification # logic, which can add about 20kB. 
add_definitions(-DET_ENABLE_PROGRAM_VERIFICATION=0) From 2f10ee3ebe522ec7a8e8b76a2135502dfd0d3031 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Thu, 28 Sep 2023 20:24:49 -0700 Subject: [PATCH 16/20] [NOT FOR LAND][arm] setup for core_platform --- ...regress-cmake-version-from-3.21-3.20.patch | 25 ++ ...disable-warnings-to-reduce-verbosity.patch | 52 ++++ ...0003-HACK-Add-Executorch-add-example.patch | 224 ++++++++++++++++++ examples/arm/cs300/setup.sh | 28 +++ 4 files changed, 329 insertions(+) create mode 100644 examples/arm/cs300/core_platform/patches/0001-HACK-regress-cmake-version-from-3.21-3.20.patch create mode 100644 examples/arm/cs300/core_platform/patches/0002-HACK-disable-warnings-to-reduce-verbosity.patch create mode 100644 examples/arm/cs300/core_platform/patches/0003-HACK-Add-Executorch-add-example.patch create mode 100755 examples/arm/cs300/setup.sh diff --git a/examples/arm/cs300/core_platform/patches/0001-HACK-regress-cmake-version-from-3.21-3.20.patch b/examples/arm/cs300/core_platform/patches/0001-HACK-regress-cmake-version-from-3.21-3.20.patch new file mode 100644 index 00000000000..efb02478229 --- /dev/null +++ b/examples/arm/cs300/core_platform/patches/0001-HACK-regress-cmake-version-from-3.21-3.20.patch @@ -0,0 +1,25 @@ +From a969839b90756b2458cb80ac5edb619e87210bea Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Thu, 28 Sep 2023 18:05:03 -0700 +Subject: [PATCH 1/3] [HACK] regress cmake version from 3.21 --> 3.20 + +--- + targets/corstone-300/CMakeLists.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/targets/corstone-300/CMakeLists.txt b/targets/corstone-300/CMakeLists.txt +index 62205bb..7dda8a1 100644 +--- a/targets/corstone-300/CMakeLists.txt ++++ b/targets/corstone-300/CMakeLists.txt +@@ -42,7 +42,7 @@ set(MEMORY_ARENA "dram" CACHE STRING "Memory config for arena") + # Project + ############################################################################# + +-cmake_minimum_required(VERSION 3.21) 
++cmake_minimum_required(VERSION 3.20) + + project(ethos-u-corstone-300 VERSION 0.0.1) + +-- +2.39.3 + diff --git a/examples/arm/cs300/core_platform/patches/0002-HACK-disable-warnings-to-reduce-verbosity.patch b/examples/arm/cs300/core_platform/patches/0002-HACK-disable-warnings-to-reduce-verbosity.patch new file mode 100644 index 00000000000..f2a6e17ccd8 --- /dev/null +++ b/examples/arm/cs300/core_platform/patches/0002-HACK-disable-warnings-to-reduce-verbosity.patch @@ -0,0 +1,52 @@ +From 3687c49c2ca85ca8a7d554b1206272870c565de3 Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Thu, 28 Sep 2023 18:05:30 -0700 +Subject: [PATCH 2/3] [HACK] disable warnings to reduce verbosity + +--- + cmake/toolchain/arm-none-eabi-gcc.cmake | 28 ++++++++++++------------- + 1 file changed, 14 insertions(+), 14 deletions(-) + +diff --git a/cmake/toolchain/arm-none-eabi-gcc.cmake b/cmake/toolchain/arm-none-eabi-gcc.cmake +index 093005e..0e6a2ed 100644 +--- a/cmake/toolchain/arm-none-eabi-gcc.cmake ++++ b/cmake/toolchain/arm-none-eabi-gcc.cmake +@@ -85,21 +85,21 @@ add_link_options(LINKER:--nmagic,--gc-sections) + + # Compilation warnings + add_compile_options( +- -Wall +- -Wextra ++ # -Wall ++ # -Wextra + +- -Wcast-align +- -Wdouble-promotion +- -Wformat +- -Wmissing-field-initializers +- -Wnull-dereference +- -Wredundant-decls +- -Wshadow +- -Wswitch +- -Wswitch-default +- -Wunused ++ # -Wcast-align ++ # -Wdouble-promotion ++ # -Wformat ++ # -Wmissing-field-initializers ++ # -Wnull-dereference ++ # -Wredundant-decls ++ # -Wshadow ++ # -Wswitch ++ # -Wswitch-default ++ # -Wunused + +- -Wno-redundant-decls ++ # -Wno-redundant-decls + +- -Wno-psabi ++ # -Wno-psabi + ) +-- +2.39.3 + diff --git a/examples/arm/cs300/core_platform/patches/0003-HACK-Add-Executorch-add-example.patch b/examples/arm/cs300/core_platform/patches/0003-HACK-Add-Executorch-add-example.patch new file mode 100644 index 00000000000..9a0b0be554e --- /dev/null +++ 
b/examples/arm/cs300/core_platform/patches/0003-HACK-Add-Executorch-add-example.patch @@ -0,0 +1,224 @@ +From b5369c873814d765276a746ce26d2be5724da8f8 Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Thu, 28 Sep 2023 19:07:51 -0700 +Subject: [PATCH 3/3] [HACK] Add Executorch add example + +--- + applications/CMakeLists.txt | 2 + + applications/executorch_tests/CMakeLists.txt | 53 ++++++++ + applications/executorch_tests/add.cpp | 130 +++++++++++++++++++ + 3 files changed, 185 insertions(+) + create mode 100644 applications/executorch_tests/CMakeLists.txt + create mode 100644 applications/executorch_tests/add.cpp + +diff --git a/applications/CMakeLists.txt b/applications/CMakeLists.txt +index 1fa2b2e..68e5427 100644 +--- a/applications/CMakeLists.txt ++++ b/applications/CMakeLists.txt +@@ -28,6 +28,8 @@ add_subdirectory(threadx_demo) + + add_subdirectory(message_handler_openamp) + ++add_subdirectory(executorch_tests) ++ + if (CMAKE_CXX_COMPILER_ID STREQUAL "ARMClang") + # Only armclang supported for now + add_subdirectory(trustzone_inference) +diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt +new file mode 100644 +index 0000000..8a34c44 +--- /dev/null ++++ b/applications/executorch_tests/CMakeLists.txt +@@ -0,0 +1,53 @@ ++# ++# Copyright (c) 2021 Arm Limited. All rights reserved. ++# ++# SPDX-License-Identifier: Apache-2.0 ++# ++# Licensed under the Apache License, Version 2.0 (the License); you may ++# not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an AS IS BASIS, WITHOUT ++# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++# ++ ++if (NOT TARGET ethosu_core_driver) ++ return() ++endif() ++ ++#### ++#### Executorch demo app/test ++#### ++ ++set(ET_DIR_PATH "<..>/executorch" CACHE PATH "Path to Executorch dir") ++set(ET_BUILD_DIR_PATH "${ET_DIR_PATH}/cmake-out" CACHE PATH "Path to Executorch build dir") ++set(ET_INCLUDE_PATH "${ET_DIR_PATH}/.." CACHE PATH "Path to Executorch headers") ++ ++get_filename_component(ET_BUILD_DIR_PATH ${ET_BUILD_DIR_PATH} REALPATH) ++get_filename_component(ET_DIR_PATH ${ET_DIR_PATH} REALPATH) ++get_filename_component(ET_INCLUDE_PATH ${ET_INCLUDE_PATH} REALPATH) ++ ++message("**********************") ++message("Executorch dir (ET_DIR_PATH) : ${ET_DIR_PATH}") ++message("Executorch build dir(ET_BUILD_DIR_PATH): ${ET_BUILD_DIR_PATH}") ++message("Executorch headers (ET_INCUDE_PATH) : ${ET_INCLUDE_PATH}") ++message("**********************") ++ ++set(LIB_ET_RUNTIME "${ET_BUILD_DIR_PATH}/libexecutorch.a") ++set(LIB_ET_OP_REGISTRATION "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_ops_lib.a") ++set(LIB_ET_OP_KERNELS "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_kernels.a") ++ ++ethosu_add_executable_test(executorch_add PRIVATE ++ SOURCES add.cpp ++ LIBRARIES ${LIB_ET_RUNTIME} ${LIB_ET_OP_REGISTRATION} ++ ${LIB_ET_OP_KERNELS}) ++ ++target_include_directories(executorch_add PRIVATE ++${ET_INCLUDE_PATH}) ++ ++# TODO Memory setup +diff --git a/applications/executorch_tests/add.cpp b/applications/executorch_tests/add.cpp +new file mode 100644 +index 0000000..115af66 +--- /dev/null ++++ b/applications/executorch_tests/add.cpp +@@ -0,0 +1,130 @@ ++/* ++ * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates ++ * ++ * SPDX-License-Identifier: Apache-2.0 ++ * ++ * Licensed under the Apache License, Version 2.0 (the License); you may ++ * not use this file except in compliance with the License. 
++ * You may obtain a copy of the License at ++ * ++ * www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an AS IS BASIS, WITHOUT ++ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++/**************************************************************************** ++ * Includes ++ ****************************************************************************/ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++#include ++#include ++ ++using namespace std; ++ ++__attribute__((section(".sram.data"), aligned(16))) uint8_t method_allocator_pool[4 * 1024U]; ++ ++/**************************************************************************** ++ * Functions ++ ****************************************************************************/ ++ ++int main() { ++ /* ++ * This is a simple Executorch app which runs `add.pte`. ++ */ ++ ++ torch::executor::runtime_init(); ++ ++ torch::executor::manual_override(); // Hack: This will be updated soon. 
++ ++ using torch::executor::Result; ++ using torch::executor::Error; ++ ++ auto loader = torch::executor::util::BufferDataLoader(add_pte, sizeof(add_pte)); ++ ++ Result program = torch::executor::Program::load(&loader); ++ if(!program.ok()) { ++ ET_LOG(Info,"ET: Program loading failed @ 0x%p: 0x%" PRIx32, add_pte, program.error()); ++ } ++ ++ ET_LOG(Info,"ET: Model buffer loaded, has %lu methods", program->num_methods()); ++ ++ const char* method_name = nullptr; ++ { ++ const auto method_name_result = program->get_method_name(0); ++ ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); ++ method_name = *method_name_result; ++ } ++ ET_LOG(Info,"ET: Running method %s", method_name); ++ ++ Result method_meta = program->method_meta(method_name); ++ if (!method_meta.ok()) { ++ ET_LOG(Info,"ET: Failed to get method_meta for %s: 0x%x", ++ method_name, (unsigned int)method_meta.error()); ++ } ++ ++ torch::executor::MemoryAllocator method_allocator{ ++ torch::executor::MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)}; ++ ++ std::vector> planned_buffers; // Owns the memory ++ std::vector> planned_spans; // Passed to the allocator ++ size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); ++ ++ for (size_t id = 0; id < num_memory_planned_buffers; ++id) { ++ size_t buffer_size = static_cast(method_meta->memory_planned_buffer_size(id).get()); ++ ET_LOG(Info,"ET: Setting up planned buffer %zu, size %zu.", id, buffer_size); ++ ++ planned_buffers.push_back(std::make_unique(buffer_size)); ++ planned_spans.push_back({planned_buffers.back().get(), buffer_size}); ++ } ++ ++ torch::executor::HierarchicalAllocator planned_memory( ++ {planned_spans.data(), planned_spans.size()}); ++ ++ torch::executor::MemoryManager memory_manager(&method_allocator, &planned_memory); ++ ++ Result method = program->load_method(method_name, &memory_manager); ++ if(!method.ok()) { ++ ET_LOG(Info,"ET: Loading of method %s failed with status 0x%" 
PRIx32, method_name, method.error()); ++ } ++ ET_LOG(Info,"ET: Method loaded."); ++ ++ ET_LOG(Info,"ET: Preparing inputs..."); ++ auto inputs = torch::executor::util::PrepareInputTensors(*method); ++ ET_LOG(Info,"ET: Input prepared."); ++ ++ ET_LOG(Info,"ET: Starting the model execution..."); ++ Error status = method->execute(); ++ if(status != Error::Ok){ ++ ET_LOG(Info,"ET: Execution of method %s failed with status 0x%" PRIx32, method_name, status); ++ } else { ++ ET_LOG(Info,"ET: Model executed successfully."); ++ } ++ ++ // Print the outputs. ++ std::vector outputs(method->outputs_size()); ++ ET_LOG(Info, "%zu outputs: ", outputs.size()); ++ status = method->get_outputs(outputs.data(), outputs.size()); ++ ET_CHECK(status == Error::Ok); ++ for (int i = 0; i < outputs.size(); ++i) { ++ for (int j = 0; j < outputs[i].toTensor().numel(); ++j) { ++ printf("Output[%d][%d]: %f\n", i, j, outputs[i].toTensor().const_data_ptr()[j]); ++ } ++ } ++ return 0; ++} +-- +2.39.3 + diff --git a/examples/arm/cs300/setup.sh b/examples/arm/cs300/setup.sh new file mode 100755 index 00000000000..63fbd36b3bc --- /dev/null +++ b/examples/arm/cs300/setup.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +set -eu + +ethos_u_dir=${1:-/tmp/ethos-u} +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) + +function patch_repo() { + echo -e "\nPreparing ${name}..." + cd ${ethos_u_dir}/${name} + + git reset --hard ${base_rev} + + patch_dir=${script_dir}/${name}/patches/ + [[ -e ${patch_dir} && $(ls -A ${patch_dir}) ]] && \ + git am -3 ${patch_dir}/*.patch + + echo -e "Patched ${name} @ $(git describe --all --long 2> /dev/null) in ${ethos_u_dir}/${name} dir.\n" +} + +name="core_platform" +base_rev=204210b1074071532627da9dc69950d058a809f4 +patch_repo + +name="core_software" +base_rev=74c514a5b50a19197a64a86095bc0429188adcbe +patch_repo + +exit $? 
From 298fb222438984c59265f75975d7bb345f7eb0c3 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Fri, 29 Sep 2023 13:49:52 +0000 Subject: [PATCH 17/20] Fixes to make simple_add run on hardware * fix to encode and decode of vela_bin_stream block sizes * hardcoded input/output population to check operation behaviour * use manual.h to init and register backend Signed-off-by: Rob Elliott --- backends/arm/arm_backend.py | 2 +- backends/arm/cmake/build.sh | 6 ++- backends/arm/runtime/ArmBackendEthosU.cpp | 51 +++++++++++++++++------ manual.h | 1 + 4 files changed, 44 insertions(+), 16 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index e1f07d0a266..82b24f4b9b6 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -173,7 +173,7 @@ def vela_compile(tosa_fb): # We need the acual unpadded block lengths for hw setup block_length = len(block_data).to_bytes(16, 'little') # pad block data to multiple of 16 bytes - block_data = block_data + b'\x00'*(16-len(block_data)%16) + block_data = block_data + b'\x00'*(15-(len(block_data)-1)%16) block = block_name + block_length + block_data blocks = blocks + block diff --git a/backends/arm/cmake/build.sh b/backends/arm/cmake/build.sh index d2bedd7769f..0dbb8cf2177 100755 --- a/backends/arm/cmake/build.sh +++ b/backends/arm/cmake/build.sh @@ -39,13 +39,15 @@ cd cmake-corstone #cmake --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake .. cmake -DFLATC_EXECUTABLE=flatc \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ -DEXECUTORCH_BUILD_HOST_TARGETS=OFF \ -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ -DCMAKE_SYSTEM_PROCESSOR=cortex-m55+nodsp+nofp \ -DETHOSU_TARGET_NPU_CONFIG=ethos-u55-128 \ --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_ENABLE_LOGGING_RELEASE_MODE=ON \ .. -# -DCMAKE_TOOLCHAIN_FILE=backends/arm/cmake/arm-none-eabi-gcc.cmake \ cd .. 
-cmake --build cmake-corstone -j1 --target ethos_u ethosu_core_driver executorch +cmake --build cmake-corstone -j9 --target ethos_u ethosu_core_driver executorch portable_ops_lib portable_kernels diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index e7f791d2b9f..c0c5db7df0a 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -34,6 +34,10 @@ namespace executor { class ArmBackend final : public PyTorchBackendInterface { public: + ArmBackend() { + printf("Constructing ARM Backend\n"); + } + ~ArmBackend() = default; virtual bool is_available() const override { @@ -73,18 +77,29 @@ class ArmBackend final : public PyTorchBackendInterface { printf("ArmBackend::execute 0x%X\n", processed->data()); - vela_handles handles = { 0, 0, 0, 0, 0, 0}; + vela_handles handles = { 0, 0, 0, 0, 0, 0 }; // Command stream - we know at this point it's aligned char *data = (char*)processed->data(); // Read key sections from the vela_bin_stream this->vela_read( data, &handles ); - + printf("Running program data:\n cmd %p %d\n weight %p %d\n scratch %p %d\n", handles.cmd_data, handles.cmd_data_length, handles.weight_data, handles.weight_data_length, handles.scratch_data, handles.scratch_data_length ); + + // TMP emit scratch + printf("Scratch before:\n"); + for( int i=0; icmd_data = block_data; h->cmd_data_length = block_length; } @@ -161,14 +187,13 @@ class ArmBackend final : public PyTorchBackendInterface { } } } - }; -namespace { auto backend = ArmBackend(); - Backend backend_id{"ArmBackend", &backend}; - static auto registered = register_backend(backend_id); -} // namespace + void arm_backend_register() { + Backend backend_id{"ArmBackend", &backend}; + static auto registered = register_backend(backend_id); + } } // namespace executor } // namespace torch diff --git a/manual.h b/manual.h index eaee9a15407..3719a142718 100644 --- a/manual.h +++ b/manual.h @@ -2,4 +2,5 @@ namespace torch { 
namespace executor { void manual_override(); void digant_add_out(torch::executor::KernelRuntimeContext & context, EValue** stack); + void arm_backend_register(); }} From 08d71d9510ccf08d56df49d687d4aa5ce0b3dc5c Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Mon, 2 Oct 2023 09:39:26 +0000 Subject: [PATCH 18/20] tidied binary reading and moved to ET_LOG Signed-off-by: Rob Elliott --- backends/arm/runtime/ArmBackendEthosU.cpp | 155 +++++++++++++--------- 1 file changed, 91 insertions(+), 64 deletions(-) diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index c0c5db7df0a..10ba1dbfd58 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -6,7 +6,8 @@ */ /* - * Arm backend for Ethos-U baremetal driver stack relies on ethos-u-core-driver + * Arm backend for Ethos-U baremetal driver stack, this relies on the + * ethos-u-core-driver for hardware interaction. */ #include @@ -21,21 +22,19 @@ #include "command_stream.hpp" using namespace EthosU::CommandStream; -// Required byte alignment of all input pointers -#define ETHOS_U_ALIGN 0xF -char *ethos_align( char *ptr ) -{ - return (char*)((uintptr_t)~ETHOS_U_ALIGN & (uintptr_t)(ptr + (ETHOS_U_ALIGN-1))); -} - namespace torch { namespace executor { +// TODO we should be in 0x31, not this lower 1MB sRAM +// SRAM (rwx) : ORIGIN = 0x31000000, LENGTH = 0x00200000 +#define CS300_SRAM_LOW ((void*)0x11000000) +#define CS300_SRAM_HIGH ((void*)0x110FFFFF) + class ArmBackend final : public PyTorchBackendInterface { public: ArmBackend() { - printf("Constructing ARM Backend\n"); + ET_LOG(Debug, "Constructing ARM Backend"); } ~ArmBackend() = default; @@ -49,19 +48,37 @@ class ArmBackend final : public PyTorchBackendInterface { FreeableBuffer* processed, ArrayRef compile_specs) const override { - printf("ArmBackend::init 0x%X\n", processed->data()); + ET_LOG(Info, "ArmBackend::init %p", processed->data() ); char *data = 
(char*)processed->data(); size_t size = processed->size(); - - //the model should have been placed in sram with - //__attribute__((section(".sram.data"), aligned(16))) - void *aligned = ethos_align(data); - if( data != ethos_align(data)) return Error::InvalidProgram; - - // TODO: Verify address range is accessible to Ethos-U - // current expectation is the program is in SRAM - if(0) return Error::InvalidProgram; + char *foot = data + size - 16; + + // Header and footer both 16 bit aligned suggest valid structure and we + // wont walk off the end of the chunks and segfault + if( !((int)data == next_mul_16((int)data)) ) + { + ET_LOG(Error, "ArmBackend::init header unaligned"); + return Error::InvalidProgram; + } + if( !((int)foot == next_mul_16((int)foot)) ) + { + ET_LOG(Error, "ArmBackend::init header unaligned"); + return Error::InvalidProgram; + } + if( !(0 == strncmp( data, "vela_bin_stream", 15 )) ) + { + ET_LOG(Error, "ArmBackend::init header unaligned"); + return Error::InvalidProgram; + } + if( !(0 == strncmp( foot, "vela_end_stream", 15 )) ) + { + ET_LOG(Error, "ArmBackend::init header unaligned"); + return Error::InvalidProgram; + } + // Verify address range is accessible current expectation is the program + // is wholly stored in SRAM + if( !(data > CS300_SRAM_LOW || foot < CS300_SRAM_HIGH) ); // Return the same buffer we were passed - this data will be // executed directly @@ -75,7 +92,7 @@ class ArmBackend final : public PyTorchBackendInterface { FreeableBuffer* processed = (FreeableBuffer*)input_handle; - printf("ArmBackend::execute 0x%X\n", processed->data()); + ET_LOG(Info, "ArmBackend::execute %p", processed->data() ); vela_handles handles = { 0, 0, 0, 0, 0, 0 }; @@ -83,16 +100,20 @@ class ArmBackend final : public PyTorchBackendInterface { char *data = (char*)processed->data(); // Read key sections from the vela_bin_stream - this->vela_read( data, &handles ); + if( !this->vela_read( data, &handles, processed->size() ) ) + { + ET_LOG(Error, 
"ArmBackend::vela_read: error, invalid binary layout" ); + return Error::InvalidProgram; + } - printf("Running program data:\n cmd %p %d\n weight %p %d\n scratch %p %d\n", - handles.cmd_data, handles.cmd_data_length, - handles.weight_data, handles.weight_data_length, - handles.scratch_data, handles.scratch_data_length ); + ET_LOG(Debug, "ArmBackend::execute: Running program data:\n cmd %p %d\n weight %p %d\n scratch %p %d\n", + handles.cmd_data, handles.cmd_data_size, + handles.weight_data, handles.weight_data_size, + handles.scratch_data, handles.scratch_data_size ); // TMP emit scratch printf("Scratch before:\n"); - for( int i=0; isize); + + // Exit with success on finding end of stream + if( !strncmp( b->name, "vela_end_stream", 15 ) ) return 1; + + if( !strncmp( b->name, "cmd_data", strlen("cmd_data")) ) { - printf("Capturing cmd_data %p %c%c%c%c\n", block_data, - block_data[0], block_data[1], block_data[2], block_data[3]); - h->cmd_data = block_data; - h->cmd_data_length = block_length; + // This magic header confirms a valid command stream in binary + if( strncmp( b->data, "COP1", 4 ) ) return 0; + h->cmd_data = b->data; + h->cmd_data_size = b->size; } - if( !strncmp( block_name, "weight_data", strlen("weight_data")) ) + if( !strncmp( b->name, "weight_data", strlen("weight_data")) ) { - printf("Capturing weight_data\n"); - h->weight_data = block_data; - h->weight_data_length = block_length; + h->weight_data = b->data;; + h->weight_data_size = b->size; } - if( !strncmp( block_name, "scratch_data", strlen("scratch_data")) ) + if( !strncmp( b->name, "scratch_data", strlen("scratch_data")) ) { - printf("Capturing scratch_data\n"); - h->scratch_data = block_data; - h->scratch_data_length = block_length; + h->scratch_data = b->data; + h->scratch_data_size = b->size; } } } + }; auto backend = ArmBackend(); From 189b04c8e99b19ab9b0457db8747de65aca032a6 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Mon, 2 Oct 2023 11:57:17 +0000 Subject: [PATCH 19/20] Simplified 
EthosU invocation code * Removed dependencies on anything but driver * moved to minimal invocation pattern Signed-off-by: Rob Elliott --- backends/arm/runtime/ArmBackendEthosU.cpp | 48 +++--- backends/arm/runtime/command_stream.cpp | 169 ---------------------- backends/arm/runtime/command_stream.hpp | 120 --------------- 3 files changed, 21 insertions(+), 316 deletions(-) delete mode 100644 backends/arm/runtime/command_stream.cpp delete mode 100644 backends/arm/runtime/command_stream.hpp diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index 10ba1dbfd58..abf0d8e63d6 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -19,9 +19,6 @@ #include #include -#include "command_stream.hpp" -using namespace EthosU::CommandStream; - namespace torch { namespace executor { @@ -120,29 +117,26 @@ class ArmBackend final : public PyTorchBackendInterface { if( !((i+1)%4) ) printf("\n"); } printf("\n"); - - // Invoke driver using the above pointers - CommandStream cs( - DataPointer(handles.cmd_data, handles.cmd_data_size), - BasePointers({ - DataPointer(handles.weight_data, handles.weight_data_size), - DataPointer(handles.scratch_data, handles.scratch_data_size) - }), - PmuEvents({ETHOSU_PMU_CYCLE, ETHOSU_PMU_NPU_IDLE, ETHOSU_PMU_NPU_ACTIVE}) - ); - - cs.getPmu().clear(); - int res = cs.run(1); - if(res == 0) + + // Allocate driver handle and synchronously invoke driver + ethosu_driver *drv = ethosu_reserve_driver(); + + uint64_t bases[2] = {(uint64_t)handles.weight_data, (uint64_t)handles.scratch_data}; + size_t bases_size[2] = {handles.weight_data_size, handles.scratch_data_size}; + int result = ethosu_invoke_v3(drv, + (void*)handles.cmd_data, + handles.cmd_data_size, + bases, + bases_size, + 2, + nullptr); + + if(result != 0) { - uint64_t cycleCount = cs.getPmu().getCycleCount(); - cs.getPmu().print(); - printf("cycleCount=%llu, cycleCountPerJob=%llu\n", cycleCount, cycleCount); -
} else { - printf("Error, failure executing job\n"); + ET_LOG(Error, "ArmBackend::execute: Ethos-U invocation failed error (%d)", result); return Error::InvalidProgram; - } - + } + // TMP emit scratch printf("Scratch after:\n"); for( int i=0; i - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/**************************************************************************** - * Includes - ****************************************************************************/ - -#include "command_stream.hpp" - -#include -#include - -using namespace std; - -namespace EthosU { -namespace CommandStream { - -/**************************************************************************** - * DataPointer - ****************************************************************************/ - -DataPointer::DataPointer() : data(nullptr), size(0) {} - -DataPointer::DataPointer(const char *_data, size_t _size) : data(_data), size(_size) {} - -bool DataPointer::operator!=(const DataPointer &other) { - if (size != other.size) { - return true; - } - - for (size_t i = 0; i < size; i++) { - if (data[i] != other.data[i]) { - return true; - } - } - - return false; -} - -/**************************************************************************** - * PmuConfig - ****************************************************************************/ - -Pmu::Pmu(ethosu_driver *_drv, const PmuEvents &_config) : drv(_drv), config(_config) { - // Enable PMU block - 
ETHOSU_PMU_Enable(drv); - - // Enable cycle counter - ETHOSU_PMU_CNTR_Enable(drv, ETHOSU_PMU_CCNT_Msk); - - // Configure event types - for (size_t i = 0; i < config.size(); i++) { - ETHOSU_PMU_Set_EVTYPER(drv, i, config[i]); - ETHOSU_PMU_CNTR_Enable(drv, 1u << i); - } -} - -void Pmu::clear() { - ETHOSU_PMU_CYCCNT_Reset(drv); - ETHOSU_PMU_EVCNTR_ALL_Reset(drv); -} - -void Pmu::print() { - printf("PMU={cycleCount=%llu, events=[%" PRIu32 ", %" PRIu32 ", %" PRIu32 ", %" PRIu32 "]}\n", - ETHOSU_PMU_Get_CCNTR(drv), - ETHOSU_PMU_Get_EVCNTR(drv, 0), - ETHOSU_PMU_Get_EVCNTR(drv, 1), - ETHOSU_PMU_Get_EVCNTR(drv, 2), - ETHOSU_PMU_Get_EVCNTR(drv, 3)); -} - -uint64_t Pmu::getCycleCount() const { - return ETHOSU_PMU_Get_CCNTR(drv); -} - -uint32_t Pmu::getEventCount(size_t index) const { - return ETHOSU_PMU_Get_EVCNTR(drv, index); -} - -/**************************************************************************** - * CommandStream - ****************************************************************************/ - -CommandStream::CommandStream(const DataPointer &_commandStream, - const BasePointers &_basePointers, - const PmuEvents &_pmuEvents) : - drv(ethosu_reserve_driver()), - commandStream(_commandStream), basePointers(_basePointers), pmu(drv, _pmuEvents) {} - -CommandStream::~CommandStream() { - ethosu_release_driver(drv); -} - -int CommandStream::run(size_t repeat) { - // Base pointer array - uint64_t baseAddress[ETHOSU_BASEP_INDEXES]; - size_t baseAddressSize[ETHOSU_BASEP_INDEXES]; - for (size_t i = 0; i < ETHOSU_BASEP_INDEXES; i++) { - baseAddress[i] = reinterpret_cast(basePointers[i].data); - baseAddressSize[i] = reinterpret_cast(basePointers[i].size); - } - - while (repeat-- > 0) { - int error = ethosu_invoke_v3( - drv, commandStream.data, commandStream.size, baseAddress, baseAddressSize, ETHOSU_BASEP_INDEXES, nullptr); - - if (error != 0) { - printf("Inference failed. 
error=%d\n", error); - return 1; - } - } - - return 0; -} - -int CommandStream::run_async() { - // Base pointer array - uint64_t baseAddress[ETHOSU_BASEP_INDEXES]; - size_t baseAddressSize[ETHOSU_BASEP_INDEXES]; - - for (size_t i = 0; i < ETHOSU_BASEP_INDEXES; i++) { - baseAddress[i] = reinterpret_cast(basePointers[i].data); - baseAddressSize[i] = reinterpret_cast(basePointers[i].size); - } - - int error = ethosu_invoke_async( - drv, commandStream.data, commandStream.size, baseAddress, baseAddressSize, ETHOSU_BASEP_INDEXES, nullptr); - - if (error != 0) { - printf("Inference invoke async failed. error=%d\n", error); - return 1; - } - - return 0; -} - -int CommandStream::wait_async(bool block) { - return ethosu_wait(drv, block); -} - -DataPointer &CommandStream::getCommandStream() { - return commandStream; -} - -BasePointers &CommandStream::getBasePointers() { - return basePointers; -} - -Pmu &CommandStream::getPmu() { - return pmu; -} - -}; // namespace CommandStream -}; // namespace EthosU diff --git a/backends/arm/runtime/command_stream.hpp b/backends/arm/runtime/command_stream.hpp deleted file mode 100644 index 7163b9d58ca..00000000000 --- a/backends/arm/runtime/command_stream.hpp +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef COMMAND_STREAM_HPP -#define COMMAND_STREAM_HPP - -/**************************************************************************** - * Includes - ****************************************************************************/ - -#include -#include -#include -#include - -/**************************************************************************** - * Defines - ****************************************************************************/ - -#ifndef ETHOSU_BASEP_INDEXES -#define ETHOSU_BASEP_INDEXES 8 -#endif - -/**************************************************************************** - * Types - ****************************************************************************/ - -namespace EthosU { -namespace CommandStream { - -/**************************************************************************** - * DataPointer - ****************************************************************************/ - -struct DataPointer { - DataPointer(); - DataPointer(const char *_data, size_t _size); - - bool operator!=(const DataPointer &other); - - const char *data; - size_t size; -}; - -/**************************************************************************** - * Pmu - ****************************************************************************/ - -using PmuEvents = std::array; - -class Pmu { -public: - Pmu(ethosu_driver *_drv, const PmuEvents &_config = {}); - - void clear(); - void print(); - - uint64_t getCycleCount() const; - uint32_t getEventCount(size_t index) const; - -private: - ethosu_driver *drv; - PmuEvents config; -}; - -/**************************************************************************** - * CommandStream - ****************************************************************************/ - -using BasePointers = std::array; - -class CommandStream { -public: - CommandStream(const DataPointer &_commandStream, - const BasePointers &_pointers = {}, - const PmuEvents &_pmuEvents = {}); - virtual ~CommandStream(); - - int run(size_t repeat = 1); - int 
run_async(); - int wait_async(bool block = true); - - DataPointer &getCommandStream(); - BasePointers &getBasePointers(); - Pmu &getPmu(); - -private: - ethosu_driver *drv; - DataPointer commandStream; - BasePointers basePointers; - Pmu pmu; -}; - -#define DRIVER_ACTION_MAGIC() 'C', 'O', 'P', '1', - -#define DRIVER_ACTION_COMMAND_STREAM(length) 0x02, (length >> 16) & 0xff, length & 0xff, (length >> 8) & 0xff, - -#define DRIVER_ACTION_NOP() 0x05, 0x00, 0x00, 0x00, - -#define NPU_OP_STOP(mask) (mask >> 8) && 0xff, mask & 0xff, 0x08, 0x00, - -}; // namespace CommandStream -}; // namespace EthosU - -#endif /* COMMAND_STREAM_HPP */ From 70071205bf8f433268e45c87e6d334ec7011da43 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Mon, 2 Oct 2023 14:14:12 +0000 Subject: [PATCH 20/20] Basic ethos output copy to EValue * currently assumes function signature * read relevant argument data from vela_bin Signed-off-by: Rob Elliott --- backends/arm/CMakeLists.txt | 2 +- backends/arm/runtime/ArmBackendEthosU.cpp | 47 +++++++++++++++++++++-- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt index 6d6cd1938b7..2cc5cf94740 100644 --- a/backends/arm/CMakeLists.txt +++ b/backends/arm/CMakeLists.txt @@ -18,7 +18,7 @@ set(_common_compile_options -Wno-deprecated-declarations) include(cmake/Dependencies.cmake) -set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp backends/arm/runtime/command_stream.cpp) +set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp) list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/") add_library(ethos_u STATIC ${_arm_baremetal_sources}) target_include_directories(ethos_u PUBLIC ${_common_include_directories}) diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index abf0d8e63d6..3dc52645089 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -117,7 
+117,7 @@ class ArmBackend final : public PyTorchBackendInterface { if( !((i+1)%4) ) printf("\n"); } printf("\n"); - + // Allocate driver handle and synchronously invoke driver ethosu_driver *drv = ethosu_reserve_driver(); @@ -135,9 +135,9 @@ class ArmBackend final : public PyTorchBackendInterface { { ET_LOG(Error, "ArmBackend::execute: Ethos-U invocation failed error (%d)", result); return Error::InvalidProgram; - } + } - // TMP emit scratch + // TMP emit scratch printf("Scratch after:\n"); for( int i=0; itoTensor(); + for(int j=0; j()[j] = output_address[j]; + } + return Error::Ok; } @@ -158,6 +169,8 @@ class ArmBackend final : public PyTorchBackendInterface { const char *cmd_data; size_t cmd_data_size; const char *weight_data; size_t weight_data_size; const char *scratch_data; size_t scratch_data_size; + size_t input_offset; size_t input_data_shape[3]; + size_t output_offset; size_t output_data_shape[3]; } vela_handles; typedef struct { @@ -205,6 +218,34 @@ class ArmBackend final : public PyTorchBackendInterface { h->scratch_data = b->data; h->scratch_data_size = b->size; } + + // capture inputs and outputs: scratch block, input/output offsets, and 3-element input/output shapes + if( !strncmp( b->name, "scratch_data", strlen("scratch_data")) ) + { + h->scratch_data = b->data; + h->scratch_data_size = b->size; + } + if( !strncmp( b->name, "input_offset", strlen("input_offset")) ) + { + h->input_offset = ((int*)b->data)[0]; + } + if( !strncmp( b->name, "output_offset", strlen("output_offset")) ) + { + h->output_offset = ((int*)b->data)[0]; + } + if( !strncmp( b->name, "input_shape", strlen("input_shape")) ) + { + h->input_data_shape[0] = ((int*)b->data)[0]; + h->input_data_shape[1] = ((int*)b->data)[1]; + h->input_data_shape[2] = ((int*)b->data)[2]; + + } + if( !strncmp( b->name, "output_shape", strlen("output_shape")) ) + { + h->output_data_shape[0] = ((int*)b->data)[0]; + h->output_data_shape[1] = ((int*)b->data)[1]; + h->output_data_shape[2] = ((int*)b->data)[2]; + } } }