From 3eff57ab825e8265cbb7781a8a9f74d349b85f9d Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Fri, 30 Sep 2022 13:41:09 +0000 Subject: [PATCH 1/2] [ETHOSN] Throw error message when inference fails Previously the runtime would silently skip inference failures and return random values as the result. This can make spotting inference failures challenging. The runtime now throws a fatal error when inference did not complete successfully along with an error message that gives some details about the error that occurred. Change-Id: Iadb6da04ad1c906e3ec49959eb3da0978295aebf --- CMakeLists.txt | 3 + src/runtime/contrib/ethosn/ethosn_device.cc | 67 ++++++++++++------- src/runtime/contrib/ethosn/ethosn_runtime.h | 32 +++++++++ .../runtime/contrib/ethosn/inference_test.cc | 60 +++++++++++++++++ 4 files changed, 136 insertions(+), 26 deletions(-) create mode 100644 tests/cpp/runtime/contrib/ethosn/inference_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 47d30a89d2d1..be7814f03cf1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -677,6 +677,9 @@ if(GTEST_FOUND) if(DEFINED LLVM_LIBS) target_link_libraries(cpptest PRIVATE ${LLVM_LIBS}) endif() + if(DEFINED ETHOSN_RUNTIME_LIBRARY) + target_link_libraries(cpptest PRIVATE ${ETHOSN_RUNTIME_LIBRARY}) + endif() set_target_properties(cpptest PROPERTIES EXCLUDE_FROM_ALL 1) set_target_properties(cpptest PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD 1) if(USE_RELAY_DEBUG) diff --git a/src/runtime/contrib/ethosn/ethosn_device.cc b/src/runtime/contrib/ethosn/ethosn_device.cc index 612f4b4cec39..3dd931c68460 100644 --- a/src/runtime/contrib/ethosn/ethosn_device.cc +++ b/src/runtime/contrib/ethosn/ethosn_device.cc @@ -32,6 +32,7 @@ #include #include +#include #include "ethosn_driver_library/Buffer.hpp" #include "ethosn_runtime.h" @@ -48,7 +49,7 @@ namespace ethosn { namespace dl = ::ethosn::driver_library; -bool WaitForInference(dl::Inference* inference, int timeout) { +WaitStatus WaitForInference(dl::Inference* inference, int timeout) { 
// Wait for inference to complete int fd = inference->GetFileDescriptor(); struct pollfd fds; @@ -58,20 +59,29 @@ bool WaitForInference(dl::Inference* inference, int timeout) { const int ms_per_seconds = 1000; int poll_result = poll(&fds, 1, timeout * ms_per_seconds); - if (poll_result > 0) { - dl::InferenceResult result; - if (read(fd, &result, sizeof(result)) != sizeof(result)) { - return false; - } - if (result != dl::InferenceResult::Completed) { - return false; - } + int poll_error_code = errno; + + if (poll_result < 0) { + return WaitStatus(WaitErrorCode::Error, "Error while waiting for the inference to complete (" + + std::string(strerror(poll_error_code)) + ")"); } else if (poll_result == 0) { - return false; - } else { - return false; + return WaitStatus(WaitErrorCode::Timeout, + "Timed out while waiting for the inference to complete."); } - return true; + + // poll_result > 0 + dl::InferenceResult npu_result; + if (read(fd, &npu_result, sizeof(npu_result)) != static_cast(sizeof(npu_result))) { + return WaitStatus(WaitErrorCode::Error, "Failed to read inference result status (" + + std::string(strerror(poll_error_code)) + ")"); + } + + if (npu_result != dl::InferenceResult::Completed) { + return WaitStatus(WaitErrorCode::Error, "Inference failed with status " + + std::to_string(static_cast(npu_result))); + } + + return WaitStatus(WaitErrorCode::Success); } void CreateBuffers(std::vector>* fm, @@ -123,21 +133,26 @@ bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu, } // Execute the inference. 
- std::unique_ptr result( + std::unique_ptr inference( npu->ScheduleInference(ifm_raw, n_inputs, ofm_raw, n_outputs)); - bool inferenceCompleted = WaitForInference(result.get(), 60); - if (inferenceCompleted) { - for (size_t i = 0; i < n_outputs; i++) { - DLTensor* tensor = outputs[i]; - dl::Buffer* source_buffer = ofm_raw[i]; - uint8_t* dest_buffer = static_cast(tensor->data); - size_t size = source_buffer->GetSize(); - uint8_t* source_buffer_data = source_buffer->Map(); - std::copy(source_buffer_data, source_buffer_data + size, dest_buffer); - source_buffer->Unmap(); - } + WaitStatus result = WaitForInference(inference.get(), 60); + + if (result.GetErrorCode() != WaitErrorCode::Success) { + LOG(FATAL) << "An error has occured waiting for the inference of a sub-graph on the NPU: " + << result.GetErrorDescription(); + } + + for (size_t i = 0; i < n_outputs; i++) { + DLTensor* tensor = outputs[i]; + dl::Buffer* source_buffer = ofm_raw[i]; + uint8_t* dest_buffer = static_cast(tensor->data); + size_t size = source_buffer->GetSize(); + uint8_t* source_buffer_data = source_buffer->Map(); + std::copy(source_buffer_data, source_buffer_data + size, dest_buffer); + source_buffer->Unmap(); } - return inferenceCompleted; + + return true; } } // namespace ethosn diff --git a/src/runtime/contrib/ethosn/ethosn_runtime.h b/src/runtime/contrib/ethosn/ethosn_runtime.h index 7c8c32e784be..2b60bfa5ad24 100644 --- a/src/runtime/contrib/ethosn/ethosn_runtime.h +++ b/src/runtime/contrib/ethosn/ethosn_runtime.h @@ -107,6 +107,38 @@ class EthosnModule : public ModuleNode { std::map network_map_; }; +/*! + * \brief Error codes for evaluating the result of inference on the NPU. + */ +enum class WaitErrorCode { Success = 0, Timeout = 1, Error = 2 }; + +/*! + * \brief A helper class holding the status of inference on the NPU and + * associated error message(s) if any occurred. 
+ */ +class WaitStatus { + public: + WaitStatus() : error_code_(WaitErrorCode::Success), error_description_("") {} + + explicit WaitStatus(WaitErrorCode errorCode, std::string errorDescription = "") + : error_code_(errorCode), error_description_(errorDescription) {} + + WaitStatus(const WaitStatus&) = default; + WaitStatus(WaitStatus&&) = default; + WaitStatus& operator=(const WaitStatus&) = default; + WaitStatus& operator=(WaitStatus&&) = default; + + explicit operator bool() const noexcept { return error_code_ == WaitErrorCode::Success; } + + WaitErrorCode GetErrorCode() const { return error_code_; } + + std::string GetErrorDescription() const { return error_description_; } + + private: + WaitErrorCode error_code_; + std::string error_description_; +}; + } // namespace ethosn } // namespace runtime } // namespace tvm diff --git a/tests/cpp/runtime/contrib/ethosn/inference_test.cc b/tests/cpp/runtime/contrib/ethosn/inference_test.cc new file mode 100644 index 000000000000..ee3af985cdd3 --- /dev/null +++ b/tests/cpp/runtime/contrib/ethosn/inference_test.cc @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file tests/cpp/runtime/contrib/ethosn/inference_test.cc + * \brief Tests to check runtime components used during inference. + */ + +#ifdef ETHOSN_HW + +#include + +#include "../../../../../src/runtime/contrib/ethosn/ethosn_device.cc" + +namespace tvm { +namespace runtime { +namespace ethosn { + +TEST(WaitForInference, FailedResultRead) { + const int inference_error = 3; + const int timeout = 0; + dl::Inference inference = dl::Inference(inference_error); + WaitStatus result = WaitForInference(&inference, timeout); + + ASSERT_EQ(result.GetErrorCode(), WaitErrorCode::Error); + ICHECK_EQ(result.GetErrorDescription(), + "Failed to read inference result status (No such file or directory)"); +} + +TEST(WaitForInference, InferenceTimeout) { + const int inference_scheduled = 0; + const int timeout = 0; + dl::Inference inference = dl::Inference(inference_scheduled); + WaitStatus result = WaitForInference(&inference, timeout); + + ASSERT_EQ(result.GetErrorCode(), WaitErrorCode::Timeout); + ICHECK_EQ(result.GetErrorDescription(), "Timed out while waiting for the inference to complete."); +} + +} // namespace ethosn +} // namespace runtime +} // namespace tvm + +#endif From 61548130ce74bd36651ae9099ea8b2d47a2103b3 Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Wed, 12 Oct 2022 15:01:40 +0000 Subject: [PATCH 2/2] Address comments * clarify test file brief * add test case for running status * add driver stack reference to WaitStatus class Change-Id: I792742892b761534904816135ae2ffcb3f028b2c --- src/runtime/contrib/ethosn/ethosn_device.cc | 27 +++++++------ src/runtime/contrib/ethosn/ethosn_runtime.h | 27 +++++++------ .../runtime/contrib/ethosn/inference_test.cc | 40 +++++++++++++------ 3 files changed, 56 insertions(+), 38 deletions(-) diff --git a/src/runtime/contrib/ethosn/ethosn_device.cc b/src/runtime/contrib/ethosn/ethosn_device.cc index 3dd931c68460..0d79f69815fa 100644 --- a/src/runtime/contrib/ethosn/ethosn_device.cc +++ 
b/src/runtime/contrib/ethosn/ethosn_device.cc @@ -49,7 +49,7 @@ namespace ethosn { namespace dl = ::ethosn::driver_library; -WaitStatus WaitForInference(dl::Inference* inference, int timeout) { +InferenceWaitStatus WaitForInference(dl::Inference* inference, int timeout) { // Wait for inference to complete int fd = inference->GetFileDescriptor(); struct pollfd fds; @@ -62,26 +62,29 @@ WaitStatus WaitForInference(dl::Inference* inference, int timeout) { int poll_error_code = errno; if (poll_result < 0) { - return WaitStatus(WaitErrorCode::Error, "Error while waiting for the inference to complete (" + - std::string(strerror(poll_error_code)) + ")"); + return InferenceWaitStatus(InferenceWaitErrorCode::kError, + "Error while waiting for the inference to complete (" + + std::string(strerror(poll_error_code)) + ")"); } else if (poll_result == 0) { - return WaitStatus(WaitErrorCode::Timeout, - "Timed out while waiting for the inference to complete."); + return InferenceWaitStatus(InferenceWaitErrorCode::kTimeout, + "Timed out while waiting for the inference to complete."); } // poll_result > 0 dl::InferenceResult npu_result; if (read(fd, &npu_result, sizeof(npu_result)) != static_cast(sizeof(npu_result))) { - return WaitStatus(WaitErrorCode::Error, "Failed to read inference result status (" + - std::string(strerror(poll_error_code)) + ")"); + return InferenceWaitStatus( + InferenceWaitErrorCode::kError, + "Failed to read inference result status (" + std::string(strerror(poll_error_code)) + ")"); } if (npu_result != dl::InferenceResult::Completed) { - return WaitStatus(WaitErrorCode::Error, "Inference failed with status " + - std::to_string(static_cast(npu_result))); + return InferenceWaitStatus( + InferenceWaitErrorCode::kError, + "Inference failed with status " + std::to_string(static_cast(npu_result))); } - return WaitStatus(WaitErrorCode::Success); + return InferenceWaitStatus(InferenceWaitErrorCode::kSuccess); } void CreateBuffers(std::vector>* fm, @@ -135,9 +138,9 @@ 
bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu, // Execute the inference. std::unique_ptr inference( npu->ScheduleInference(ifm_raw, n_inputs, ofm_raw, n_outputs)); - WaitStatus result = WaitForInference(inference.get(), 60); + InferenceWaitStatus result = WaitForInference(inference.get(), 60); - if (result.GetErrorCode() != WaitErrorCode::Success) { + if (result.GetErrorCode() != InferenceWaitErrorCode::kSuccess) { LOG(FATAL) << "An error has occured waiting for the inference of a sub-graph on the NPU: " << result.GetErrorDescription(); } diff --git a/src/runtime/contrib/ethosn/ethosn_runtime.h b/src/runtime/contrib/ethosn/ethosn_runtime.h index 2b60bfa5ad24..b8942fef12d9 100644 --- a/src/runtime/contrib/ethosn/ethosn_runtime.h +++ b/src/runtime/contrib/ethosn/ethosn_runtime.h @@ -110,32 +110,33 @@ class EthosnModule : public ModuleNode { /*! * \brief Error codes for evaluating the result of inference on the NPU. */ -enum class WaitErrorCode { Success = 0, Timeout = 1, Error = 2 }; +enum class InferenceWaitErrorCode { kSuccess = 0, kTimeout = 1, kError = 2 }; /*! * \brief A helper class holding the status of inference on the NPU and * associated error message(s) if any occurred. 
+ * + * Similar to the implementation of 'WaitStatus' in the driver stack: + * https://github.com/ARM-software/ethos-n-driver-stack/blob/22.08/armnn-ethos-n-backend/workloads/EthosNPreCompiledWorkload.cpp#L48 */ -class WaitStatus { +class InferenceWaitStatus { public: - WaitStatus() : error_code_(WaitErrorCode::Success), error_description_("") {} + InferenceWaitStatus() : error_code_(InferenceWaitErrorCode::kSuccess), error_description_("") {} - explicit WaitStatus(WaitErrorCode errorCode, std::string errorDescription = "") + explicit InferenceWaitStatus(InferenceWaitErrorCode errorCode, std::string errorDescription = "") : error_code_(errorCode), error_description_(errorDescription) {} - WaitStatus(const WaitStatus&) = default; - WaitStatus(WaitStatus&&) = default; - WaitStatus& operator=(const WaitStatus&) = default; - WaitStatus& operator=(WaitStatus&&) = default; - - explicit operator bool() const noexcept { return error_code_ == WaitErrorCode::Success; } - - WaitErrorCode GetErrorCode() const { return error_code_; } + InferenceWaitStatus(const InferenceWaitStatus&) = default; + InferenceWaitStatus(InferenceWaitStatus&&) = default; + InferenceWaitStatus& operator=(const InferenceWaitStatus&) = default; + InferenceWaitStatus& operator=(InferenceWaitStatus&&) = default; + explicit operator bool() const { return error_code_ == InferenceWaitErrorCode::kSuccess; } + InferenceWaitErrorCode GetErrorCode() const { return error_code_; } std::string GetErrorDescription() const { return error_description_; } private: - WaitErrorCode error_code_; + InferenceWaitErrorCode error_code_; std::string error_description_; }; diff --git a/tests/cpp/runtime/contrib/ethosn/inference_test.cc b/tests/cpp/runtime/contrib/ethosn/inference_test.cc index ee3af985cdd3..95b27070e19a 100644 --- a/tests/cpp/runtime/contrib/ethosn/inference_test.cc +++ b/tests/cpp/runtime/contrib/ethosn/inference_test.cc @@ -19,7 +19,7 @@ /*! 
* \file tests/cpp/runtime/contrib/ethosn/inference_test.cc - * \brief Tests to check runtime components used during inference. + * \brief Tests to check Arm(R) Ethos(TM)-N runtime components used during inference. */ #ifdef ETHOSN_HW @@ -32,27 +32,41 @@ namespace tvm { namespace runtime { namespace ethosn { -TEST(WaitForInference, FailedResultRead) { - const int inference_error = 3; +TEST(WaitForInference, InferenceScheduled) { + const int inference_result = 0 /* Scheduled */; const int timeout = 0; - dl::Inference inference = dl::Inference(inference_error); - WaitStatus result = WaitForInference(&inference, timeout); - ASSERT_EQ(result.GetErrorCode(), WaitErrorCode::Error); - ICHECK_EQ(result.GetErrorDescription(), - "Failed to read inference result status (No such file or directory)"); + dl::Inference inference = dl::Inference(inference_result); + InferenceWaitStatus result = WaitForInference(&inference, timeout); + + ASSERT_EQ(result.GetErrorCode(), InferenceWaitErrorCode::kTimeout); + ICHECK_EQ(result.GetErrorDescription(), "Timed out while waiting for the inference to complete."); } -TEST(WaitForInference, InferenceTimeout) { - const int inference_scheduled = 0; +TEST(WaitForInference, InferenceRunning) { + const int inference_result = 1 /* Running */; const int timeout = 0; - dl::Inference inference = dl::Inference(inference_scheduled); - WaitStatus result = WaitForInference(&inference, timeout); - ASSERT_EQ(result.GetErrorCode(), WaitErrorCode::Timeout); + dl::Inference inference = dl::Inference(inference_result); + InferenceWaitStatus result = WaitForInference(&inference, timeout); + + ASSERT_EQ(result.GetErrorCode(), InferenceWaitErrorCode::kTimeout); + std::cout << result.GetErrorDescription() << std::endl; ICHECK_EQ(result.GetErrorDescription(), "Timed out while waiting for the inference to complete."); } +TEST(WaitForInference, InferenceError) { + const int inference_result = 3 /* Error */; + const int timeout = 0; + + dl::Inference inference = 
dl::Inference(inference_result); + InferenceWaitStatus result = WaitForInference(&inference, timeout); + + ASSERT_EQ(result.GetErrorCode(), InferenceWaitErrorCode::kError); + ICHECK_EQ(result.GetErrorDescription(), + "Failed to read inference result status (No such file or directory)"); +} + } // namespace ethosn } // namespace runtime } // namespace tvm