Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 7 additions & 10 deletions examples/models/yolo12/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
# Let files say "include <executorch/path/to/header.h>".
set(_common_include_directories ${EXECUTORCH_ROOT}/..)

#find dependencies
find_package(absl CONFIG REQUIRED PATHS ${EXECUTORCH_ROOT}/cmake-out)
find_package(re2 CONFIG REQUIRED PATHS ${EXECUTORCH_ROOT}/cmake-out)
find_package(tokenizers CONFIG REQUIRED PATHS ${EXECUTORCH_ROOT}/cmake-out)
Comment on lines +32 to +34

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need tokenizers and other dependencies for YOLO?

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you able to build it without these? The yolo example doesn't use them but I thought some dependencies need them. I will check again if we can build without these.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without these I see the error below:

CMake Error at CMakeLists.txt:37 (find_package):
  Found package configuration file:

    /home/mcavus/executorch/executorch/cmake-out/lib/cmake/ExecuTorch/executorch-config.cmake

  but it set executorch_FOUND to FALSE so package "executorch" is considered
  to be NOT FOUND.  Reason given by package:

  The following imported targets are referenced, but are missing:
  tokenizers::tokenizers

Copy link

@daniil-lyakhov daniil-lyakhov Oct 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the example was fully functional when it was merged. That's a bit strange, how do you build the example?
You can find a test script over there https://github.com/pytorch/executorch/blob/main/.ci/scripts/test_yolo12.sh

I think meta guys could potentially help with that

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The build commands below should work right? I can reproduce with the main branch. I see similar error either with OV backend or XNNPACK by the way. I will ping them in discord.

rm -rf build
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DUSE_XNNPACK_BACKEND=OFF -DUSE_OPENVINO_BACKEND=ON ..
make -j$(nproc)


# find `executorch` libraries Same as for gflags
find_package(executorch CONFIG REQUIRED PATHS ${EXECUTORCH_ROOT}/cmake-out)
executorch_target_link_options_shared_lib(executorch)
Expand All @@ -38,21 +43,13 @@ list(APPEND link_libraries portable_ops_lib portable_kernels)
executorch_target_link_options_shared_lib(portable_ops_lib)

if(USE_XNNPACK_BACKEND)
  # The prod microkernels target is named xnnpack-microkernels-prod in current
  # ExecuTorch builds. The previous duplicate set() of xnnpack_backend_libs
  # (using the old name "microkernels-prod") was dead code, immediately
  # overwritten by this line, and has been removed.
  set(xnnpack_backend_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
  list(APPEND link_libraries ${xnnpack_backend_libs})
  # Force whole-archive linking so backend registration symbols are kept.
  executorch_target_link_options_shared_lib(xnnpack_backend)
endif()

if(USE_OPENVINO_BACKEND)
# Build the OpenVINO backend from the ExecuTorch source tree into this
# project's binary dir.
add_subdirectory(${EXECUTORCH_ROOT}/backends/openvino openvino_backend)

# NOTE(review): these interface include dirs point into the parent build tree
# (../../include, .../portable_type/c10, ../../lib). Presumably they expose
# ExecuTorch headers to consumers of openvino_backend — confirm they are still
# needed now that find_package(OpenVINO) is called below.
target_include_directories(
openvino_backend
INTERFACE
${CMAKE_CURRENT_BINARY_DIR}/../../include
${CMAKE_CURRENT_BINARY_DIR}/../../include/executorch/runtime/core/portable_type/c10
${CMAKE_CURRENT_BINARY_DIR}/../../lib
)
# Locate the OpenVINO toolkit itself (runtime + imported targets).
find_package(OpenVINO REQUIRED)
list(APPEND link_libraries openvino_backend)
# Force whole-archive linking so backend registration symbols are kept.
executorch_target_link_options_shared_lib(openvino_backend)
endif()
Expand Down
34 changes: 23 additions & 11 deletions examples/models/yolo12/inference.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <iostream>
#include <vector>
#include <mutex>

#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>
Expand All @@ -18,6 +19,8 @@ using executorch::extension::Module;
using executorch::runtime::Error;
using executorch::runtime::Result;

std::mutex execute_mutex;

struct Detection {
int class_id{0};
std::string className{};
Expand Down Expand Up @@ -59,32 +62,40 @@ cv::Mat scale_with_padding(
return result;
}

std::vector<Detection> infer_yolo_once(
Module& module,
cv::Mat input,
cv::Size img_dims,
const DetectionConfig yolo_config) {
int pad_x, pad_y;
float scale;
input = scale_with_padding(input, &pad_x, &pad_y, &scale, img_dims);

cv::Mat blob;
/// Converts a pre-scaled frame into a float tensor the model can consume.
///
/// @param input    Frame already letterboxed to `img_dims` (the output of
///                 scale_with_padding).
/// @param blob     Caller-owned buffer that receives the float blob produced
///                 by blobFromImage. It must outlive the returned tensor,
///                 which aliases `blob.data` rather than copying it.
/// @param img_dims Spatial size expected by the model.
/// @return Shared pointer to a Float tensor viewing `blob`'s storage.
std::shared_ptr<executorch::aten::Tensor> prepare_input(
    cv::Mat& input,
    cv::Mat& blob,
    cv::Size img_dims) {
  // Scale pixel values by 1/255, resize to img_dims, swap R/B channels
  // (swapRB=true), no center crop.
  cv::dnn::blobFromImage(
      input, blob, 1.0 / 255.0, img_dims, cv::Scalar(), true, false);
  // Wrap the blob's memory without copying. Use a named cast instead of the
  // previous C-style cast, and return directly instead of via a const local.
  return from_blob(
      static_cast<void*>(blob.data),
      std::vector<int>(blob.size.p, blob.size.p + blob.dims),
      ScalarType::Float);
}

/// Runs one forward pass of the model under the global execute_mutex.
///
/// The lock serializes access to `module` across the asynchronous pipeline
/// stages; only one forward() executes at a time.
///
/// @param module  Loaded ExecuTorch module to run.
/// @param t_input Input tensor produced by prepare_input().
/// @return The first output tensor of forward(). Only output 0 is used;
///         yolov8 has an output of shape (batchSize, 84, 8400)
///         (Num classes + box[x,y,w,h]).
executorch::aten::Tensor execute_frame(
    Module& module,
    std::shared_ptr<executorch::aten::Tensor> t_input) {
  std::lock_guard<std::mutex> lock(execute_mutex);
  const auto result = module.forward(t_input);

  ET_CHECK_MSG(
      result.ok(),
      "Execution of method forward failed with status 0x%" PRIx32,
      (uint32_t)result.error());

  // The previous unused local `t` recomputed this same value via a second
  // toTensor() call; return the tensor directly instead.
  return result->at(0).toTensor();
}

std::vector<Detection> process_output(
executorch::aten::Tensor& t,
const DetectionConfig yolo_config,
int pad_x, int pad_y, float scale) {
cv::Mat mat_output(t.dim() - 1, t.sizes().data() + 1, CV_32FC1, t.data_ptr());

std::vector<int> class_ids;
Expand Down Expand Up @@ -148,4 +159,5 @@ std::vector<Detection> infer_yolo_once(

return detections;
}

#endif // INFERENCE_H
100 changes: 82 additions & 18 deletions examples/models/yolo12/main.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#include "inference.h"
#include <thread>
#include <future>

#include <gflags/gflags.h>

Expand Down Expand Up @@ -97,32 +99,94 @@ int main(int argc, char** argv) {
unsigned long long iters = 0;
// Show progress every 10%
unsigned long long progress_bar_tick = std::round(video_lenght / 10);

// Per-frame state handed between the pipelined stages of the main loop:
// decode -> scale -> blob/tensor prep -> model execute -> post-process.
struct frame_ctx {
// Original decoded frame; detections are drawn onto it before video.write().
cv::Mat frame;
// Frame after letterbox scaling to the model input dimensions.
cv::Mat scaled_input;
// Float blob backing the input tensor (prepare_input aliases this memory,
// so it must stay alive until execution of the frame completes).
cv::Mat blob;
// Letterbox padding (x/y) and scale factor written by scale_with_padding;
// process_output uses them to map boxes back to original frame coordinates.
int pad_x;
int pad_y;
float scale;
};
std::queue<frame_ctx*> ready_q;
std::queue<std::pair<frame_ctx*, std::future<cv::Mat>>> scale_q;
std::queue<std::pair<frame_ctx*, std::future<std::shared_ptr<executorch::aten::Tensor>>>> input_q;
std::queue<std::pair<frame_ctx*, std::future<executorch::aten::Tensor>>> execute_q;
std::queue<std::pair<frame_ctx*, std::future<std::vector<Detection>>>> output_q;
const et_timestamp_t before_execute = et_pal_current_ticks();
size_t frame_queue_size = 2;
while (true) {
cv::Mat frame;
cap >> frame;

if (frame.empty())
if (frame.empty() && ready_q.empty() && scale_q.empty() && input_q.empty() && execute_q.empty() && output_q.empty())
break;

const et_timestamp_t before_execute = et_pal_current_ticks();
std::vector<Detection> output =
infer_yolo_once(yolo_module, frame, img_dims, DEFAULT_YOLO_CONFIG);
if (!frame.empty()) {
frame_ctx *new_frame_ctx = new frame_ctx;
new_frame_ctx->frame = frame;
ready_q.push(new_frame_ctx);
}

for (auto& detection : output) {
draw_detection(frame, detection, cv::Scalar(0, 0, 255));
while (!ready_q.empty() && scale_q.size() < frame_queue_size) {
frame_ctx *scale_f = ready_q.front();
scale_q.push(std::make_pair(scale_f, std::async(std::launch::async, scale_with_padding, std::ref(scale_f->frame), &(scale_f->pad_x), &(scale_f->pad_y), &(scale_f->scale), img_dims)));
ready_q.pop();
}
const et_timestamp_t after_execute = et_pal_current_ticks();
time_spent_executing += after_execute - before_execute;
iters++;

if (!(iters % progress_bar_tick)) {
const int precent_ready = (100 * iters) / video_lenght;
std::cout << iters << " out of " << video_lenght
<< " frames are are processed (" << precent_ready << "\%)"
<< std::endl;
while (!scale_q.empty() && input_q.size() < frame_queue_size) {
auto status = scale_q.front().second.wait_for(std::chrono::milliseconds(1));
if (status == std::future_status::ready) {
Comment on lines +131 to +138
Copy link

@daniil-lyakhov daniil-lyakhov Oct 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

General questions:

  1. Looks like you are implementing inference request queue, is it possible to utilize the standard openvino API somehow?
  2. This is a real-time demo, the data is streamed sequentially and should be shoved sequentially, how does it work with your updated?
  3. I believe it is unfair to collect only model inference time without pre and post processing and claim it as a FPS stats. In real application the pre and post processing will affect the FPS

In general - could you please state the motives behind this PR? What are the purpose and improvements this PR introducing?

Copy link
Owner Author

@cavusmustafa cavusmustafa Oct 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. We could try using async call inside openvino backend. This way, we could simply call forward function from executorch application and let openvino schedule the tasks. But I found two issues with it (explained below). But I don't think we need it anyways. The model inference still executes sequentially as we have a mutex lock on that part and we don't need to execute model inference asynchronously for this use case. I explained it more in 2.
    • We claim to support xnnpack with this application as well. We may need to add a lot of customizations only for openvino in that case.
    • A single executorch module seems to use the same output buffer for all executions. An upcoming task may overwrite the result of the previous task. This seems risky (and it fails for xnnpack). We could create multiple executorch modules, but in that case I don't know whether they would share the same openvino backend object. If they don't, we may not be able to use async execution as intended, and we may incur additional memory overhead.
  2. We can assume the data is streamed sequentially and shoved sequentially. But still we can use pipelining for preprocess, infer, and postprocess which was the intention in this PR. So, as the first frame completes preprocessing on CPU and starts model execution on GPU (or NPU), we can also start preprocessing for the second frame as long as it is ready. Once the first frame completes GPU process, the second frame task can be assigned to GPU while the first frame start postprocessing.
    Also, in a real time stream, it will be better to limit the size of ready queue (maybe 2 or even 1). Larger ready queue size can cause delays in the output video.
  3. I didn't understand this part. The time measurement should already cover the end-to-end object detection process (timing is collected before and after the whole while loop). It increments iters only when a frame retires. At the end, it calculates the timing based on the total while-loop time and the total number of frames retired.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it, thanks

scale_q.front().first->scaled_input = scale_q.front().second.get();
input_q.push(std::make_pair(scale_q.front().first, std::async(std::launch::async, prepare_input, std::ref(scale_q.front().first->scaled_input), std::ref(scale_q.front().first->blob), img_dims)));
scale_q.pop();
} else {
break;
}
}
while (!input_q.empty() && execute_q.size() < frame_queue_size) {
auto status = input_q.front().second.wait_for(std::chrono::milliseconds(1));
if (status == std::future_status::ready) {
std::shared_ptr<executorch::aten::Tensor> prepared_input = input_q.front().second.get();
execute_q.push(std::make_pair(input_q.front().first, std::async(std::launch::async, execute_frame, std::ref(yolo_module), prepared_input)));
input_q.pop();
} else {
break;
}
}
while (!execute_q.empty() && output_q.size() < frame_queue_size) {
auto status = execute_q.front().second.wait_for(std::chrono::milliseconds(1));
if (status == std::future_status::ready) {
executorch::aten::Tensor raw_output = execute_q.front().second.get();
output_q.push(std::make_pair(execute_q.front().first, std::async(std::launch::async, process_output, std::ref(raw_output), DEFAULT_YOLO_CONFIG, execute_q.front().first->pad_x, execute_q.front().first->pad_y, execute_q.front().first->scale)));
execute_q.pop();
} else {
break;
}
}
while (!output_q.empty()) {
auto status = output_q.front().second.wait_for(std::chrono::milliseconds(1));
if (status == std::future_status::ready) {
std::vector<Detection> output = output_q.front().second.get();
for (auto& detection : output) {
draw_detection(output_q.front().first->frame, detection, cv::Scalar(0, 0, 255));
}
iters++;

if (!(iters % progress_bar_tick)) {
const int precent_ready = (100 * iters) / video_lenght;
std::cout << iters << " out of " << video_lenght
<< " frames are are processed (" << precent_ready << "\%)"
<< std::endl;
}
video.write(output_q.front().first->frame);
output_q.pop();
} else {
break;
}
}
video.write(frame);
}
const et_timestamp_t after_execute = et_pal_current_ticks();
time_spent_executing = after_execute - before_execute;

const auto tick_ratio = et_pal_ticks_to_ns_multiplier();
constexpr auto NANOSECONDS_PER_MILLISECOND = 1000000;
Expand Down Expand Up @@ -165,4 +229,4 @@ void draw_detection(
cv::Scalar(0, 0, 0),
2,
0);
}
}
2 changes: 1 addition & 1 deletion examples/models/yolo12/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
ultralytics==8.3.97
ultralytics==8.3.196
Loading