From 55d4facc1a9b9aa1c2694b044d3d20b2a90999a7 Mon Sep 17 00:00:00 2001
From: Srirammaswamy <srirammaswamy.s@intel.com>
Date: Tue, 27 Aug 2024 14:07:38 +0530
Subject: [PATCH 1/5] Implements blob compatibility check for NPU

* OVEP catches the NPU driver exception and returns a failure status

* NPU-to-CPU fallback is disabled when running inference with a precompiled blob
---
 .../providers/openvino/backend_manager.cc     |  3 +-
 .../openvino/openvino_execution_provider.cc   | 78 ++++++++++---------
 2 files changed, 45 insertions(+), 36 deletions(-)
diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
index 6466eb8880b38..78f77f9b3dc48 100644
--- a/onnxruntime/core/providers/openvino/backend_manager.cc
+++ b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -111,7 +111,8 @@ BackendManager::BackendManager(const GlobalContext& global_context,
       ORT_THROW(ex.what());
 #else
       if (device_type.find("NPU") != std::string::npos &&
-          !GetGlobalContext().disable_cpu_fallback) {
+          !GetGlobalContext().disable_cpu_fallback &&
+          !ep_ctx_handle_.IsValidOVEPCtxGraph()) {
         LOGS_DEFAULT(WARNING) << ex.what();
         LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."
                               << "Falling back to OV CPU for execution";
diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
index 29c45916795d3..b4269b8262b1f 100644
--- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
+++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -139,42 +139,50 @@ common::Status OpenVINOExecutionProvider::Compile(
     // During backend creation, we check if user wants to use precompiled blob onnx model or the original model
     // For precompiled blob, directly load the model instead of compiling the model
     // For original model, check if the user wants to export a model with pre-compiled blob
-
-    std::shared_ptr<openvino_ep::BackendManager> backend_manager =
-        std::make_shared<openvino_ep::BackendManager>(*global_context_,
-                                                      fused_node,
-                                                      graph_body_viewer,
-                                                      *GetLogger(),
-                                                      ep_ctx_handle_);
-
-    compute_info.create_state_func =
-        [backend_manager](ComputeContext* context, FunctionState* state) {
-          OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState();
-          p->allocate_func = context->allocate_func;
-          p->destroy_func = context->release_func;
-          p->allocator_handle = context->allocator_handle;
-          p->backend_manager = backend_manager;
-          *state = static_cast<FunctionState>(p);
-          return 0;
-        };
-    compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) {
-      auto function_state = static_cast<OpenVINOEPFunctionState*>(state);
-      try {
-        function_state->backend_manager->Compute(context);
-      } catch (const std::exception& ex) {
-        return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what());
+    try {
+      std::shared_ptr<openvino_ep::BackendManager> backend_manager =
+          std::make_shared<openvino_ep::BackendManager>(*global_context_,
+                                                        fused_node,
+                                                        graph_body_viewer,
+                                                        *GetLogger(),
+                                                        ep_ctx_handle_);
+      compute_info.create_state_func =
+          [backend_manager](ComputeContext* context, FunctionState* state) {
+            OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState();
+            p->allocate_func = context->allocate_func;
+            p->destroy_func = context->release_func;
+            p->allocator_handle = context->allocator_handle;
+            p->backend_manager = backend_manager;
+            *state = static_cast<FunctionState>(p);
+            return 0;
+          };
+      compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) {
+        auto function_state = static_cast<OpenVINOEPFunctionState*>(state);
+        try {
+          function_state->backend_manager->Compute(context);
+        } catch (const std::exception& ex) {
+          return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what());
+        }
+        return Status::OK();
+      };
+
+      compute_info.release_state_func =
+          [](FunctionState state) {
+            if (state) {
+              OpenVINOEPFunctionState* function_state = static_cast<OpenVINOEPFunctionState*>(state);
+              delete function_state;
+            }
+          };
+      node_compute_funcs.push_back(compute_info);
+    } catch (const OnnxRuntimeException& ex) {
+      std::string exception_str = ex.what();
+      if (exception_str.find("ZE_RESULT_ERROR_UNKNOWN") != std::string::npos ||
+          exception_str.find("ZE_RESULT_ERROR_UNINITIALIZED") != std::string::npos) {
+        return Status(common::ONNXRUNTIME, common::EP_FAIL, "Model needs to be recompiled");
+      } else {
+        ORT_THROW(exception_str);
       }
-      return Status::OK();
-    };
-
-    compute_info.release_state_func =
-        [](FunctionState state) {
-          if (state) {
-            OpenVINOEPFunctionState* function_state = static_cast<OpenVINOEPFunctionState*>(state);
-            delete function_state;
-          }
-        };
-    node_compute_funcs.push_back(compute_info);
+    }
   }
 
   return Status::OK();

From e4ff4babb558e75a1fb7832c68b24375b1eada17 Mon Sep 17 00:00:00 2001
From: Srirammaswamy <srirammaswamy.s@intel.com>
Date: Fri, 30 Aug 2024 12:07:00 +0530
Subject: [PATCH 2/5] Update NPU device exception handling approach

* Change the failure status code to an exception (std::runtime_error)

* Capture all NPU-related errors

* Throw a minimal error message with the error type and error code in
  Release builds
---
 .../providers/openvino/backend_manager.cc     | 24 +++++-
 .../openvino/openvino_execution_provider.cc   | 78 +++++++++----------
 2 files changed, 57 insertions(+), 45 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
index 78f77f9b3dc48..fc4e5eb9a5283 100644
--- a/onnxruntime/core/providers/openvino/backend_manager.cc
+++ b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -5,6 +5,7 @@
 #include <algorithm>
 #include <cassert>
 #include <fstream>
+#include <regex>
 #include <sstream>
 #include <unordered_map>
 #include <unordered_set>
@@ -107,13 +108,14 @@ BackendManager::BackendManager(const GlobalContext& global_context,
                                                       subgraph_context_,
                                                       ep_ctx_handle_);
     } catch (const OnnxRuntimeException& ex) {
+      std::string exception_str = ex.what();
 #if defined(OPENVINO_DISABLE_NPU_FALLBACK)
-      ORT_THROW(ex.what());
+      ORT_THROW(exception_str);
 #else
       if (device_type.find("NPU") != std::string::npos &&
           !GetGlobalContext().disable_cpu_fallback &&
           !ep_ctx_handle_.IsValidOVEPCtxGraph()) {
-        LOGS_DEFAULT(WARNING) << ex.what();
+        LOGS_DEFAULT(WARNING) << exception_str;
         LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."
                               << "Falling back to OV CPU for execution";
         GetGlobalContext().device_type = "CPU";
@@ -126,6 +128,24 @@ BackendManager::BackendManager(const GlobalContext& global_context,
         } catch (std::string const& msg) {
           ORT_THROW(msg);
         }
+      } else if (device_type.find("NPU") != std::string::npos &&
+                 exception_str.find("intel_npu") != std::string::npos) {
+        // Handle NPU device related errors
+#ifndef NDEBUG
+        ORT_THROW(exception_str + "\nModel needs to be recompiled\n");
+#endif
+        std::string error_message = "UNKNOWN NPU ERROR";
+        std::string error_code = "code 0x0";
+        std::regex error_message_pattern(R"(\bZE_\w*\b)");
+        std::regex error_code_pattern("code 0x[0-9a-fA-F]+");
+        std::smatch matches;
+        if (std::regex_search(exception_str, matches, error_message_pattern)) {
+          error_message = matches[0];
+        }
+        if (std::regex_search(exception_str, matches, error_code_pattern)) {
+          error_code = matches[0];
+        }
+        throw std::runtime_error(error_message + ", " + error_code + "\nModel needs to be recompiled\n");
       } else {
         ORT_THROW(ex.what());
       }
diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
index b4269b8262b1f..29c45916795d3 100644
--- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
+++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -139,50 +139,42 @@ common::Status OpenVINOExecutionProvider::Compile(
     // During backend creation, we check if user wants to use precompiled blob onnx model or the original model
     // For precompiled blob, directly load the model instead of compiling the model
     // For original model, check if the user wants to export a model with pre-compiled blob
-    try {
-      std::shared_ptr<openvino_ep::BackendManager> backend_manager =
-          std::make_shared<openvino_ep::BackendManager>(*global_context_,
-                                                        fused_node,
-                                                        graph_body_viewer,
-                                                        *GetLogger(),
-                                                        ep_ctx_handle_);
-      compute_info.create_state_func =
-          [backend_manager](ComputeContext* context, FunctionState* state) {
-            OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState();
-            p->allocate_func = context->allocate_func;
-            p->destroy_func = context->release_func;
-            p->allocator_handle = context->allocator_handle;
-            p->backend_manager = backend_manager;
-            *state = static_cast<FunctionState>(p);
-            return 0;
-          };
-      compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) {
-        auto function_state = static_cast<OpenVINOEPFunctionState*>(state);
-        try {
-          function_state->backend_manager->Compute(context);
-        } catch (const std::exception& ex) {
-          return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what());
-        }
-        return Status::OK();
-      };
-
-      compute_info.release_state_func =
-          [](FunctionState state) {
-            if (state) {
-              OpenVINOEPFunctionState* function_state = static_cast<OpenVINOEPFunctionState*>(state);
-              delete function_state;
-            }
-          };
-      node_compute_funcs.push_back(compute_info);
-    } catch (const OnnxRuntimeException& ex) {
-      std::string exception_str = ex.what();
-      if (exception_str.find("ZE_RESULT_ERROR_UNKNOWN") != std::string::npos ||
-          exception_str.find("ZE_RESULT_ERROR_UNINITIALIZED") != std::string::npos) {
-        return Status(common::ONNXRUNTIME, common::EP_FAIL, "Model needs to be recompiled");
-      } else {
-        ORT_THROW(exception_str);
+
+    std::shared_ptr<openvino_ep::BackendManager> backend_manager =
+        std::make_shared<openvino_ep::BackendManager>(*global_context_,
+                                                      fused_node,
+                                                      graph_body_viewer,
+                                                      *GetLogger(),
+                                                      ep_ctx_handle_);
+
+    compute_info.create_state_func =
+        [backend_manager](ComputeContext* context, FunctionState* state) {
+          OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState();
+          p->allocate_func = context->allocate_func;
+          p->destroy_func = context->release_func;
+          p->allocator_handle = context->allocator_handle;
+          p->backend_manager = backend_manager;
+          *state = static_cast<FunctionState>(p);
+          return 0;
+        };
+    compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) {
+      auto function_state = static_cast<OpenVINOEPFunctionState*>(state);
+      try {
+        function_state->backend_manager->Compute(context);
+      } catch (const std::exception& ex) {
+        return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what());
       }
-    }
+      return Status::OK();
+    };
+
+    compute_info.release_state_func =
+        [](FunctionState state) {
+          if (state) {
+            OpenVINOEPFunctionState* function_state = static_cast<OpenVINOEPFunctionState*>(state);
+            delete function_state;
+          }
+        };
+    node_compute_funcs.push_back(compute_info);
   }
 
   return Status::OK();

From b5400cbab5a00408a7dc6d5ebf8f8e088f39bac5 Mon Sep 17 00:00:00 2001
From: "S, Srirammaswamy" <srirammaswamy.s@intel.com>
Date: Mon, 2 Sep 2024 17:13:47 +0530
Subject: [PATCH 3/5] Fix lint issues

---
 .../providers/openvino/backend_manager.cc     | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
index fc4e5eb9a5283..b86444927740a 100644
--- a/onnxruntime/core/providers/openvino/backend_manager.cc
+++ b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -75,24 +75,24 @@ BackendManager::BackendManager(const GlobalContext& global_context,
                 "QDQ stripping should not be enabled for models with dynamic input shapes. "
                 "Set enable_qdq_optimizer to False");
     if ((GetGlobalContext().device_type.find("CPU") != std::string::npos ||
-        GetGlobalContext().device_type.find("GPU") != std::string::npos) &&
+         GetGlobalContext().device_type.find("GPU") != std::string::npos) &&
         !GetGlobalContext().disable_dynamic_shapes) {
-        LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Starting backend initialization. "
-                           << "Creating backend Dynamic Shapes";
-        try {
-          concrete_backend_ = BackendFactory::MakeBackend(model_proto,
-                                                          GetGlobalContext(),
-                                                          subgraph_context_,
-                                                          ep_ctx_handle_);
-        } catch (std::string const& msg) {
-          ORT_THROW(msg);
-        }
-        LOGS_DEFAULT(INFO) << "[OpenVINO-EP] "
-                           << "Backend created for graph " << subgraph_context_.subgraph_name;
+      LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Starting backend initialization. "
+                         << "Creating backend Dynamic Shapes";
+      try {
+        concrete_backend_ = BackendFactory::MakeBackend(model_proto,
+                                                        GetGlobalContext(),
+                                                        subgraph_context_,
+                                                        ep_ctx_handle_);
+      } catch (std::string const& msg) {
+        ORT_THROW(msg);
+      }
+      LOGS_DEFAULT(INFO) << "[OpenVINO-EP] "
+                         << "Backend created for graph " << subgraph_context_.subgraph_name;
     } else {
-        // Only cache model_proto in global to rewrite the model with input shapes at runtime.
-        // For dynamic backend creation
-        model_proto_ = std::move(model_proto);
+      // Only cache model_proto in global to rewrite the model with input shapes at runtime.
+      // For dynamic backend creation
+      model_proto_ = std::move(model_proto);
     }
   } else {
     LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has concrete input dims. "

From 574e82a9143e178fb4e094fec6b0c62b27e368ef Mon Sep 17 00:00:00 2001
From: "S, Srirammaswamy" <srirammaswamy.s@intel.com>
Date: Tue, 3 Sep 2024 14:49:48 +0530
Subject: [PATCH 4/5] Address review comments

---
 .../core/providers/openvino/backend_manager.cc        | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
index b86444927740a..a4e1a9367be80 100644
--- a/onnxruntime/core/providers/openvino/backend_manager.cc
+++ b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -109,12 +109,12 @@ BackendManager::BackendManager(const GlobalContext& global_context,
                                                       ep_ctx_handle_);
     } catch (const OnnxRuntimeException& ex) {
       std::string exception_str = ex.what();
+      bool enable_cpu_fallback = !GetGlobalContext().disable_cpu_fallback;
 #if defined(OPENVINO_DISABLE_NPU_FALLBACK)
-      ORT_THROW(exception_str);
-#else
+      enable_cpu_fallback = false;
+#endif
       if (device_type.find("NPU") != std::string::npos &&
-          !GetGlobalContext().disable_cpu_fallback &&
-          !ep_ctx_handle_.IsValidOVEPCtxGraph()) {
+          enable_cpu_fallback && !ep_ctx_handle_.IsValidOVEPCtxGraph()) {
         LOGS_DEFAULT(WARNING) << exception_str;
         LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."
                               << "Falling back to OV CPU for execution";
@@ -147,9 +147,8 @@ BackendManager::BackendManager(const GlobalContext& global_context,
         }
         throw std::runtime_error(error_message + ", " + error_code + "\nModel needs to be recompiled\n");
       } else {
-        ORT_THROW(ex.what());
+        ORT_THROW(exception_str);
       }
-#endif
     }
   }
   if (global_context_.export_ep_ctx_blob && !ep_ctx_handle_.IsValidOVEPCtxGraph()) {

From 4203dcb1135f90a8a4629a9afca14fab28eaa3c2 Mon Sep 17 00:00:00 2001
From: "S, Srirammaswamy" <srirammaswamy.s@intel.com>
Date: Tue, 3 Sep 2024 16:46:45 +0530
Subject: [PATCH 5/5] Address review comments

---
 .../providers/openvino/backend_manager.cc     | 52 +++++++++++--------
 1 file changed, 29 insertions(+), 23 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
index a4e1a9367be80..4fca4037301fb 100644
--- a/onnxruntime/core/providers/openvino/backend_manager.cc
+++ b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -109,13 +109,14 @@ BackendManager::BackendManager(const GlobalContext& global_context,
                                                       ep_ctx_handle_);
     } catch (const OnnxRuntimeException& ex) {
       std::string exception_str = ex.what();
-      bool enable_cpu_fallback = !GetGlobalContext().disable_cpu_fallback;
+      bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos &&
+                                       !GetGlobalContext().disable_cpu_fallback &&
+                                       !ep_ctx_handle_.IsValidOVEPCtxGraph();
 #if defined(OPENVINO_DISABLE_NPU_FALLBACK)
-      enable_cpu_fallback = false;
-#endif
-      if (device_type.find("NPU") != std::string::npos &&
-          enable_cpu_fallback && !ep_ctx_handle_.IsValidOVEPCtxGraph()) {
-        LOGS_DEFAULT(WARNING) << exception_str;
+      eligible_for_cpu_fallback = false;
+#else
+      if (eligible_for_cpu_fallback) {
+        LOGS_DEFAULT(VERBOSE) << exception_str;
         LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."
                               << "Falling back to OV CPU for execution";
         GetGlobalContext().device_type = "CPU";
@@ -128,26 +129,31 @@ BackendManager::BackendManager(const GlobalContext& global_context,
         } catch (std::string const& msg) {
           ORT_THROW(msg);
         }
-      } else if (device_type.find("NPU") != std::string::npos &&
-                 exception_str.find("intel_npu") != std::string::npos) {
-        // Handle NPU device related errors
+      }
+#endif
+      if (!eligible_for_cpu_fallback) {
+        if (device_type.find("NPU") != std::string::npos &&
+            exception_str.find("intel_npu") != std::string::npos) {
+          // Handle NPU device related errors
 #ifndef NDEBUG
-        ORT_THROW(exception_str + "\nModel needs to be recompiled\n");
+          ORT_THROW(exception_str + "\nModel needs to be recompiled\n");
+#else
+          std::string error_message = "UNKNOWN NPU ERROR";
+          std::string error_code = "code 0x0";
+          std::regex error_message_pattern(R"(\bZE_\w*\b)");
+          std::regex error_code_pattern("code 0x[0-9a-fA-F]+");
+          std::smatch matches;
+          if (std::regex_search(exception_str, matches, error_message_pattern)) {
+            error_message = matches[0];
+          }
+          if (std::regex_search(exception_str, matches, error_code_pattern)) {
+            error_code = matches[0];
+          }
+          throw std::runtime_error(error_message + ", " + error_code + "\nModel needs to be recompiled\n");
 #endif
-        std::string error_message = "UNKNOWN NPU ERROR";
-        std::string error_code = "code 0x0";
-        std::regex error_message_pattern(R"(\bZE_\w*\b)");
-        std::regex error_code_pattern("code 0x[0-9a-fA-F]+");
-        std::smatch matches;
-        if (std::regex_search(exception_str, matches, error_message_pattern)) {
-          error_message = matches[0];
-        }
-        if (std::regex_search(exception_str, matches, error_code_pattern)) {
-          error_code = matches[0];
+        } else {
+          ORT_THROW(exception_str);
         }
-        throw std::runtime_error(error_message + ", " + error_code + "\nModel needs to be recompiled\n");
-      } else {
-        ORT_THROW(exception_str);
       }
     }
   }