From f6cff3427fe7af62a688d3f2a8219720379b8a3a Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Thu, 19 Dec 2024 23:29:29 -0800 Subject: [PATCH 01/35] Rename EP instance context as session_context --- .../providers/openvino/backend_manager.cc | 82 ++++----- .../core/providers/openvino/backend_manager.h | 7 +- .../core/providers/openvino/backend_utils.cc | 16 +- .../core/providers/openvino/backend_utils.h | 4 +- .../openvino/backends/backend_factory.cc | 6 +- .../openvino/backends/basic_backend.cc | 164 +++++++++--------- .../openvino/backends/basic_backend.h | 4 +- .../core/providers/openvino/contexts.h | 2 +- .../core/providers/openvino/ibackend.h | 2 +- .../openvino/openvino_execution_provider.cc | 64 +++---- .../openvino/openvino_execution_provider.h | 2 +- 11 files changed, 176 insertions(+), 177 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index b079e3794c4cc..1796256a23441 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -21,8 +21,8 @@ namespace onnxruntime { namespace openvino_ep { -GlobalContext& BackendManager::GetGlobalContext() { - return global_context_; +SessionContext& BackendManager::GetSessionContext() { + return session_context_; } ov::CompiledModel& BackendManager::GetOVCompiledModel() { @@ -30,17 +30,17 @@ ov::CompiledModel& BackendManager::GetOVCompiledModel() { return (ov_ptr); } -BackendManager::BackendManager(const GlobalContext& global_context, +BackendManager::BackendManager(const SessionContext& session_context, const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger, EPCtxHandler& ep_ctx_handle_) { - global_context_ = global_context; + session_context_ = session_context; - openvino_sdk_version_ = std::to_string(global_context_.OpenVINO_Version.at(0)) + "." 
+ - std::to_string(global_context_.OpenVINO_Version.at(1)); + openvino_sdk_version_ = std::to_string(session_context_.OpenVINO_Version.at(0)) + "." + + std::to_string(session_context_.OpenVINO_Version.at(1)); if (ep_ctx_handle_.CheckForOVEPCtxNode(subgraph, openvino_sdk_version_)) { - if (ep_ctx_handle_.ImportBlobFromEPCtxModel(subgraph, global_context_.ep_context_embed_mode) != Status::OK()) + if (ep_ctx_handle_.ImportBlobFromEPCtxModel(subgraph, session_context_.ep_context_embed_mode) != Status::OK()) ORT_THROW("Import blob from model failed"); } @@ -74,19 +74,19 @@ BackendManager::BackendManager(const GlobalContext& global_context, if (!ep_ctx_handle_.IsValidOVEPCtxGraph()) { model_proto = GetModelProtoFromFusedNode(fused_node, subgraph, logger); } - std::string device_type = openvino_ep::BackendManager::GetGlobalContext().device_type; + std::string device_type = session_context_.device_type; if (ModelHasSymbolicInputDims(subgraph)) { subgraph_context_.has_dynamic_input_shape = true; LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims"; - if ((GetGlobalContext().device_type.find("CPU") != std::string::npos || - GetGlobalContext().device_type.find("GPU") != std::string::npos) && - !GetGlobalContext().disable_dynamic_shapes) { + if ((session_context_.device_type.find("CPU") != std::string::npos || + session_context_.device_type.find("GPU") != std::string::npos) && + !session_context_.disable_dynamic_shapes) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Starting backend initialization. 
" << "Creating backend Dynamic Shapes"; try { concrete_backend_ = BackendFactory::MakeBackend(model_proto, - GetGlobalContext(), + session_context_, subgraph_context_, ep_ctx_handle_); } catch (std::string const& msg) { @@ -95,7 +95,7 @@ BackendManager::BackendManager(const GlobalContext& global_context, LOGS_DEFAULT(INFO) << "[OpenVINO-EP] " << "Backend created for graph " << subgraph_context_.subgraph_name; } else { - // Only cache model_proto in global to rewrite the model with input shapes at runtime. + // Only cache model_proto in session context to rewrite the model with input shapes at runtime. // For dynamic backend creation model_proto_ = std::move(model_proto); } @@ -109,13 +109,13 @@ BackendManager::BackendManager(const GlobalContext& global_context, // OV NPU plugin is supported with fallback to OV CPU upon compilation failures. try { concrete_backend_ = BackendFactory::MakeBackend(model_proto, - GetGlobalContext(), + session_context_, subgraph_context_, ep_ctx_handle_); } catch (const OnnxRuntimeException& ex) { std::string exception_str = ex.what(); bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos && - !GetGlobalContext().disable_cpu_fallback && + !session_context_.disable_cpu_fallback && !ep_ctx_handle_.IsValidOVEPCtxGraph(); #if defined(OPENVINO_DISABLE_NPU_FALLBACK) eligible_for_cpu_fallback = false; @@ -124,11 +124,11 @@ BackendManager::BackendManager(const GlobalContext& global_context, LOGS_DEFAULT(VERBOSE) << exception_str; LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." 
<< "Falling back to OV CPU for execution"; - GetGlobalContext().device_type = "CPU"; - GetGlobalContext().precision_str = "FP32"; + session_context_.device_type = "CPU"; + session_context_.precision_str = "FP32"; try { concrete_backend_ = BackendFactory::MakeBackend(model_proto, - GetGlobalContext(), + session_context_, subgraph_context_, ep_ctx_handle_); } catch (std::string const& msg) { @@ -162,7 +162,7 @@ BackendManager::BackendManager(const GlobalContext& global_context, } } } - if (global_context_.export_ep_ctx_blob && !ep_ctx_handle_.IsValidOVEPCtxGraph()) { + if (session_context_.export_ep_ctx_blob && !ep_ctx_handle_.IsValidOVEPCtxGraph()) { auto status = onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph, logger); if ((!status.IsOK())) { @@ -177,7 +177,7 @@ BackendManager::BackendManager(const GlobalContext& global_context, // the EPContext node. Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& graph_body_viewer, const logging::Logger& logger) { - if (GetGlobalContext().disable_dynamic_shapes && subgraph_context_.has_dynamic_input_shape) { + if (session_context_.disable_dynamic_shapes && subgraph_context_.has_dynamic_input_shape) { std::string exception_str = "Exporting dynamically compiled models at runtime is not supported. " "Cannot export blobs of dynamic models that request static shape inference. 
" @@ -189,19 +189,19 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie auto compiled_model = concrete_backend_->GetOVCompiledModel(); std::string graph_name = ""; // Epctx file path from SO is mapped to cache_dir variable for OVEP for readability - if (!global_context_.cache_dir.empty()) { - graph_name = global_context_.cache_dir; + if (!session_context_.cache_dir.empty()) { + graph_name = session_context_.cache_dir; } else { - graph_name = global_context_.onnx_model_path_name; + graph_name = session_context_.onnx_model_path_name; // Remove extension so we can append suffix to form the complete name of output graph - size_t dot = global_context_.onnx_model_path_name.find_last_of("."); + size_t dot = session_context_.onnx_model_path_name.find_last_of("."); graph_name = graph_name.substr(0, dot); if (dot != std::string::npos) graph_name += "_ctx.onnx"; } // If embed_mode, then pass on the serialized blob // If not embed_mode, dump the blob here and only pass on the path to the blob - if (global_context_.ep_context_embed_mode) { + if (session_context_.ep_context_embed_mode) { std::ostringstream model_blob_stream; compiled_model.export_model(model_blob_stream); model_blob_str = std::move(model_blob_stream).str(); @@ -223,7 +223,7 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie ORT_RETURN_IF_ERROR(ep_ctx_handle_.ExportEPCtxModel(graph_body_viewer, graph_name, logger, - global_context_.ep_context_embed_mode, + session_context_.ep_context_embed_mode, std::move(model_blob_str), openvino_sdk_version_)); @@ -342,8 +342,8 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, }; // QDQ stripping enabled only for the NPU - if (global_context_.device_type.find("NPU") != std::string::npos && - global_context_.enable_qdq_optimizer && + if (session_context_.device_type.find("NPU") != std::string::npos && + session_context_.enable_qdq_optimizer && IsQDQGraph(subgraph)) { 
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] QDQ optimization pass status: 1"; std::unique_ptr model; @@ -351,7 +351,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, auto model_proto = model->ToProto(); model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); print_model_proto_duration(); - DumpOpenVINOEPModel(global_context_.onnx_model_path_name, model_proto.get(), fused_node); + DumpOpenVINOEPModel(session_context_.onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; } else { @@ -361,7 +361,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); subgraph.ToProto(*model_proto->mutable_graph(), true, true); print_model_proto_duration(); - DumpOpenVINOEPModel(global_context_.onnx_model_path_name, model_proto.get(), fused_node); + DumpOpenVINOEPModel(session_context_.onnx_model_path_name, model_proto.get(), fused_node); return model_proto; } } @@ -453,13 +453,13 @@ void BackendManager::Compute(OrtKernelContext* context) { // by rewriting the model to static shaped model at runtime based on input shape. // disable_dynamic_shapes is always set to true for OV NPU plugin. 
if (subgraph_context_.has_dynamic_input_shape && - !GetGlobalContext().disable_dynamic_shapes && - (GetGlobalContext().device_type.find("CPU") != std::string::npos || - GetGlobalContext().device_type.find("GPU") != std::string::npos)) { + !session_context_.disable_dynamic_shapes && + (session_context_.device_type.find("CPU") != std::string::npos || + session_context_.device_type.find("GPU") != std::string::npos)) { concrete_backend_->Infer(context); } else if (subgraph_context_.has_dynamic_input_shape) { std::vector> tensor_shapes = GetInputTensorShapes(ctx); - auto key = MakeMapKeyString(tensor_shapes, GetGlobalContext().device_type); + auto key = MakeMapKeyString(tensor_shapes, session_context_.device_type); std::shared_ptr dynamic_backend; auto search = backend_map_.find(key); if (search == backend_map_.end()) { @@ -470,7 +470,7 @@ void BackendManager::Compute(OrtKernelContext* context) { auto modelproto_with_concrete_shapes = ReWriteInputShapeInfo(*model_proto_, tensor_shapes); try { dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes, - GetGlobalContext(), + session_context_, subgraph_context_, ep_ctx_handle_); } catch (const OnnxRuntimeException& ex) { @@ -479,17 +479,17 @@ void BackendManager::Compute(OrtKernelContext* context) { LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."; ORT_THROW(ex.what()); #else - if (GetGlobalContext().device_type.find("NPU") != std::string::npos && - !GetGlobalContext().disable_cpu_fallback) { + if (session_context_.device_type.find("NPU") != std::string::npos && + !session_context_.disable_cpu_fallback) { LOGS_DEFAULT(WARNING) << ex.what(); LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." 
<< "Falling back to OV CPU for execution"; - GetGlobalContext().device_type = "CPU"; - GetGlobalContext().precision_str = "FP32"; - key = MakeMapKeyString(tensor_shapes, GetGlobalContext().device_type); + session_context_.device_type = "CPU"; + session_context_.precision_str = "FP32"; + key = MakeMapKeyString(tensor_shapes, session_context_.device_type); try { dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes, - GetGlobalContext(), + session_context_, subgraph_context_, ep_ctx_handle_); } catch (std::string const& msg) { diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index 5ec462afd9d01..7ae647188976d 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -19,15 +19,14 @@ namespace openvino_ep { // Singleton class that manages all the backends class BackendManager { public: - BackendManager(const GlobalContext& global_context, + BackendManager(const SessionContext& session_context, const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger, EPCtxHandler& ctx_handle); void Compute(OrtKernelContext* context); void ShutdownBackendManager(); - void SetGlobalCotext(const GlobalContext& global_context); - GlobalContext& GetGlobalContext(); + SessionContext& GetSessionContext(); Status ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger); ov::CompiledModel& GetOVCompiledModel(); @@ -52,7 +51,7 @@ class BackendManager { std::shared_ptr concrete_backend_; std::map> backend_map_; SubGraphContext subgraph_context_; - GlobalContext global_context_; + SessionContext session_context_; EPCtxHandler ep_ctx_handle_{}; std::string openvino_sdk_version_{}; }; diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 
b97736f2e124d..6c28db5803cb1 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -40,17 +40,17 @@ struct static_cast_int64 { }; std::shared_ptr -CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context, +CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const SessionContext& session_context, std::map>& const_outputs_map) { if (IsCILogEnabled()) { std::cout << "CreateNgraphFunc" << std::endl; } const std::string model = model_proto.SerializeAsString(); try { - auto ov_model = global_context.ie_core.ReadModel(model, global_context.onnx_model_path_name); + auto ov_model = session_context.ie_core.ReadModel(model, session_context.onnx_model_path_name); // Check for Constant Folding - if ((global_context.device_type != "NPU") && !global_context.is_wholly_supported_graph) { + if ((session_context.device_type != "NPU") && !session_context.is_wholly_supported_graph) { ov::pass::ConstantFolding pass_const_obj; pass_const_obj.run_on_model(ov_model); auto& results = const_cast(ov_model.get()->get_results()); @@ -129,13 +129,13 @@ GetOutputTensor(Ort::KernelContext& context, return context.GetOutput(index, output_shape.get(), num_dims); } -int GetFirstAvailableDevice(GlobalContext& global_context) { +int GetFirstAvailableDevice(SessionContext& session_context) { int i = 0; // Get the first available VAD-M device and set the device to busy while (i < 8) { - bool device = global_context.deviceAvailableList[i]; + bool device = session_context.deviceAvailableList[i]; if (device) { - global_context.deviceAvailableList[i] = false; + session_context.deviceAvailableList[i] = false; break; } i++; @@ -144,9 +144,9 @@ int GetFirstAvailableDevice(GlobalContext& global_context) { // make all remaining devices free if (i == 8) { i = 0; - global_context.deviceAvailableList[i] = false; + session_context.deviceAvailableList[i] = false; for (int j = 1; j < 8; j++) { - 
global_context.deviceAvailableList[j] = true; + session_context.deviceAvailableList[j] = true; } } return i; diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 9d58e1ca73abb..4a500a3f146f7 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -34,7 +34,7 @@ bool IsDebugEnabled(); // Internal diagnostic function. bool IsCILogEnabled(); -int GetFirstAvailableDevice(GlobalContext& global_context); +int GetFirstAvailableDevice(SessionContext& session_context); void FillOutputsWithConstantData(std::shared_ptr node, Ort::UnownedValue& out_tensor); @@ -62,7 +62,7 @@ void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor, std::shared_ptr CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, - const GlobalContext& global_context, + const SessionContext& session_context, std::map>& const_outputs_map); void printPerformanceCounts(const std::vector& performanceMap, diff --git a/onnxruntime/core/providers/openvino/backends/backend_factory.cc b/onnxruntime/core/providers/openvino/backends/backend_factory.cc index b7e4aed6e7e18..4b3e57d087381 100644 --- a/onnxruntime/core/providers/openvino/backends/backend_factory.cc +++ b/onnxruntime/core/providers/openvino/backends/backend_factory.cc @@ -12,10 +12,10 @@ namespace openvino_ep { std::shared_ptr BackendFactory::MakeBackend(std::unique_ptr& model_proto, - GlobalContext& global_context, + SessionContext& session_context, const SubGraphContext& subgraph_context, EPCtxHandler& ep_ctx_handle) { - std::string type = global_context.device_type; + std::string type = session_context.device_type; if (type == "CPU" || type.find("GPU") != std::string::npos || type.find("NPU") != std::string::npos || type.find("HETERO") != std::string::npos || @@ -23,7 +23,7 @@ BackendFactory::MakeBackend(std::unique_ptr& model_p type.find("AUTO") != std::string::npos) { std::shared_ptr 
concrete_backend_; try { - concrete_backend_ = std::make_shared(model_proto, global_context, subgraph_context, ep_ctx_handle); + concrete_backend_ = std::make_shared(model_proto, session_context, subgraph_context, ep_ctx_handle); } catch (std::string const& msg) { ORT_THROW(msg); } diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 435ca83ff69d4..7dbd8bd5e979b 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -21,11 +21,11 @@ namespace openvino_ep { using namespace backend_utils; BasicBackend::BasicBackend(std::unique_ptr& model_proto, - GlobalContext& global_context, + SessionContext& session_context, const SubGraphContext& subgraph_context, EPCtxHandler& ep_ctx_handle) - : global_context_(global_context), subgraph_context_(subgraph_context) { - std::string& hw_target = global_context_.device_type; + : session_context_(session_context), subgraph_context_(subgraph_context) { + std::string& hw_target = session_context_.device_type; is_ep_ctx_graph_ = ep_ctx_handle.IsValidOVEPCtxGraph(); @@ -59,77 +59,77 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr } try { - std::string dev_prec = global_context.device_type + "_" + global_context_.precision_str; + std::string dev_prec = session_context.device_type + "_" + session_context_.precision_str; - if (global_context.is_wholly_supported_graph) { // Full graph is supported + if (session_context.is_wholly_supported_graph) { // Full graph is supported #if defined(IO_BUFFER_ENABLED) if (is_ep_ctx_graph_) { std::istringstream model_stream(ep_ctx_handle.GetModelBlobString()); - exe_network_ = global_context_.ie_core.ImportModel(model_stream, + exe_network_ = session_context_.ie_core.ImportModel(model_stream, remote_context_, subgraph_context_.subgraph_name); - } else if ((global_context.device_type.find("GPU") != std::string::npos) && 
- (global_context_.context != nullptr)) { + } else if ((session_context.device_type.find("GPU") != std::string::npos) && + (session_context_.context != nullptr)) { LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled"; - cl_context ctx = static_cast(global_context_.context); - remote_context_ = new ov::intel_gpu::ocl::ClContext(global_context_.ie_core.Get(), ctx); - ie_cnn_network_ = CreateOVModel(model_proto, global_context_, subgraph_context_, const_outputs_map_); - exe_network_ = global_context_.ie_core.CompileModel( + cl_context ctx = static_cast(session_context_.context); + remote_context_ = new ov::intel_gpu::ocl::ClContext(session_context_.ie_core.Get(), ctx); + ie_cnn_network_ = CreateOVModel(model_proto, session_context_, subgraph_context_, const_outputs_map_); + exe_network_ = session_context_.ie_core.CompileModel( ie_cnn_network_, remote_context_, subgraph_context_.subgraph_name); } else { - ie_cnn_network_ = CreateOVModel(model_proto, global_context_, subgraph_context_, const_outputs_map_); - exe_network_ = global_context_.ie_core.CompileModel( + ie_cnn_network_ = CreateOVModel(model_proto, session_context_, subgraph_context_, const_outputs_map_); + exe_network_ = session_context_.ie_core.CompileModel( ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); } #else // !IO_BUFFER_ENABLED - std::string prec_str = (global_context_.precision_str != "ACCURACY") ? global_context_.precision_str : global_context_.model_precision; + std::string prec_str = (session_context_.precision_str != "ACCURACY") ? 
session_context_.precision_str : session_context_.model_precision; if (is_ep_ctx_graph_) { // If the blob is held in an EPContext node, then skip FE+Compile // and directly move on to creating a backend with the executable blob - exe_network_ = global_context_.ie_core.ImportModel(ep_ctx_handle.GetModelBlobStream(), + exe_network_ = session_context_.ie_core.ImportModel(ep_ctx_handle.GetModelBlobStream(), hw_target, device_config, - global_context_.ep_context_embed_mode, + session_context_.ep_context_embed_mode, subgraph_context_.subgraph_name); - } else if (global_context_.export_ep_ctx_blob && + } else if (session_context_.export_ep_ctx_blob && hw_target.find("NPU") != std::string::npos && - !global_context_.has_external_weights) { + !session_context_.has_external_weights) { std::shared_ptr ov_model; { const std::string model = model_proto->SerializeAsString(); if (!subgraph_context.has_dynamic_input_shape) { delete model_proto.release(); } - ov_model = global_context_.ie_core.Get().read_model(model, ov::Tensor()); + ov_model = session_context_.ie_core.Get().read_model(model, ov::Tensor()); } - exe_network_ = OVExeNetwork(global_context_.ie_core.Get().compile_model(ov_model, hw_target, device_config)); - } else if (!global_context_.has_external_weights && + exe_network_ = OVExeNetwork(session_context_.ie_core.Get().compile_model(ov_model, hw_target, device_config)); + } else if (!session_context_.has_external_weights && (!subgraph_context_.has_dynamic_input_shape) && ((hw_target.find("AUTO") == std::string::npos) || - (global_context_.OpenVINO_Version.at(0) >= 2024 && global_context_.OpenVINO_Version.at(1) > 2))) { + (session_context_.OpenVINO_Version.at(0) >= 2024 && session_context_.OpenVINO_Version.at(1) > 2))) { // Optimized OV compile_model API is supported with AUTO from version 2024.3 and above // Inputs with static dimenstions const std::string model = model_proto->SerializeAsString(); - exe_network_ = global_context_.ie_core.CompileModel(model, + 
exe_network_ = session_context_.ie_core.CompileModel(model, hw_target, device_config, subgraph_context_.subgraph_name); } else { // For all other types use ov::Model Type - auto ov_model = CreateOVModel(*model_proto, global_context_, const_outputs_map_); - exe_network_ = global_context_.ie_core.CompileModel( + auto ov_model = CreateOVModel(*model_proto, session_context_, const_outputs_map_); + exe_network_ = session_context_.ie_core.CompileModel( ov_model, hw_target, device_config, subgraph_context_.subgraph_name); } #endif } else { // Full graph is not supported - auto ov_model = CreateOVModel(*model_proto, global_context_, const_outputs_map_); - exe_network_ = global_context_.ie_core.CompileModel( + auto ov_model = CreateOVModel(*model_proto, session_context_, const_outputs_map_); + exe_network_ = session_context_.ie_core.CompileModel( ov_model, hw_target, device_config, subgraph_context_.subgraph_name); } LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin"; } catch (const char* msg) { ORT_THROW(msg); } - int num_infer_req = (global_context_.num_of_threads > 0) ? global_context_.num_of_threads : 1; + int num_infer_req = (session_context_.num_of_threads > 0) ? 
session_context_.num_of_threads : 1; inferRequestsQueue_ = std::unique_ptr(new InferRequestsQueue(exe_network_, num_infer_req)); } @@ -146,21 +146,21 @@ bool BasicBackend::ValidateSubgraph(std::map= 2024) { + if (session_context_.precision_str.find("ACCURACY") != std::string::npos && + session_context_.device_type.find("GPU") != std::string::npos) { + if (session_context_.OpenVINO_Version.at(0) >= 2024) { device_config.emplace(ov::hint::inference_precision(ov::element::undefined)); device_config.emplace(ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY)); } else { - if (global_context_.model_precision != "") - device_config.emplace(ov::hint::inference_precision(global_context_.model_precision)); + if (session_context_.model_precision != "") + device_config.emplace(ov::hint::inference_precision(session_context_.model_precision)); } } #ifndef NDEBUG @@ -171,10 +171,10 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { // Set a priority level for the current workload for preemption; default priority is "DEFAULT" // CPU Plugin doesn't support workload priority - if (global_context_.device_type.find("CPU") == std::string::npos) - device_config.emplace(ov::hint::model_priority(global_context_.model_priority)); + if (session_context_.device_type.find("CPU") == std::string::npos) + device_config.emplace(ov::hint::model_priority(session_context_.model_priority)); - if (global_context_.device_type.find("NPU") != std::string::npos) { + if (session_context_.device_type.find("NPU") != std::string::npos) { std::pair device_property; device_property = std::make_pair("NPU_COMPILER_TYPE", "DRIVER"); @@ -184,16 +184,16 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { } device_config.emplace(ov::device::properties("NPU", device_property)); #if (((OPENVINO_VERSION_MAJOR == 2024) && (OPENVINO_VERSION_MINOR > 3)) || (OPENVINO_VERSION_MAJOR > 2024)) - if (global_context_.export_ep_ctx_blob) { - 
global_context_.ie_core.Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true)); + if (session_context_.export_ep_ctx_blob) { + session_context_.ie_core.Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true)); } #endif } - if (!global_context_.load_config.empty()) { - const std::map& target_config = global_context_.load_config; + if (!session_context_.load_config.empty()) { + const std::map& target_config = session_context_.load_config; - if (global_context_.device_type.find("NPU") != std::string::npos) { + if (session_context_.device_type.find("NPU") != std::string::npos) { auto npuw_config = target_config.at("NPU"); // Check if "NPU_USE_NPUW" exists and is set to "YES" @@ -253,7 +253,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { continue; } if (is_supported_and_mutable(key, supported_properties)) { - global_context_.ie_core.Get().set_property(device, ov::AnyMap{{key, value}}); + session_context_.ie_core.Get().set_property(device, ov::AnyMap{{key, value}}); } else { LOGS_DEFAULT(WARNING) << "WARNING: Property \"" << key << "\" is either unsupported in current OpenVINO version" @@ -264,26 +264,26 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { }; // Check if the device type is AUTO, HETERO, or MULTI - if (global_context_.device_type.find("AUTO") == 0 || - global_context_.device_type.find("HETERO") == 0 || - global_context_.device_type.find("MULTI") == 0) { + if (session_context_.device_type.find("AUTO") == 0 || + session_context_.device_type.find("HETERO") == 0 || + session_context_.device_type.find("MULTI") == 0) { // Parse individual devices (e.g., "AUTO:CPU,GPU" -> ["CPU", "GPU"]) - auto individual_devices = parse_individual_devices(global_context_.device_type); + auto individual_devices = parse_individual_devices(session_context_.device_type); // Set properties only for individual devices (e.g., "CPU", "GPU") for (const std::string& device : individual_devices) { if 
(target_config.count(device)) { // Get supported properties for each individual device - auto device_properties = global_context_.ie_core.Get().get_property(device, ov::supported_properties); + auto device_properties = session_context_.ie_core.Get().get_property(device, ov::supported_properties); // Set properties for the device set_target_properties(device, target_config.at(device), device_properties); } } } else { - if (target_config.count(global_context_.device_type)) { - auto supported_properties = global_context_.ie_core.Get().get_property(global_context_.device_type, + if (target_config.count(session_context_.device_type)) { + auto supported_properties = session_context_.ie_core.Get().get_property(session_context_.device_type, ov::supported_properties); - set_target_properties(global_context_.device_type, - target_config.at(global_context_.device_type), supported_properties); + set_target_properties(session_context_.device_type, + target_config.at(session_context_.device_type), supported_properties); } } } @@ -293,21 +293,21 @@ void BasicBackend::EnableCaching(ov::AnyMap& device_config) { // cache_dir argument has no effect when working with an embed-mode EPContext Graph if (is_ep_ctx_graph_) return; - if (!global_context_.cache_dir.empty() && !global_context_.export_ep_ctx_blob) { + if (!session_context_.cache_dir.empty() && !session_context_.export_ep_ctx_blob) { LOGS_DEFAULT(INFO) << log_tag << "Enables Caching"; - if (global_context_.device_type.find("AUTO:GPU") != std::string::npos) { + if (session_context_.device_type.find("AUTO:GPU") != std::string::npos) { std::pair device_property; - device_property = std::make_pair("CACHE_DIR", global_context_.cache_dir); + device_property = std::make_pair("CACHE_DIR", session_context_.cache_dir); device_config.emplace(ov::device::properties("GPU", device_property)); } else { - global_context_.ie_core.SetCache(global_context_.cache_dir); + session_context_.ie_core.SetCache(session_context_.cache_dir); } } } void 
BasicBackend::EnableGPUThrottling(ov::AnyMap& device_config) { - if (global_context_.enable_opencl_throttling == true && - global_context_.device_type.find("GPU") != std::string::npos) { + if (session_context_.enable_opencl_throttling == true && + session_context_.device_type.find("GPU") != std::string::npos) { LOGS_DEFAULT(INFO) << log_tag << "Enabled OpenCL queue throttling for GPU device"; std::pair device_property; device_property = std::make_pair("PLUGIN_THROTTLE", "1"); @@ -318,28 +318,28 @@ void BasicBackend::EnableGPUThrottling(ov::AnyMap& device_config) { void BasicBackend::EnableStreams() { // Return silently for NPU as it's currently treated as a read-only flag by the NPU plugin // and throws an exception for the same - if (global_context_.device_type.find("NPU") != std::string::npos) + if (session_context_.device_type.find("NPU") != std::string::npos) return; // Streams can be set only if the device is not one of AUTO, MULTI, or HETERO // Throw an exception if the user tries to set num_streams for these devices - if ((global_context_.device_type.find("MULTI") != std::string::npos) || - (global_context_.device_type.find("HETERO") != std::string::npos) || - (global_context_.device_type.find("AUTO") != std::string::npos)) { - if (global_context_.num_streams != 1) { + if ((session_context_.device_type.find("MULTI") != std::string::npos) || + (session_context_.device_type.find("HETERO") != std::string::npos) || + (session_context_.device_type.find("AUTO") != std::string::npos)) { + if (session_context_.num_streams != 1) { ORT_THROW(log_tag + "Cannot set NUM_STREAMS to " + - std::to_string(global_context_.num_streams) + " for device " + global_context_.device_type); + std::to_string(session_context_.num_streams) + " for device " + session_context_.device_type); } // Do nothing } else { - global_context_.ie_core.SetStreams(global_context_.device_type, global_context_.num_streams); + session_context_.ie_core.SetStreams(session_context_.device_type, 
session_context_.num_streams); } } void BasicBackend::SetNumThreads(ov::AnyMap& device_config) { // inference_num_threads is applicable only for the CPU device - if (global_context_.device_type.find("CPU") != std::string::npos) - device_config.emplace(ov::inference_num_threads(global_context_.num_of_threads)); + if (session_context_.device_type.find("CPU") != std::string::npos) + device_config.emplace(ov::inference_num_threads(session_context_.num_of_threads)); } // Starts an asynchronous inference request for data in slice indexed by batch_slice_idx on @@ -370,9 +370,9 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque } size_t batch_slice_idx = 0; if (subgraph_context_.has_dynamic_input_shape && - !global_context_.disable_dynamic_shapes && - (global_context_.device_type.find("CPU") != std::string::npos || - global_context_.device_type.find("GPU") != std::string::npos)) { + !session_context_.disable_dynamic_shapes && + (session_context_.device_type.find("CPU") != std::string::npos || + session_context_.device_type.find("GPU") != std::string::npos)) { auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name)); auto tensor_info = tensor.GetTensorTypeAndShapeInfo(); auto tensor_shape = tensor_info.GetShape(); @@ -387,7 +387,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque const auto& input = graph_input_info.at(input_idx); OVTensorPtr tensor_ptr; // avoid input copies on the CPU device - if (global_context_.device_type.find("CPU") != std::string::npos) { + if (session_context_.device_type.find("CPU") != std::string::npos) { tensor_ptr = std::make_shared(input.get_element_type(), input_tensor_shape, (void*)tensor_data); } else { @@ -401,8 +401,8 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque ORT_THROW(msg); } } else { - if ((global_context_.device_type.find("CPU") != std::string::npos || - global_context_.device_type.find("GPU") != 
std::string::npos)) { + if ((session_context_.device_type.find("CPU") != std::string::npos || + session_context_.device_type.find("GPU") != std::string::npos)) { OVTensorPtr graph_input_blob; try { graph_input_blob = infer_request->GetTensor(input_name); @@ -434,7 +434,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque } input_idx++; } - if (global_context_.device_type.find("NPU") != std::string::npos) { + if (session_context_.device_type.find("NPU") != std::string::npos) { // Set the output blob as remote blob auto graph_output_info = exe_network_.Get().outputs(); auto output_idx = 0; @@ -628,8 +628,8 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe " doesn't exist in the " "list of OpenVINO output tensor names"); } - if ((global_context_.device_type.find("CPU") != std::string::npos || - global_context_.device_type.find("GPU") != std::string::npos)) { + if ((session_context_.device_type.find("CPU") != std::string::npos || + session_context_.device_type.find("GPU") != std::string::npos)) { try { graph_output_blob = infer_request->GetTensor(output_name); } catch (const char* msg) { @@ -703,8 +703,8 @@ void BasicBackend::Infer(OrtKernelContext* ctx) { OVInferRequestPtr infer_request; infer_request = inferRequestsQueue_->getIdleRequest(); #ifdef IO_BUFFER_ENABLED - if ((global_context_.device_type.find("GPU") != std::string::npos) && - (global_context_.context != nullptr) && global_context_.is_wholly_supported_graph) { + if ((session_context_.device_type.find("GPU") != std::string::npos) && + (session_context_.context != nullptr) && session_context_.is_wholly_supported_graph) { try { StartRemoteAsyncInference(context, infer_request); } catch (std::string const& msg) { @@ -748,7 +748,7 @@ void BasicBackend::Infer(OrtKernelContext* ctx) { #ifndef IO_BUFFER_ENABLED // Printing performance counts is disabled when IO_BUFFER_ENABLED if (openvino_ep::backend_utils::IsDebugEnabled()) { 
inferRequestsQueue_->printstatus(); // Printing the elements of infer_requests_ vector pool only in debug mode - std::string& hw_target = global_context_.device_type; + std::string& hw_target = session_context_.device_type; printPerformanceCounts(std::move(infer_request_), std::cout, hw_target); } #endif diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 3fcf6e4384d52..0aab336ce909f 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -30,7 +30,7 @@ class InferRequestsQueue; class BasicBackend : public IBackend { public: BasicBackend(std::unique_ptr& model_proto, - GlobalContext& global_context, + SessionContext& session_context, const SubGraphContext& subgraph_context, EPCtxHandler& ep_ctx_handle); @@ -55,7 +55,7 @@ class BasicBackend : public IBackend { void CompleteAsyncInference(Ort::KernelContext& context, std::shared_ptr infer_request); - GlobalContext& global_context_; + SessionContext& session_context_; SubGraphContext subgraph_context_; mutable std::mutex compute_lock_; OVExeNetwork exe_network_; diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 4f970bc7bc287..23256b8df6fd0 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -13,7 +13,7 @@ namespace onnxruntime { namespace openvino_ep { // Holds context applicable to the entire EP instance. 
-struct GlobalContext { +struct SessionContext { OVCore ie_core; bool is_wholly_supported_graph = false; bool enable_opencl_throttling = false; diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 7a2d6f4e8cd69..6d4aad3aec919 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -21,7 +21,7 @@ class BackendFactory { public: static std::shared_ptr MakeBackend(std::unique_ptr& model_proto, - GlobalContext& global_context, + SessionContext& session_context, const SubGraphContext& subgraph_context, EPCtxHandler& ctx_handle); }; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 72a188108adef..e5ffde62eeedb 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -25,28 +25,28 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv : IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider} { InitProviderOrtApi(); - global_context_ = std::make_unique(); - global_context_->device_type = info.device_type_; - global_context_->precision_str = info.precision_; - global_context_->cache_dir = info.cache_dir_; - global_context_->load_config = info.load_config_; - global_context_->model_priority = info.model_priority_; - global_context_->num_streams = info.num_streams_; - global_context_->context = info.context_; - global_context_->enable_opencl_throttling = info.enable_opencl_throttling_; - global_context_->disable_dynamic_shapes = info.disable_dynamic_shapes_; - global_context_->num_of_threads = info.num_of_threads_; - global_context_->OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; - global_context_->export_ep_ctx_blob = info.export_ep_ctx_blob_; - global_context_->enable_qdq_optimizer = 
info.enable_qdq_optimizer_; - global_context_->disable_cpu_fallback = info.disable_cpu_fallback_; - global_context_->ep_context_embed_mode = info.so_epctx_embed_mode_; + session_context_ = std::make_unique(); + session_context_->device_type = info.device_type_; + session_context_->precision_str = info.precision_; + session_context_->cache_dir = info.cache_dir_; + session_context_->load_config = info.load_config_; + session_context_->model_priority = info.model_priority_; + session_context_->num_streams = info.num_streams_; + session_context_->context = info.context_; + session_context_->enable_opencl_throttling = info.enable_opencl_throttling_; + session_context_->disable_dynamic_shapes = info.disable_dynamic_shapes_; + session_context_->num_of_threads = info.num_of_threads_; + session_context_->OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; + session_context_->export_ep_ctx_blob = info.export_ep_ctx_blob_; + session_context_->enable_qdq_optimizer = info.enable_qdq_optimizer_; + session_context_->disable_cpu_fallback = info.disable_cpu_fallback_; + session_context_->ep_context_embed_mode = info.so_epctx_embed_mode_; // to check if target device is available // using ie_core capability GetAvailableDevices to fetch list of devices plugged in if (info.cache_dir_.empty()) { bool device_found = false; - std::vector available_devices = global_context_->ie_core.GetAvailableDevices(); + std::vector available_devices = session_context_->ie_core.GetAvailableDevices(); // Checking for device_type configuration if (info.device_type_ != "") { if (info.device_type_.find("HETERO") != std::string::npos || @@ -85,8 +85,8 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/) const { std::vector> result; - std::string openvino_sdk_version = std::to_string(global_context_->OpenVINO_Version.at(0)) + "." 
+ - std::to_string(global_context_->OpenVINO_Version.at(1)); + std::string openvino_sdk_version = std::to_string(session_context_->OpenVINO_Version.at(0)) + "." + + std::to_string(session_context_->OpenVINO_Version.at(1)); // Check for valid ctx node and maintain state for validity if (ep_ctx_handle_.CheckForOVEPCtxNode(graph_viewer, std::move(openvino_sdk_version))) @@ -97,20 +97,20 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, if (!(GetEnvironmentVar("ORT_OPENVINO_ENABLE_CI_LOG").empty())) { std::cout << "In the OpenVINO EP" << std::endl; } - global_context_->onnx_model_path_name = graph_viewer.ModelPath().string(); + session_context_->onnx_model_path_name = graph_viewer.ModelPath().string(); - global_context_->onnx_opset_version = + session_context_->onnx_opset_version = graph_viewer.DomainToVersionMap().at(kOnnxDomain); - global_context_->model_precision = [&](const GraphViewer& graph_viewer) { + session_context_->model_precision = [&](const GraphViewer& graph_viewer) { // return empty if graph has no inputs or if types are not one of FP32/FP16 // else assume the type of the first input if (graph_viewer.GetInputs().empty()) { return ""; } else { auto input_type = graph_viewer.GetInputs()[0]->TypeAsProto()->tensor_type().elem_type(); - if (global_context_->precision_str == "ACCURACY" && - global_context_->device_type.find("GPU") != std::string::npos) { + if (session_context_->precision_str == "ACCURACY" && + session_context_->device_type.find("GPU") != std::string::npos) { if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) { return "FP32"; } else if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16) { @@ -122,12 +122,12 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, }(graph_viewer); openvino_ep::GetCapability obj(graph_viewer, - global_context_->device_type, - global_context_->enable_qdq_optimizer); + session_context_->device_type, + 
session_context_->enable_qdq_optimizer); result = obj.Execute(); - global_context_->is_wholly_supported_graph = obj.IsWhollySupportedGraph(); - global_context_->has_external_weights = obj.HasExternalWeights(); + session_context_->is_wholly_supported_graph = obj.IsWhollySupportedGraph(); + session_context_->has_external_weights = obj.HasExternalWeights(); return result; } @@ -141,14 +141,14 @@ common::Status OpenVINOExecutionProvider::Compile( NodeComputeInfo compute_info; - global_context_->use_api_2 = true; + session_context_->use_api_2 = true; // During backend creation, we check if user wants to use precompiled blob onnx model or the original model // For precompiled blob, directly load the model instead of compiling the model // For original model, check if the user wants to export a model with pre-compiled blob std::shared_ptr backend_manager = - std::make_shared(*global_context_, + std::make_shared(*session_context_, fused_node, graph_body_viewer, *GetLogger(), @@ -189,11 +189,11 @@ common::Status OpenVINOExecutionProvider::Compile( #ifdef USE_OVEP_NPU_MEMORY std::vector OpenVINOExecutionProvider::CreatePreferredAllocators() { - if (global_context_->device_type.find("NPU") != std::string::npos) { + if (session_context_->device_type.find("NPU") != std::string::npos) { AllocatorCreationInfo npu_allocator_info{ [this](OrtDevice::DeviceId device_id) { return std::make_unique( - global_context_->ie_core.Get(), + session_context_->ie_core.Get(), OrtDevice::NPU, device_id, OpenVINO_RT_NPU); diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index d5c22a4e2a9e4..26a67ba04756b 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -198,7 +198,7 @@ class OpenVINOExecutionProvider : public IExecutionProvider { std::vector CreatePreferredAllocators() override; #endif private: - 
std::unique_ptr global_context_; + std::unique_ptr session_context_; std::shared_ptr backend_manager_; openvino_ep::EPCtxHandler ep_ctx_handle_{}; }; From f170c88bbea5b0461251c5714d5f89225ce8217a Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Wed, 8 Jan 2025 00:56:07 -0800 Subject: [PATCH 02/35] Add support for GetEpContextNodes --- .../providers/openvino/backend_manager.cc | 132 +++++++++-------- .../core/providers/openvino/backend_manager.h | 3 +- .../core/providers/openvino/backend_utils.cc | 6 +- .../core/providers/openvino/backend_utils.h | 1 + .../openvino/backends/backend_factory.cc | 4 +- .../openvino/backends/basic_backend.cc | 55 ++++--- .../openvino/backends/basic_backend.h | 3 +- .../core/providers/openvino/contexts.h | 11 +- .../core/providers/openvino/ibackend.h | 5 +- .../openvino/onnx_ctx_model_helper.cc | 132 +++++++++-------- .../openvino/onnx_ctx_model_helper.h | 28 ++-- .../openvino/openvino_execution_provider.cc | 137 ++++++++---------- .../openvino/openvino_execution_provider.h | 13 +- .../core/providers/openvino/ov_interface.cc | 13 +- .../core/providers/openvino/ov_interface.h | 3 +- .../openvino/ov_versions/capability.cc | 56 +++++-- .../openvino/ov_versions/capability.h | 5 +- .../shared_library/provider_wrappedtypes.h | 3 +- .../core/session/provider_bridge_ort.cc | 3 +- 19 files changed, 334 insertions(+), 279 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 1796256a23441..04c1ffebb838d 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -10,8 +10,10 @@ #include #include #include +#include #include "core/providers/shared_library/provider_api.h" +#include "core/providers/openvino/ov_versions/capability.h" #include "core/providers/openvino/contexts.h" #include "core/providers/openvino/backend_manager.h" #include "core/providers/openvino/ibackend.h" @@ 
-34,15 +36,35 @@ BackendManager::BackendManager(const SessionContext& session_context, const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger, - EPCtxHandler& ep_ctx_handle_) { - session_context_ = session_context; - - openvino_sdk_version_ = std::to_string(session_context_.OpenVINO_Version.at(0)) + "." + - std::to_string(session_context_.OpenVINO_Version.at(1)); - if (ep_ctx_handle_.CheckForOVEPCtxNode(subgraph, openvino_sdk_version_)) { - if (ep_ctx_handle_.ImportBlobFromEPCtxModel(subgraph, session_context_.ep_context_embed_mode) != Status::OK()) - ORT_THROW("Import blob from model failed"); - } + EPCtxHandler& ep_ctx_handle) : ep_ctx_handle_(ep_ctx_handle), session_context_(session_context) { + subgraph_context_.is_ep_ctx_graph = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(subgraph); + + subgraph_context_.model_precision = [&](const GraphViewer& graph_viewer) { + // return empty if graph has no inputs or if types are not one of FP32/FP16 + // else assume the type of the first input + if (graph_viewer.GetInputs().empty()) { + return ""; + } else { + auto input_type = graph_viewer.GetInputs()[0]->TypeAsProto()->tensor_type().elem_type(); + if (session_context_.precision_str == "ACCURACY" && + session_context_.device_type.find("GPU") != std::string::npos) { + if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) { + return "FP32"; + } else if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16) { + return "FP16"; + } + } + } + return ""; + }(subgraph); + + openvino_ep::GetCapability obj(ep_ctx_handle_, + subgraph, + session_context_.device_type, + session_context_.enable_qdq_optimizer); + std::ignore = obj.Execute(); + subgraph_context_.is_wholly_supported_graph = obj.IsWhollySupportedGraph(); + subgraph_context_.has_external_weights = obj.HasExternalWeights(); // Save the indexes of graph inputs among fused_node's inputDefs // (which also contains 
initializers). @@ -70,8 +92,11 @@ BackendManager::BackendManager(const SessionContext& session_context, i++; } subgraph_context_.subgraph_name = fused_node.Name(); + ptr_stream_t model_stream; std::unique_ptr model_proto; - if (!ep_ctx_handle_.IsValidOVEPCtxGraph()) { + if (subgraph_context_.is_ep_ctx_graph) { + model_stream = ep_ctx_handle_.GetModelBlobStream(subgraph); + } else { model_proto = GetModelProtoFromFusedNode(fused_node, subgraph, logger); } std::string device_type = session_context_.device_type; @@ -88,7 +113,7 @@ BackendManager::BackendManager(const SessionContext& session_context, concrete_backend_ = BackendFactory::MakeBackend(model_proto, session_context_, subgraph_context_, - ep_ctx_handle_); + model_stream); } catch (std::string const& msg) { ORT_THROW(msg); } @@ -111,12 +136,12 @@ BackendManager::BackendManager(const SessionContext& session_context, concrete_backend_ = BackendFactory::MakeBackend(model_proto, session_context_, subgraph_context_, - ep_ctx_handle_); + model_stream); } catch (const OnnxRuntimeException& ex) { std::string exception_str = ex.what(); bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos && !session_context_.disable_cpu_fallback && - !ep_ctx_handle_.IsValidOVEPCtxGraph(); + !subgraph_context_.is_ep_ctx_graph; #if defined(OPENVINO_DISABLE_NPU_FALLBACK) eligible_for_cpu_fallback = false; #else @@ -130,7 +155,7 @@ BackendManager::BackendManager(const SessionContext& session_context, concrete_backend_ = BackendFactory::MakeBackend(model_proto, session_context_, subgraph_context_, - ep_ctx_handle_); + model_stream); } catch (std::string const& msg) { ORT_THROW(msg); } @@ -162,7 +187,7 @@ BackendManager::BackendManager(const SessionContext& session_context, } } } - if (session_context_.export_ep_ctx_blob && !ep_ctx_handle_.IsValidOVEPCtxGraph()) { + if (session_context_.export_ep_ctx_blob && !subgraph_context_.is_ep_ctx_graph) { auto status = 
onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph, logger); if ((!status.IsOK())) { @@ -185,23 +210,12 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie ORT_THROW(exception_str); } - std::string model_blob_str; - auto compiled_model = concrete_backend_->GetOVCompiledModel(); - std::string graph_name = ""; - // Epctx file path from SO is mapped to cache_dir variable for OVEP for readability - if (!session_context_.cache_dir.empty()) { - graph_name = session_context_.cache_dir; - } else { - graph_name = session_context_.onnx_model_path_name; - // Remove extension so we can append suffix to form the complete name of output graph - size_t dot = session_context_.onnx_model_path_name.find_last_of("."); - graph_name = graph_name.substr(0, dot); - if (dot != std::string::npos) graph_name += "_ctx.onnx"; - } - // If embed_mode, then pass on the serialized blob // If not embed_mode, dump the blob here and only pass on the path to the blob + std::string model_blob_str; + auto compiled_model = concrete_backend_->GetOVCompiledModel(); if (session_context_.ep_context_embed_mode) { + // Internal blob std::ostringstream model_blob_stream; compiled_model.export_model(model_blob_stream); model_blob_str = std::move(model_blob_stream).str(); @@ -209,23 +223,30 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie ORT_THROW("Model blob stream is empty after exporting the compiled model."); } } else { - // Remove extension so we can append suffix to form the complete name of output graph - auto blob_name = graph_name.substr(0, graph_name.find_last_of(".")); - std::ofstream blob_file(blob_name + ".blob", + // External blob + std::filesystem::path blob_filename; + // Epctx file path from SO is mapped to cache_dir variable for OVEP for readability + if (!session_context_.cache_dir.empty()) { + blob_filename = session_context_.cache_dir; + } else { + blob_filename = graph_body_viewer.ModelPath(); 
+ } + const auto name{std::format("{}_{}", graph_body_viewer.ModelPath().stem().string(), subgraph_context_.subgraph_name)}; + blob_filename = blob_filename.parent_path() / name; + blob_filename.replace_extension("blob"); + std::ofstream blob_file(blob_filename, std::ios::out | std::ios::trunc | std::ios::binary); if (!blob_file) { ORT_THROW("Unable to open file for epctx model dump."); } compiled_model.export_model(blob_file); - model_blob_str = blob_name + ".blob"; + model_blob_str = blob_filename.string(); } - ORT_RETURN_IF_ERROR(ep_ctx_handle_.ExportEPCtxModel(graph_body_viewer, - graph_name, - logger, - session_context_.ep_context_embed_mode, - std::move(model_blob_str), - openvino_sdk_version_)); + ORT_RETURN_IF_ERROR(ep_ctx_handle_.AddOVEPCtxNodeToGraph(graph_body_viewer, + subgraph_context_.subgraph_name, + session_context_.ep_context_embed_mode, + std::move(model_blob_str))); return Status::OK(); } @@ -296,27 +317,20 @@ static bool IsQDQGraph(const onnxruntime::GraphViewer& graph_viewer) { return false; } -static void DumpOpenVINOEPModel(std::string onnx_model_path_name, +static void DumpOpenVINOEPModel(const std::filesystem::path& onnx_model_path_name, ONNX_NAMESPACE::ModelProto* model_proto, const onnxruntime::Node& fused_node) { if (openvino_ep::backend_utils::IsDebugEnabled()) { - auto model_name = onnx_model_path_name.empty() ? "unknown.onnx" : std::move(onnx_model_path_name); -#ifdef _WIN32 - size_t slash = model_name.find_last_of("\\"); -#else - size_t slash = model_name.find_last_of("/"); -#endif - model_name = model_name.substr(slash + 1, std::string::npos); - size_t dot = model_name.find_last_of("."); - model_name = model_name.substr(0, dot); + auto model_name = onnx_model_path_name.empty() ? 
"unknown.onnx" : onnx_model_path_name.filename(); - std::string subgraph_name = fused_node.Name(); + const auto& subgraph_name = fused_node.Name(); size_t dash = subgraph_name.find_last_of("-"); - subgraph_name = subgraph_name.substr(dash, std::string::npos); - - const std::string name = model_name + subgraph_name + ".onnx"; + if (dash != std::string::npos) { + auto new_name = model_name.stem().string() + subgraph_name.substr(dash, std::string::npos); + model_name.replace_filename(new_name); + } - std::fstream dump(name, std::ios::out | std::ios::trunc | std::ios::binary); + std::fstream dump(model_name, std::ios::out | std::ios::trunc | std::ios::binary); model_proto->SerializeToOstream(dump); } } @@ -341,6 +355,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, } }; + const auto& onnx_model_path_name = subgraph.ModelPath(); // QDQ stripping enabled only for the NPU if (session_context_.device_type.find("NPU") != std::string::npos && session_context_.enable_qdq_optimizer && @@ -351,7 +366,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, auto model_proto = model->ToProto(); model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); print_model_proto_duration(); - DumpOpenVINOEPModel(session_context_.onnx_model_path_name, model_proto.get(), fused_node); + DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; } else { @@ -361,7 +376,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); subgraph.ToProto(*model_proto->mutable_graph(), true, true); print_model_proto_duration(); - DumpOpenVINOEPModel(session_context_.onnx_model_path_name, model_proto.get(), fused_node); + DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); return model_proto; } } @@ -463,6 +478,7 @@ void 
BackendManager::Compute(OrtKernelContext* context) { std::shared_ptr dynamic_backend; auto search = backend_map_.find(key); if (search == backend_map_.end()) { + ptr_stream_t model_stream; LOGS_DEFAULT(INFO) << "[OpenVINO-EP] " << "Creating dynamic backend for key: " << key; LOGS_DEFAULT(INFO) << "[OpenVINO-EP] " @@ -472,7 +488,7 @@ void BackendManager::Compute(OrtKernelContext* context) { dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes, session_context_, subgraph_context_, - ep_ctx_handle_); + model_stream); } catch (const OnnxRuntimeException& ex) { // Build option disables fallback to CPU on compilation failures with NPU. #if defined(OPENVINO_DISABLE_NPU_FALLBACK) @@ -491,7 +507,7 @@ void BackendManager::Compute(OrtKernelContext* context) { dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes, session_context_, subgraph_context_, - ep_ctx_handle_); + model_stream); } catch (std::string const& msg) { ORT_THROW(msg); } diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index 7ae647188976d..f77f303c70991 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -51,9 +51,8 @@ class BackendManager { std::shared_ptr concrete_backend_; std::map> backend_map_; SubGraphContext subgraph_context_; + EPCtxHandler& ep_ctx_handle_; SessionContext session_context_; - EPCtxHandler ep_ctx_handle_{}; - std::string openvino_sdk_version_{}; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 6c28db5803cb1..84de5eb4f16f9 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -40,7 +40,9 @@ struct static_cast_int64 { }; std::shared_ptr -CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const 
SessionContext& session_context, +CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, + const SessionContext& session_context, + const SubGraphContext& subgraph_context, std::map>& const_outputs_map) { if (IsCILogEnabled()) { std::cout << "CreateNgraphFunc" << std::endl; @@ -50,7 +52,7 @@ CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const SessionContex auto ov_model = session_context.ie_core.ReadModel(model, session_context.onnx_model_path_name); // Check for Constant Folding - if ((session_context.device_type != "NPU") && !session_context.is_wholly_supported_graph) { + if ((session_context.device_type != "NPU") && !subgraph_context.is_wholly_supported_graph) { ov::pass::ConstantFolding pass_const_obj; pass_const_obj.run_on_model(ov_model); auto& results = const_cast(ov_model.get()->get_results()); diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 4a500a3f146f7..2765fe0e9b1c7 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -63,6 +63,7 @@ void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor, std::shared_ptr CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const SessionContext& session_context, + const SubGraphContext& subgraph_context, std::map>& const_outputs_map); void printPerformanceCounts(const std::vector& performanceMap, diff --git a/onnxruntime/core/providers/openvino/backends/backend_factory.cc b/onnxruntime/core/providers/openvino/backends/backend_factory.cc index 4b3e57d087381..2fd9a7fa0a537 100644 --- a/onnxruntime/core/providers/openvino/backends/backend_factory.cc +++ b/onnxruntime/core/providers/openvino/backends/backend_factory.cc @@ -14,7 +14,7 @@ std::shared_ptr BackendFactory::MakeBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, - EPCtxHandler& ep_ctx_handle) { + 
ptr_stream_t& model_stream) { std::string type = session_context.device_type; if (type == "CPU" || type.find("GPU") != std::string::npos || type.find("NPU") != std::string::npos || @@ -23,7 +23,7 @@ BackendFactory::MakeBackend(std::unique_ptr& model_p type.find("AUTO") != std::string::npos) { std::shared_ptr concrete_backend_; try { - concrete_backend_ = std::make_shared(model_proto, session_context, subgraph_context, ep_ctx_handle); + concrete_backend_ = std::make_shared(model_proto, session_context, subgraph_context, model_stream); } catch (std::string const& msg) { ORT_THROW(msg); } diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 7dbd8bd5e979b..bacf25effb0f3 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -23,12 +23,10 @@ using namespace backend_utils; BasicBackend::BasicBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, - EPCtxHandler& ep_ctx_handle) + ptr_stream_t& model_stream) : session_context_(session_context), subgraph_context_(subgraph_context) { std::string& hw_target = session_context_.device_type; - is_ep_ctx_graph_ = ep_ctx_handle.IsValidOVEPCtxGraph(); - if (ValidateSubgraph(const_outputs_map_)) return; @@ -61,13 +59,12 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr try { std::string dev_prec = session_context.device_type + "_" + session_context_.precision_str; - if (session_context.is_wholly_supported_graph) { // Full graph is supported + if (subgraph_context_.is_wholly_supported_graph) { // Full graph is supported #if defined(IO_BUFFER_ENABLED) - if (is_ep_ctx_graph_) { - std::istringstream model_stream(ep_ctx_handle.GetModelBlobString()); - exe_network_ = session_context_.ie_core.ImportModel(model_stream, - remote_context_, - subgraph_context_.subgraph_name); + if 
(subgraph_context_.is_ep_ctx_graph) { + exe_network_ = session_context_.ie_core.ImportModel(*model_stream, + remote_context_, + subgraph_context_.subgraph_name); } else if ((session_context.device_type.find("GPU") != std::string::npos) && (session_context_.context != nullptr)) { LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled"; @@ -82,28 +79,28 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); } #else // !IO_BUFFER_ENABLED - std::string prec_str = (session_context_.precision_str != "ACCURACY") ? session_context_.precision_str : session_context_.model_precision; - if (is_ep_ctx_graph_) { + std::string prec_str = (session_context_.precision_str != "ACCURACY") ? session_context_.precision_str : subgraph_context_.model_precision; + if (subgraph_context_.is_ep_ctx_graph) { // If the blob is held in an EPContext node, then skip FE+Compile // and directly move on to creating a backend with the executable blob - exe_network_ = session_context_.ie_core.ImportModel(ep_ctx_handle.GetModelBlobStream(), - hw_target, - device_config, - session_context_.ep_context_embed_mode, - subgraph_context_.subgraph_name); + exe_network_ = session_context_.ie_core.ImportModel(*model_stream, + hw_target, + device_config, + subgraph_context_.subgraph_name); + model_stream.reset(); // Delete stream after it is no longer needed } else if (session_context_.export_ep_ctx_blob && hw_target.find("NPU") != std::string::npos && - !session_context_.has_external_weights) { + !subgraph_context_.has_external_weights) { std::shared_ptr ov_model; { const std::string model = model_proto->SerializeAsString(); - if (!subgraph_context.has_dynamic_input_shape) { + if (!subgraph_context_.has_dynamic_input_shape) { delete model_proto.release(); } ov_model = session_context_.ie_core.Get().read_model(model, ov::Tensor()); } exe_network_ = OVExeNetwork(session_context_.ie_core.Get().compile_model(ov_model, hw_target, 
device_config)); - } else if (!session_context_.has_external_weights && + } else if (!subgraph_context_.has_external_weights && (!subgraph_context_.has_dynamic_input_shape) && ((hw_target.find("AUTO") == std::string::npos) || (session_context_.OpenVINO_Version.at(0) >= 2024 && session_context_.OpenVINO_Version.at(1) > 2))) { @@ -111,17 +108,17 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr // Inputs with static dimenstions const std::string model = model_proto->SerializeAsString(); exe_network_ = session_context_.ie_core.CompileModel(model, - hw_target, - device_config, - subgraph_context_.subgraph_name); + hw_target, + device_config, + subgraph_context_.subgraph_name); } else { // For all other types use ov::Model Type - auto ov_model = CreateOVModel(*model_proto, session_context_, const_outputs_map_); + auto ov_model = CreateOVModel(*model_proto, session_context_, subgraph_context_, const_outputs_map_); exe_network_ = session_context_.ie_core.CompileModel( ov_model, hw_target, device_config, subgraph_context_.subgraph_name); } #endif } else { // Full graph is not supported - auto ov_model = CreateOVModel(*model_proto, session_context_, const_outputs_map_); + auto ov_model = CreateOVModel(*model_proto, session_context_, subgraph_context_, const_outputs_map_); exe_network_ = session_context_.ie_core.CompileModel( ov_model, hw_target, device_config, subgraph_context_.subgraph_name); } @@ -159,8 +156,8 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { device_config.emplace(ov::hint::inference_precision(ov::element::undefined)); device_config.emplace(ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY)); } else { - if (session_context_.model_precision != "") - device_config.emplace(ov::hint::inference_precision(session_context_.model_precision)); + if (!subgraph_context_.model_precision.empty()) + device_config.emplace(ov::hint::inference_precision(subgraph_context_.model_precision)); } } #ifndef NDEBUG @@ -281,7 +278,7 @@ void 
BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { } else { if (target_config.count(session_context_.device_type)) { auto supported_properties = session_context_.ie_core.Get().get_property(session_context_.device_type, - ov::supported_properties); + ov::supported_properties); set_target_properties(session_context_.device_type, target_config.at(session_context_.device_type), supported_properties); } @@ -291,7 +288,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { void BasicBackend::EnableCaching(ov::AnyMap& device_config) { // cache_dir argument has no effect when working with an embed-mode EPContext Graph - if (is_ep_ctx_graph_) return; + if (subgraph_context_.is_ep_ctx_graph) return; if (!session_context_.cache_dir.empty() && !session_context_.export_ep_ctx_blob) { LOGS_DEFAULT(INFO) << log_tag << "Enables Caching"; @@ -300,7 +297,7 @@ void BasicBackend::EnableCaching(ov::AnyMap& device_config) { device_property = std::make_pair("CACHE_DIR", session_context_.cache_dir); device_config.emplace(ov::device::properties("GPU", device_property)); } else { - session_context_.ie_core.SetCache(session_context_.cache_dir); + session_context_.ie_core.SetCache(session_context_.cache_dir.string()); } } } diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 0aab336ce909f..177784a71f575 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -32,7 +32,7 @@ class BasicBackend : public IBackend { BasicBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, - EPCtxHandler& ep_ctx_handle); + ptr_stream_t& model_stream); void Infer(OrtKernelContext* context) override; ov::CompiledModel& GetOVCompiledModel() override { @@ -61,7 +61,6 @@ class BasicBackend : public IBackend { OVExeNetwork exe_network_; std::map> 
const_outputs_map_; std::unique_ptr inferRequestsQueue_; - bool is_ep_ctx_graph_{false}; #if defined IO_BUFFER_ENABLED OVRemoteContextPtr remote_context_; #endif diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 23256b8df6fd0..e9405b5ac5142 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "core/providers/openvino/ov_interface.h" namespace onnxruntime { @@ -15,19 +16,16 @@ namespace openvino_ep { // Holds context applicable to the entire EP instance. struct SessionContext { OVCore ie_core; - bool is_wholly_supported_graph = false; bool enable_opencl_throttling = false; bool disable_dynamic_shapes = false; bool ep_context_embed_mode = false; bool export_ep_ctx_blob = false; bool enable_qdq_optimizer = false; bool disable_cpu_fallback = false; - bool has_external_weights = false; size_t num_of_threads; std::string device_type; std::string precision_str; - std::string model_precision; - std::string cache_dir; + std::filesystem::path cache_dir; std::map load_config; std::string model_priority = "DEFAULT"; int num_streams; @@ -38,6 +36,7 @@ struct SessionContext { void* context = 0; bool use_api_2; std::vector OpenVINO_Version = {}; // Ov Major and OV minor version from OV headers + std::string openvino_sdk_version; }; // Holds context specific to subgraph. 
@@ -51,6 +50,10 @@ struct SubGraphContext { std::vector input_indexes; std::unordered_map input_names; std::unordered_map output_names; + bool is_wholly_supported_graph = false; + bool has_external_weights = false; + std::string model_precision; + bool is_ep_ctx_graph = false; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 6d4aad3aec919..0d440eee598d3 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -4,6 +4,7 @@ #pragma once #include +#include #define ORT_API_MANUAL_INIT #include "core/session/onnxruntime_cxx_api.h" #include "core/providers/openvino/onnx_ctx_model_helper.h" @@ -16,14 +17,14 @@ class IBackend { virtual void Infer(OrtKernelContext* context) = 0; virtual ov::CompiledModel& GetOVCompiledModel() = 0; }; - +using ptr_stream_t = std::unique_ptr; class BackendFactory { public: static std::shared_ptr MakeBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, - EPCtxHandler& ctx_handle); + ptr_stream_t& model_stream); }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc index 6d159db3b390d..907650257c3f2 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc @@ -11,25 +11,45 @@ namespace onnxruntime { namespace openvino_ep { +EPCtxHandler::EPCtxHandler(std::string ov_sdk_version, const logging::Logger& logger) : openvino_sdk_version_(ov_sdk_version), logger_(logger) { + epctx_model_ = Model::Create("ovep_context_model", false, logger_); +} + /* Export the serialized blob string embedded onto an EPContext Node * along with other metadata necessary to validate the graph on import */ -Status EPCtxHandler::ExportEPCtxModel(const GraphViewer& 
graph_viewer, - const std::string& graph_name, - const logging::Logger& logger, - const bool& ep_context_embed_mode, - std::string&& model_blob_str, - const std::string& openvino_sdk_version) const { - auto& metadata = graph_viewer.GetGraph().GetModel().MetaData(); - auto model_build = graph_viewer.CreateModel(logger, metadata); - auto& graph_build = model_build->MainGraph(); +Status EPCtxHandler::ExportEPCtxModel(const std::string& model_name) { + // Serialize modelproto to string + auto model_proto = epctx_model_->ToProto(); + model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + + // Finally, dump the model + std::ofstream epctx_onnx_model(model_name, + std::ios::out | std::ios::trunc | std::ios::binary); + if (!epctx_onnx_model) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unable to create epctx onnx model file"); + } + + if (!model_proto->SerializeToOstream(epctx_onnx_model)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to serialize model to file"); + } + LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Export blob as EPContext Node"; + + return Status::OK(); +} + +Status EPCtxHandler::AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, + const std::string& graph_name, + const bool ep_context_embed_mode, + std::string&& model_blob_str) const { + auto& graph = epctx_model_->MainGraph(); // Get graph inputs and outputs const auto& viewer_inputs = graph_viewer.GetInputs(); const auto& viewer_outputs = graph_viewer.GetOutputs(); std::vector inputs(viewer_inputs.size()), outputs(viewer_outputs.size()); - auto transform_f = [&](const onnxruntime::NodeArg* iter) { return &graph_build.GetOrCreateNodeArg(iter->Name(), iter->TypeAsProto()); }; + auto transform_f = [&](const onnxruntime::NodeArg* iter) { return &graph.GetOrCreateNodeArg(iter->Name(), iter->TypeAsProto()); }; auto fill_vectors = [transform_f](auto& src, auto& dst) { std::transform(src.begin(), src.end(), dst.begin(), transform_f); }; @@ -60,7 +80,7 @@ Status 
EPCtxHandler::ExportEPCtxModel(const GraphViewer& graph_viewer, auto sdk_version_attr = ONNX_NAMESPACE::AttributeProto::Create(); sdk_version_attr->set_name(EP_SDK_VER); sdk_version_attr->set_type(onnx::AttributeProto_AttributeType_STRING); - sdk_version_attr->set_s(openvino_sdk_version); + sdk_version_attr->set_s(openvino_sdk_version_); node_attributes->emplace(EP_SDK_VER, std::move(*sdk_version_attr)); // source @@ -70,73 +90,65 @@ Status EPCtxHandler::ExportEPCtxModel(const GraphViewer& graph_viewer, source_attr->set_s(kOpenVINOExecutionProvider); node_attributes->emplace(SOURCE, std::move(*source_attr)); } + // Create EP context node - graph_build.AddNode(graph_name, EPCONTEXT_OP, "", inputs, outputs, std::move(*node_attributes), kMSDomain); - ORT_ENFORCE(graph_build.Resolve().IsOK()); + graph.AddNode(graph_name, EPCONTEXT_OP, "", inputs, outputs, std::move(*node_attributes), kMSDomain); - { - // Serialize modelproto to string - auto model_proto = model_build->ToProto(); - model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); - - // Finally, dump the model - std::ofstream epctx_onnx_model(graph_name, - std::ios::out | std::ios::trunc | std::ios::binary); - if (!epctx_onnx_model) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unable to create epctx onnx model file"); - } - - if (!model_proto->SerializeToOstream(epctx_onnx_model)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to serialize model to file"); - } - } - LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Export blob as EPContext Node"; + ORT_ENFORCE(graph.Resolve().IsOK()); return Status::OK(); } -Status EPCtxHandler::ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer, bool& ep_context_embed_mode) { - auto node = graph_viewer.GetNode(0); +std::unique_ptr EPCtxHandler::GetModelBlobStream(const GraphViewer& graph_viewer) const { + auto first_index = *graph_viewer.GetNodesInTopologicalOrder().begin(); + auto node = graph_viewer.GetNode(first_index); + ORT_ENFORCE(node != nullptr); auto& 
attrs = node->GetAttributes(); - ORT_ENFORCE(attrs.count(EP_CACHE_CONTEXT) > 0); - ep_cache_context_attribute_ = &attrs.at(EP_CACHE_CONTEXT); + ORT_ENFORCE(attrs.count(EP_CACHE_CONTEXT) == 1); + const auto& ep_cache_context_attribute = attrs.at(EP_CACHE_CONTEXT); + const auto& cache_context = ep_cache_context_attribute.s(); - ep_context_embed_mode = static_cast(attrs.at(EMBED_MODE).i()); - LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; - - is_valid_ep_ctx_graph_ = true; - return Status::OK(); -} + ORT_ENFORCE(attrs.count(EMBED_MODE) == 1); + bool ep_context_embed_mode = static_cast(attrs.at(EMBED_MODE).i()); -const std::string& EPCtxHandler::GetModelBlobStream() const { - static std::string empty; - if (ep_cache_context_attribute_ != nullptr) { - return ep_cache_context_attribute_->s(); + std::unique_ptr result; + if (ep_context_embed_mode) { + result.reset((std::istream*)new std::istringstream(cache_context)); } else { - return empty; + result.reset((std::istream*)new std::ifstream(cache_context, std::ios_base::binary | std::ios_base::in)); } + LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; + return result; } -bool EPCtxHandler::CheckForOVEPCtxNode(const GraphViewer& graph_viewer, std::string openvino_sdk_version) const { - for (int i = 0; i < graph_viewer.MaxNodeIndex(); ++i) { - auto node = graph_viewer.GetNode(i); - auto& attrs = node->GetAttributes(); - - // Check for correct Op Type, EP SOURCE, and SDK version - if (node != nullptr && node->OpType() == EPCONTEXT_OP) { - if (attrs.at(SOURCE).s() == kOpenVINOExecutionProvider) { - if (attrs.at(EP_SDK_VER).s() == openvino_sdk_version) { - return true; - } else { - ORT_THROW("[Invalid Graph] Versions of OpenVINO used to export blob (" + attrs.at(EP_SDK_VER).s() + - ") and current runtime (" + openvino_sdk_version + ") don't match."); - } - } +bool EPCtxHandler::CheckForOVEPCtxNodeInGraph(const GraphViewer& graph_viewer) const { + if (graph_viewer.NumberOfNodes() 
== 1) { + auto first_index = *graph_viewer.GetNodesInTopologicalOrder().begin(); + if (auto node = graph_viewer.GetNode(first_index); (node != nullptr) && CheckForOVEPCtxNode(*node)) { + return true; } } return false; } +bool EPCtxHandler::CheckForOVEPCtxNode(const Node& node) const { + // Check for correct Op Type, EP SOURCE, and SDK version + if (node.OpType() == EPCONTEXT_OP) { + auto& attrs = node.GetAttributes(); + bool result = (attrs.count(SOURCE) == 1) && (attrs.at(SOURCE).s() == kOpenVINOExecutionProvider); + result &= (attrs.count(EP_SDK_VER) == 1) && (attrs.at(EP_SDK_VER).s() == openvino_sdk_version_); + result &= attrs.count(EMBED_MODE) == 1; + result &= attrs.count(EP_CACHE_CONTEXT) == 1; + return result; + } + return false; +} + +InlinedVector EPCtxHandler::GetEPCtxNodes() const { + const auto& epctx_nodes{epctx_model_->MainGraph().Nodes()}; + return InlinedVector(epctx_nodes.begin(), epctx_nodes.end()); +} + } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h index caab33b7db775..7e5d5180b363b 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h @@ -22,22 +22,22 @@ static const char SOURCE[] = "source"; class EPCtxHandler { public: - EPCtxHandler() = default; - EPCtxHandler(const EPCtxHandler&) = delete; - Status ExportEPCtxModel(const GraphViewer& graph_viewer, - const std::string& graph_name, - const logging::Logger& logger, - const bool& ep_context_embed_mode, - std::string&& model_blob_str, - const std::string& openvino_sdk_version) const; - Status ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer, bool& ep_context_embed_mode); - bool CheckForOVEPCtxNode(const GraphViewer& graph_viewer, std::string openvino_sdk_version) const; - bool IsValidOVEPCtxGraph() const { return is_valid_ep_ctx_graph_; } - const std::string& 
GetModelBlobStream() const; + EPCtxHandler(std::string ov_sdk_version, const logging::Logger& logger); + EPCtxHandler(const EPCtxHandler&) = delete; // No copy constructor + Status ExportEPCtxModel(const std::string& model_name); + bool CheckForOVEPCtxNodeInGraph(const GraphViewer& graph_viewer) const; + bool CheckForOVEPCtxNode(const Node& node) const; + Status AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, + const std::string& graph_name, + const bool ep_context_embed_mode, + std::string&& model_blob_str) const; + std::unique_ptr GetModelBlobStream(const GraphViewer& graph_viewer) const; + InlinedVector GetEPCtxNodes() const; private: - bool is_valid_ep_ctx_graph_{false}; - const onnx::AttributeProto* ep_cache_context_attribute_; + const std::string openvino_sdk_version_; + std::unique_ptr epctx_model_; + const logging::Logger& logger_; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index e5ffde62eeedb..0da0813e9143e 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -5,6 +5,7 @@ #include #include #include +#include #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/openvino_execution_provider.h" #include "core/providers/openvino/contexts.h" @@ -20,33 +21,39 @@ #define MEMCPY_S(dest, src, destsz, srcsz) memcpy(dest, src, std::min(destsz, srcsz)) namespace onnxruntime { +openvino_ep::SessionContext GetSessionContext(const OpenVINOExecutionProviderInfo& info) { + openvino_ep::SessionContext result = { + .enable_opencl_throttling = info.enable_opencl_throttling_, + .disable_dynamic_shapes = info.disable_dynamic_shapes_, + .ep_context_embed_mode = info.so_epctx_embed_mode_, + .export_ep_ctx_blob = info.export_ep_ctx_blob_, + .enable_qdq_optimizer = info.enable_qdq_optimizer_, + 
.disable_cpu_fallback = info.disable_cpu_fallback_, + .num_of_threads = info.num_of_threads_, + .device_type = info.device_type_, + .precision_str = info.precision_, + .cache_dir = info.cache_dir_, + .load_config = info.load_config_, + .model_priority = info.model_priority_, + .num_streams = info.num_streams_, + .context = info.context_, + .OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}, + .openvino_sdk_version = std::format("{}.{}", OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR), + }; + return result; +} OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProviderInfo& info) - : IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider} { + : IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider}, + session_context_{GetSessionContext(info)}, + ep_ctx_handle_{session_context_.openvino_sdk_version, *GetLogger()} { InitProviderOrtApi(); - session_context_ = std::make_unique(); - session_context_->device_type = info.device_type_; - session_context_->precision_str = info.precision_; - session_context_->cache_dir = info.cache_dir_; - session_context_->load_config = info.load_config_; - session_context_->model_priority = info.model_priority_; - session_context_->num_streams = info.num_streams_; - session_context_->context = info.context_; - session_context_->enable_opencl_throttling = info.enable_opencl_throttling_; - session_context_->disable_dynamic_shapes = info.disable_dynamic_shapes_; - session_context_->num_of_threads = info.num_of_threads_; - session_context_->OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; - session_context_->export_ep_ctx_blob = info.export_ep_ctx_blob_; - session_context_->enable_qdq_optimizer = info.enable_qdq_optimizer_; - session_context_->disable_cpu_fallback = info.disable_cpu_fallback_; - session_context_->ep_context_embed_mode = info.so_epctx_embed_mode_; - // to check if target device is available // using ie_core capability GetAvailableDevices to fetch list of 
devices plugged in if (info.cache_dir_.empty()) { bool device_found = false; - std::vector available_devices = session_context_->ie_core.GetAvailableDevices(); + std::vector available_devices = session_context_.ie_core.GetAvailableDevices(); // Checking for device_type configuration if (info.device_type_ != "") { if (info.device_type_.find("HETERO") != std::string::npos || @@ -85,89 +92,62 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/) const { std::vector> result; - std::string openvino_sdk_version = std::to_string(session_context_->OpenVINO_Version.at(0)) + "." + - std::to_string(session_context_->OpenVINO_Version.at(1)); - - // Check for valid ctx node and maintain state for validity - if (ep_ctx_handle_.CheckForOVEPCtxNode(graph_viewer, std::move(openvino_sdk_version))) - ORT_ENFORCE(graph_viewer.NumberOfNodes() == 1, - "[Invalid Graph] EPContext Model with OpenVINO compiled blob should not have more than one node."); - // Enable CI Logs if (!(GetEnvironmentVar("ORT_OPENVINO_ENABLE_CI_LOG").empty())) { std::cout << "In the OpenVINO EP" << std::endl; } - session_context_->onnx_model_path_name = graph_viewer.ModelPath().string(); - - session_context_->onnx_opset_version = - graph_viewer.DomainToVersionMap().at(kOnnxDomain); - - session_context_->model_precision = [&](const GraphViewer& graph_viewer) { - // return empty if graph has no inputs or if types are not one of FP32/FP16 - // else assume the type of the first input - if (graph_viewer.GetInputs().empty()) { - return ""; - } else { - auto input_type = graph_viewer.GetInputs()[0]->TypeAsProto()->tensor_type().elem_type(); - if (session_context_->precision_str == "ACCURACY" && - session_context_->device_type.find("GPU") != std::string::npos) { - if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) { - return "FP32"; - } else if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16) { 
- return "FP16"; - } - } - } - return ""; - }(graph_viewer); - openvino_ep::GetCapability obj(graph_viewer, - session_context_->device_type, - session_context_->enable_qdq_optimizer); + openvino_ep::GetCapability obj(ep_ctx_handle_, + graph_viewer, + session_context_.device_type, + session_context_.enable_qdq_optimizer); result = obj.Execute(); - session_context_->is_wholly_supported_graph = obj.IsWhollySupportedGraph(); - session_context_->has_external_weights = obj.HasExternalWeights(); - return result; } common::Status OpenVINOExecutionProvider::Compile( const std::vector& fused_nodes, std::vector& node_compute_funcs) { + auto& logger = *GetLogger(); + Status status = Status::OK(); + + // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext + session_context_.onnx_model_path_name = fused_nodes[0].filtered_graph.get().ModelPath().string(); + session_context_.onnx_opset_version = + fused_nodes[0].filtered_graph.get().DomainToVersionMap().at(kOnnxDomain); + for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) { const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; const Node& fused_node = fused_node_graph.fused_node; NodeComputeInfo compute_info; - session_context_->use_api_2 = true; + session_context_.use_api_2 = true; // During backend creation, we check if user wants to use precompiled blob onnx model or the original model // For precompiled blob, directly load the model instead of compiling the model // For original model, check if the user wants to export a model with pre-compiled blob - std::shared_ptr backend_manager = - std::make_shared(*session_context_, - fused_node, - graph_body_viewer, - *GetLogger(), - ep_ctx_handle_); - backend_manager_ = backend_manager; + auto& backend_manager = backend_managers_.emplace_back(session_context_, + fused_node, + graph_body_viewer, + logger, + ep_ctx_handle_); + compute_info.create_state_func = - [backend_manager](ComputeContext* context, 
FunctionState* state) { - OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState(); + [&backend_manager](ComputeContext* context, FunctionState* state) { + OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState(backend_manager); p->allocate_func = context->allocate_func; p->destroy_func = context->release_func; p->allocator_handle = context->allocator_handle; - p->backend_manager = backend_manager; *state = static_cast(p); return 0; }; compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) { auto function_state = static_cast(state); try { - function_state->backend_manager->Compute(context); + function_state->backend_manager.Compute(context); } catch (const std::exception& ex) { return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what()); } @@ -182,18 +162,22 @@ common::Status OpenVINOExecutionProvider::Compile( } }; node_compute_funcs.push_back(compute_info); + + if (!status.IsOK()) { + break; + } } - return Status::OK(); + return status; } #ifdef USE_OVEP_NPU_MEMORY std::vector OpenVINOExecutionProvider::CreatePreferredAllocators() { - if (session_context_->device_type.find("NPU") != std::string::npos) { + if (session_context_.device_type.find("NPU") != std::string::npos) { AllocatorCreationInfo npu_allocator_info{ [this](OrtDevice::DeviceId device_id) { return std::make_unique( - session_context_->ie_core.Get(), + session_context_.ie_core.Get(), OrtDevice::NPU, device_id, OpenVINO_RT_NPU); @@ -232,8 +216,10 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::spanGetOVCompiledModel(); - ov_compiled_model.set_property(ov::workload_type(workload_type)); + for (auto& backend : backend_managers_) { + ov::CompiledModel& ov_compiled_model = backend.GetOVCompiledModel(); + ov_compiled_model.set_property(ov::workload_type(workload_type)); + } } } else { // Handle unknown options @@ -242,4 +228,9 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span 
OpenVINOExecutionProvider::GetEpContextNodes() const { + return ep_ctx_handle_.GetEPCtxNodes(); +} + } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 26a67ba04756b..9b48d9e5ce3a3 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -169,10 +169,11 @@ struct OpenVINOExecutionProviderInfo { }; struct OpenVINOEPFunctionState { + OpenVINOEPFunctionState(openvino_ep::BackendManager& bm) : backend_manager(bm) {} AllocateFunc allocate_func = nullptr; DestroyFunc destroy_func = nullptr; AllocatorHandle allocator_handle = nullptr; - std::shared_ptr backend_manager; + openvino_ep::BackendManager& backend_manager; }; // Logical device representation. @@ -194,13 +195,17 @@ class OpenVINOExecutionProvider : public IExecutionProvider { const void* GetExecutionHandle() const noexcept override { return nullptr; } + + const InlinedVector GetEpContextNodes() const override; + #ifdef USE_OVEP_NPU_MEMORY std::vector CreatePreferredAllocators() override; #endif private: - std::unique_ptr session_context_; - std::shared_ptr backend_manager_; - openvino_ep::EPCtxHandler ep_ctx_handle_{}; + openvino_ep::SessionContext session_context_; + std::list backend_managers_; // EP session owns the backend objects + + openvino_ep::EPCtxHandler ep_ctx_handle_; }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 12ab7ecede031..804db5b726fc5 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -109,22 +109,13 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, } } -OVExeNetwork OVCore::ImportModel(const std::string& model_string, +OVExeNetwork OVCore::ImportModel(std::istream& 
model_stream, std::string hw_target, const ov::AnyMap& device_config, - bool embed_mode, std::string name) { try { ov::CompiledModel obj; - if (embed_mode) { - std::istringstream model_stream(model_string); - obj = oe.import_model(model_stream, hw_target, device_config); - } else { - std::ifstream modelStream(model_string, std::ios_base::binary | std::ios_base::in); - obj = oe.import_model(modelStream, - hw_target, - {}); - } + obj = oe.import_model(model_stream, hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index c3417003f8e1f..550c7962cca13 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -54,10 +54,9 @@ class OVCore { ov::AnyMap& device_config, const std::string& name); // OV Interface for Import model Stream - OVExeNetwork ImportModel(const std::string& model_string, + OVExeNetwork ImportModel(std::istream& model_stream, std::string hw_target, const ov::AnyMap& device_config, - bool embed_mode, std::string name); #ifdef IO_BUFFER_ENABLED OVExeNetwork CompileModel(std::shared_ptr& model, diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index 9e62076ca8777..b9f01cc261f52 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License #include #include +#include #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/backend_utils.h" @@ -26,10 +27,12 @@ namespace onnxruntime { namespace openvino_ep { // Constructor -GetCapability::GetCapability(const GraphViewer& graph_viewer_param, +GetCapability::GetCapability(const EPCtxHandler& ep_ctx_handler, + const GraphViewer& graph_viewer_param, const 
std::string device_type_param, - const bool enable_qdq_optimizer) - : graph_viewer_(graph_viewer_param), device_type_(device_type_param) { + const bool enable_qdq_optimizer) : ep_ctx_handler_(ep_ctx_handler), + graph_viewer_(graph_viewer_param), + device_type_(device_type_param) { bool npu_qdq_optimizer_enabled = false; if (device_type_.find("NPU") != std::string::npos) { device_type_ = "CPU"; @@ -56,6 +59,42 @@ std::vector> GetCapability::Execute() { return result; } + auto Iterable2String = [](U& strings, const V& node_args) { + constexpr bool has_name = requires(V v) { + (*v.begin())->Name(); + }; + for (const auto& arg : node_args) { + if constexpr (has_name) { + strings.push_back(arg->Name()); + } else { + strings.push_back(arg); + } + } + }; + + // Check for EpContext nodes + const auto& nodes = graph_viewer_.GetNodesInTopologicalOrder(); + for (const auto node_index : nodes) { + const auto& node = *graph_viewer_.GetNode(node_index); + if (ep_ctx_handler_.CheckForOVEPCtxNode(node)) { + std::vector inputs; + std::vector outputs; + + Iterable2String(inputs, node.InputDefs()); + Iterable2String(outputs, node.OutputDefs()); + + auto sub_graph = IndexedSubGraph::Create(); + sub_graph->Nodes().push_back(node_index); + auto meta_def = IndexedSubGraph_MetaDef::Create(); + meta_def->name() = node.Name(); + meta_def->domain() = kMSDomain; + meta_def->inputs() = inputs; + meta_def->outputs() = outputs; + sub_graph->SetMetaDef(std::move(meta_def)); + result.push_back(ComputeCapability::Create(std::move(sub_graph))); + } + } + // This is a list of initializers that nGraph considers as constants. Example weights, reshape shape etc. 
std::unordered_set ng_required_initializers; @@ -75,8 +114,7 @@ std::vector> GetCapability::Execute() { std::vector inputs; std::vector outputs; // Fill inputs with names - std::for_each(graph_viewer_.GetInputs().begin(), graph_viewer_.GetInputs().end(), - [&inputs](const NodeArg* node_arg) { inputs.push_back(node_arg->Name()); }); + Iterable2String(inputs, graph_viewer_.GetInputs()); /* In scenarios, when there are no inputs or all inputs being initializers, ConstantFolding optimization in onnxruntime pre-computes the value.*/ @@ -84,8 +122,6 @@ std::vector> GetCapability::Execute() { return result; } - const std::vector& nodes = graph_viewer_.GetNodesInTopologicalOrder(); - const Node* node = graph_viewer_.GetNode(nodes[0]); // Handle cases where lone, reoccuring Ops in smaller models cannot be supported in OpenVINO @@ -105,12 +141,10 @@ std::vector> GetCapability::Execute() { } // Initializers need to be part of meta_def->inputs - std::for_each(ng_required_initializers.begin(), ng_required_initializers.end(), - [&inputs](const std::string& initializer) { inputs.push_back(initializer); }); + Iterable2String(inputs, ng_required_initializers); // Fill outputs with names - std::for_each(graph_viewer_.GetOutputs().begin(), graph_viewer_.GetOutputs().end(), - [&outputs](const NodeArg* node_arg) { outputs.push_back(node_arg->Name()); }); + Iterable2String(outputs, graph_viewer_.GetOutputs()); // Create and add this graph to result. 
AppendClusterToSubGraph(graph_viewer_.GetNodesInTopologicalOrder(), inputs, outputs, result); diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.h b/onnxruntime/core/providers/openvino/ov_versions/capability.h index 2f87c4c73d892..364e79a76f154 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.h +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.h @@ -6,12 +6,14 @@ #include #include #include "core/providers/openvino/ov_versions/data_ops.h" +#include "core/providers/openvino/onnx_ctx_model_helper.h" namespace onnxruntime { namespace openvino_ep { class GetCapability { private: + const EPCtxHandler& ep_ctx_handler_; const GraphViewer& graph_viewer_; std::string device_type_; DataOps* data_ops_; @@ -19,7 +21,8 @@ class GetCapability { bool has_external_weights_ = false; public: - GetCapability(const GraphViewer& graph_viewer_param, + GetCapability(const EPCtxHandler& ep_ctx_handler, + const GraphViewer& graph_viewer_param, const std::string device_type_param, const bool enable_qdq_optimizer); virtual std::vector> Execute(); diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index e434935343663..4feedd75f8004 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -991,7 +991,8 @@ struct Model final { const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) { return g_host->Model__construct(std::move(model_proto), model_path, local_registries, logger); } - static std::unique_ptr Create(const std::string& graph_name, bool is_onnx_domain_only, const logging::Logger& logger) { + static std::unique_ptr Create(const std::string& graph_name, bool is_onnx_domain_only, + const logging::Logger& logger) { return g_host->Model__construct(graph_name, is_onnx_domain_only, logger); } static void operator 
delete(void* p) { g_host->Model__operator_delete(reinterpret_cast(p)); } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index d7c6dab72fde8..2644c8f6ffb36 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1179,7 +1179,8 @@ struct ProviderHostImpl : ProviderHost { const logging::Logger& logger) override { return std::make_unique(model_proto, model_path, local_registries, logger); } - std::unique_ptr Model__construct(const std::string& graph_name, bool is_onnx_domain_only, + std::unique_ptr Model__construct(const std::string& graph_name, + bool is_onnx_domain_only, const logging::Logger& logger) override { return std::make_unique(graph_name, is_onnx_domain_only, logger); } From 37cee3f04a48ce2589ed8828bd81001dea708fd3 Mon Sep 17 00:00:00 2001 From: saurabhkale117 Date: Thu, 9 Jan 2025 12:40:27 +0530 Subject: [PATCH 03/35] enable config option for ovep weight sharing --- onnxruntime/core/providers/openvino/contexts.h | 1 + .../core/providers/openvino/openvino_execution_provider.cc | 1 + .../core/providers/openvino/openvino_execution_provider.h | 6 ++++-- .../core/providers/openvino/openvino_provider_factory.cc | 3 ++- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index e9405b5ac5142..95954ae204047 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -19,6 +19,7 @@ struct SessionContext { bool enable_opencl_throttling = false; bool disable_dynamic_shapes = false; bool ep_context_embed_mode = false; + bool enable_ovep_weight_sharing = false; bool export_ep_ctx_blob = false; bool enable_qdq_optimizer = false; bool disable_cpu_fallback = false; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc 
b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 0da0813e9143e..43924386eeee1 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -26,6 +26,7 @@ openvino_ep::SessionContext GetSessionContext(const OpenVINOExecutionProviderInf .enable_opencl_throttling = info.enable_opencl_throttling_, .disable_dynamic_shapes = info.disable_dynamic_shapes_, .ep_context_embed_mode = info.so_epctx_embed_mode_, + .enable_ovep_weight_sharing = info.so_enable_ovep_weight_sharing_; .export_ep_ctx_blob = info.export_ep_ctx_blob_, .enable_qdq_optimizer = info.enable_qdq_optimizer_, .disable_cpu_fallback = info.disable_cpu_fallback_, diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 9b48d9e5ce3a3..b5206c196cb60 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -91,6 +91,7 @@ struct OpenVINOExecutionProviderInfo { bool enable_qdq_optimizer_{false}; bool disable_cpu_fallback_{false}; bool so_epctx_embed_mode_{false}; + bool so_enable_ovep_weight_sharing_{false}; OpenVINOExecutionProviderInfo() = delete; @@ -102,7 +103,7 @@ struct OpenVINOExecutionProviderInfo { void* context, bool enable_opencl_throttling, bool disable_dynamic_shapes, bool export_ep_ctx_blob, bool enable_qdq_optimizer, bool disable_cpu_fallback, - bool so_epctx_embed_mode) + bool so_epctx_embed_mode, , bool so_enable_ovep_weight_sharing) : precision_(std::move(precision)), num_of_threads_(num_of_threads), load_config_(std::move(load_config)), @@ -115,7 +116,8 @@ struct OpenVINOExecutionProviderInfo { export_ep_ctx_blob_(export_ep_ctx_blob), enable_qdq_optimizer_(enable_qdq_optimizer), disable_cpu_fallback_(disable_cpu_fallback), - so_epctx_embed_mode_{so_epctx_embed_mode} { + 
so_epctx_embed_mode_{so_epctx_embed_mode}, + so_enable_ovep_weight_sharing_{so_enable_ovep_weight_sharing} { std::set ov_supported_device_types = {"CPU", "GPU", "GPU.0", "GPU.1", "NPU"}; diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 5855cb594a08e..951b8223b4dbc 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -55,6 +55,7 @@ std::unique_ptr OpenVINOProviderFactory::CreateProvider() { bool so_export_ep_ctx_blob = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; bool so_epctx_embed_mode = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; std::string so_cache_path = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").c_str(); + bool so_enable_ovep_weight_sharing = config_options_.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; if (so_export_ep_ctx_blob && !so_cache_path.empty()) { cache_dir_ = std::move(so_cache_path); @@ -76,7 +77,7 @@ std::unique_ptr OpenVINOProviderFactory::CreateProvider() { OpenVINOExecutionProviderInfo info(device_type_, precision_, num_of_threads_, load_config_, cache_dir_, model_priority_, num_streams_, context_, enable_opencl_throttling_, disable_dynamic_shapes_, so_export_ep_ctx_blob, enable_qdq_optimizer_, - so_disable_cpu_fallback, so_epctx_embed_mode); + so_disable_cpu_fallback, so_epctx_embed_mode, so_enable_ovep_weight_sharing); return std::make_unique(info); } From 409cb476af405db18d76a049f534e82f3ae06236 Mon Sep 17 00:00:00 2001 From: saurabhkale117 Date: Thu, 9 Jan 2025 13:45:11 +0530 Subject: [PATCH 04/35] add config option for ovep weight sharing --- .../core/providers/openvino/openvino_execution_provider.cc | 2 +- .../core/providers/openvino/openvino_execution_provider.h | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 43924386eeee1..817d0817cbfc6 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -26,7 +26,7 @@ openvino_ep::SessionContext GetSessionContext(const OpenVINOExecutionProviderInf .enable_opencl_throttling = info.enable_opencl_throttling_, .disable_dynamic_shapes = info.disable_dynamic_shapes_, .ep_context_embed_mode = info.so_epctx_embed_mode_, - .enable_ovep_weight_sharing = info.so_enable_ovep_weight_sharing_; + .enable_ovep_weight_sharing = info.so_enable_ovep_weight_sharing_, .export_ep_ctx_blob = info.export_ep_ctx_blob_, .enable_qdq_optimizer = info.enable_qdq_optimizer_, .disable_cpu_fallback = info.disable_cpu_fallback_, diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index b5206c196cb60..1b3990310fc61 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -103,7 +103,7 @@ struct OpenVINOExecutionProviderInfo { void* context, bool enable_opencl_throttling, bool disable_dynamic_shapes, bool export_ep_ctx_blob, bool enable_qdq_optimizer, bool disable_cpu_fallback, - bool so_epctx_embed_mode, , bool so_enable_ovep_weight_sharing) + bool so_epctx_embed_mode, bool so_enable_ovep_weight_sharing) : precision_(std::move(precision)), num_of_threads_(num_of_threads), load_config_(std::move(load_config)), From 3b2b7e9a89b02ee868068867c12d89a887777c46 Mon Sep 17 00:00:00 2001 From: Preetha Veeramalai Date: Wed, 8 Jan 2025 20:52:19 -0800 Subject: [PATCH 05/35] Refactor the conditional blocks in OVEP for compilation --- cmake/onnxruntime_providers_openvino.cmake | 2 +- 
.../core/providers/openvino/backend_utils.cc | 3 +- .../core/providers/openvino/backend_utils.h | 2 +- .../openvino/backends/basic_backend.cc | 89 +++++++++---------- .../openvino/openvino_provider_factory.cc | 8 ++ 5 files changed, 54 insertions(+), 50 deletions(-) diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake index 01a0d3ce3badc..ff2d4b388e82e 100644 --- a/cmake/onnxruntime_providers_openvino.cmake +++ b/cmake/onnxruntime_providers_openvino.cmake @@ -30,7 +30,7 @@ endif() list(APPEND OPENVINO_LIB_LIST openvino::frontend::onnx openvino::runtime ${PYTHON_LIBRARIES}) - if ((DEFINED ENV{OPENCL_LIBS}) AND (DEFINED ENV{OPENCL_INCS})) + if ((DEFINED ENV{OPENCL_LIBS}) AND (DEFINED ENV{OPENCL_INCS}) AND onnxruntime_USE_OPENVINO_GPU) add_definitions(-DIO_BUFFER_ENABLED=1) list(APPEND OPENVINO_LIB_LIST $ENV{OPENCL_LIBS}) endif() diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 84de5eb4f16f9..c447a7847434a 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -40,14 +40,13 @@ struct static_cast_int64 { }; std::shared_ptr -CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, +CreateOVModel(const std::string model, const SessionContext& session_context, const SubGraphContext& subgraph_context, std::map>& const_outputs_map) { if (IsCILogEnabled()) { std::cout << "CreateNgraphFunc" << std::endl; } - const std::string model = model_proto.SerializeAsString(); try { auto ov_model = session_context.ie_core.ReadModel(model, session_context.onnx_model_path_name); diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 2765fe0e9b1c7..0d7378072cb1b 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -61,7 +61,7 @@ void 
FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor, size_t batch_slice_idx); std::shared_ptr -CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, +CreateOVModel(const std::string model, const SessionContext& session_context, const SubGraphContext& subgraph_context, std::map>& const_outputs_map); diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index bacf25effb0f3..e9882caa1372b 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -57,71 +57,68 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr } try { - std::string dev_prec = session_context.device_type + "_" + session_context_.precision_str; - - if (subgraph_context_.is_wholly_supported_graph) { // Full graph is supported + // IO_BUFFER is enabled on GPU HW. + // Pre-requisite is provider_option "context" must be set #if defined(IO_BUFFER_ENABLED) - if (subgraph_context_.is_ep_ctx_graph) { - exe_network_ = session_context_.ie_core.ImportModel(*model_stream, - remote_context_, - subgraph_context_.subgraph_name); - } else if ((session_context.device_type.find("GPU") != std::string::npos) && - (session_context_.context != nullptr)) { - LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled"; - cl_context ctx = static_cast(session_context_.context); - remote_context_ = new ov::intel_gpu::ocl::ClContext(session_context_.ie_core.Get(), ctx); - ie_cnn_network_ = CreateOVModel(model_proto, session_context_, subgraph_context_, const_outputs_map_); - exe_network_ = session_context_.ie_core.CompileModel( - ie_cnn_network_, remote_context_, subgraph_context_.subgraph_name); - } else { - ie_cnn_network_ = CreateOVModel(model_proto, session_context_, subgraph_context_, const_outputs_map_); - exe_network_ = session_context_.ie_core.CompileModel( - ie_cnn_network_, hw_target, device_config, 
subgraph_context_.subgraph_name); + cl_context ctx = static_cast(session_context_.context); + remote_context_ = new ov::intel_gpu::ocl::ClContext(session_context_.ie_core.Get(), ctx); + if (subgraph_context_.is_ep_ctx_graph) { + exe_network_ = session_context_.ie_core.ImportModel(*model_stream, + remote_context_, + subgraph_context_.subgraph_name); + model_stream.reset(); // Delete stream after it is no longer needed + } else { + std::shared_ptr ov_model; + { + const std::string model = model_proto->SerializeAsString(); + if (!subgraph_context.has_dynamic_input_shape) { + delete model_proto.release(); + } + ov_model = CreateOVModel(model, session_context_, subgraph_context_, const_outputs_map_); + } + LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled"; + exe_network_ = session_context_.ie_core.CompileModel( + ov_model, remote_context_, subgraph_context_.subgraph_name); } #else // !IO_BUFFER_ENABLED - std::string prec_str = (session_context_.precision_str != "ACCURACY") ? session_context_.precision_str : subgraph_context_.model_precision; + auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) || + (session_context_.OpenVINO_Version.at(0) >= 2024 && + session_context_.OpenVINO_Version.at(1) > 2)); if (subgraph_context_.is_ep_ctx_graph) { // If the blob is held in an EPContext node, then skip FE+Compile // and directly move on to creating a backend with the executable blob exe_network_ = session_context_.ie_core.ImportModel(*model_stream, + hw_target, + device_config, + subgraph_context_.subgraph_name); + model_stream.reset(); // Delete stream after it is no longer needed + } else if (!subgraph_context_.has_external_weights && + !subgraph_context_.has_dynamic_input_shape && + !session_context_.export_ep_ctx_blob && + auto_unified_compile){ + // Unified OV compile_model is efficient when ov model caching is enabled + // Unified OV compile_model API is supported with AUTO from version 2024.3 and above + // Inputs with static dimenstions + // 
Not enabled for models with external weights and when ep context is set. + const std::string model = model_proto->SerializeAsString(); + exe_network_ = session_context_.ie_core.CompileModel(model, hw_target, device_config, subgraph_context_.subgraph_name); - model_stream.reset(); // Delete stream after it is no longer needed - } else if (session_context_.export_ep_ctx_blob && - hw_target.find("NPU") != std::string::npos && - !subgraph_context_.has_external_weights) { - std::shared_ptr ov_model; + } else { // For all other types use ov::core read_model() to generate OV IR + // followed by ov::core compile_model() + std::shared_ptr ov_model; { const std::string model = model_proto->SerializeAsString(); - if (!subgraph_context_.has_dynamic_input_shape) { + if (!subgraph_context.has_dynamic_input_shape) { delete model_proto.release(); } - ov_model = session_context_.ie_core.Get().read_model(model, ov::Tensor()); + ov_model = CreateOVModel(model, session_context_, subgraph_context_, const_outputs_map_); } - exe_network_ = OVExeNetwork(session_context_.ie_core.Get().compile_model(ov_model, hw_target, device_config)); - } else if (!subgraph_context_.has_external_weights && - (!subgraph_context_.has_dynamic_input_shape) && - ((hw_target.find("AUTO") == std::string::npos) || - (session_context_.OpenVINO_Version.at(0) >= 2024 && session_context_.OpenVINO_Version.at(1) > 2))) { - // Optimized OV compile_model API is supported with AUTO from version 2024.3 and above - // Inputs with static dimenstions - const std::string model = model_proto->SerializeAsString(); - exe_network_ = session_context_.ie_core.CompileModel(model, - hw_target, - device_config, - subgraph_context_.subgraph_name); - } else { // For all other types use ov::Model Type - auto ov_model = CreateOVModel(*model_proto, session_context_, subgraph_context_, const_outputs_map_); exe_network_ = session_context_.ie_core.CompileModel( ov_model, hw_target, device_config, subgraph_context_.subgraph_name); } #endif - } 
else { // Full graph is not supported - auto ov_model = CreateOVModel(*model_proto, session_context_, subgraph_context_, const_outputs_map_); - exe_network_ = session_context_.ie_core.CompileModel( - ov_model, hw_target, device_config, subgraph_context_.subgraph_name); - } LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin"; } catch (const char* msg) { ORT_THROW(msg); diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 951b8223b4dbc..09ee83d3a7cc4 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -271,6 +271,14 @@ struct OpenVINO_Provider : Provider { uint64_t number = std::strtoull(str.c_str(), nullptr, 16); context = reinterpret_cast(number); } +#if defined(IO_BUFFER_ENABLED) + // a valid context must be provided to enable IO Buffer optimizations + if(context==nullptr){ + #undef IO_BUFFER_ENABLED + #define IO_BUFFER_ENABLED=0 + LOGS_DEFAULT(WARNING) << "Context is not set. 
Disabling IO Buffer optimization"; + } +#endif if (provider_options_map.find("num_of_threads") != provider_options_map.end()) { if (!std::all_of(provider_options_map.at("num_of_threads").begin(), From 3949bf51464897ade294a2359409d57a4be7fb01 Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Mon, 30 Dec 2024 16:17:58 +0530 Subject: [PATCH 06/35] Convert initializers with external data to graph inputs --- .../qdq_transformations/qdq_stripping.cc | 109 +++++++++++++++--- 1 file changed, 91 insertions(+), 18 deletions(-) diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 387aaf9985b4c..894f418b93482 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -625,6 +625,50 @@ static void AddQDQNodeUnit(onnxruntime::Graph& dst_graph, KeepInitsInDstGraph(initializers_to_keep, src_graph, &target_node); } +// Keep track of inputs across multiple calls +static std::vector accumulated_inputs; +static void AddInitializerAsInput(onnxruntime::Graph& dst_graph, + const onnxruntime::GraphViewer& src_graph, + const std::string& initializer_name) { + // Get the initializer from source graph + const auto& src_initializers = src_graph.GetAllInitializedTensors(); + auto init_iter = src_initializers.find(initializer_name); + + if (init_iter == src_initializers.end()) { + // Initializer not found + return; + } + + const ONNX_NAMESPACE::TensorProto* tensor_proto = init_iter->second; + + // Create TypeProto for the initializer + std::unique_ptr type_proto = ONNX_NAMESPACE::TypeProto::Create(); + auto* tensor_type = type_proto->mutable_tensor_type(); + tensor_type->set_elem_type(tensor_proto->data_type()); + + for (int i = 0; i < tensor_proto->dims_size(); ++i) { + tensor_type->mutable_shape()->add_dim()->set_dim_value(tensor_proto->dims().Get(i)); + } + + // Create 
NodeArg for the initializer + auto& input_arg = dst_graph.GetOrCreateNodeArg(initializer_name, type_proto.get()); + + // Check if input already exists in accumulated inputs + bool input_exists = false; + for (const auto* existing_input : accumulated_inputs) { + if (existing_input->Name() == initializer_name) { + input_exists = true; + break; + } + } + + if (!input_exists) { + // Add to accumulated inputs + accumulated_inputs.push_back(&input_arg); + } +} + + // Creates a new model without the DQ/Q operators in the src graph. Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, @@ -665,7 +709,8 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, dst_graph_outputs.push_back(&ep_graph_output_arg); } - dst_graph.SetInputs(dst_graph_inputs); + // Will set inputs after deciding fate oif all internal and external initializers + // dst_graph.SetInputs(dst_graph_inputs); dst_graph.SetOutputs(dst_graph_outputs); // TODO(sspintel): add Graph::SetName() provider api @@ -723,9 +768,8 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, seen_node_units.insert(node_unit); } - // - // Copy initializers to dst graph. - // + + // Copy initializers to dst graph. 
std::unordered_set current_scope_initializer_set; @@ -739,25 +783,54 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, std::sort(const_inits.begin(), const_inits.end()); for (auto& it : const_inits) { - if (initializers_to_keep.count(it)) - dst_graph.AddInitializedTensor(*(initializers.at(it))); - current_scope_initializer_set.insert(it); + const auto* initializer_tensor = initializers.at(it); + + // Check if the initializer has external data + if (initializer_tensor->has_data_location() && + initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) { + // Add initializer with external data as input + AddInitializerAsInput(dst_graph, src_graph, it); + } else { + // Add as an initialized tensor if it does not have external data + if (initializers_to_keep.count(it)) { + dst_graph.AddInitializedTensor(*initializer_tensor); + } + } + + current_scope_initializer_set.insert(it); } - // handle outer scope value which is a constant initializer + // Handle outer-scope constant initializers for (auto& node_idx : src_graph.GetNodesInTopologicalOrder()) { - const auto& node = src_graph.GetNode(node_idx); - for (const auto& input : node->InputDefs()) { - if (current_scope_initializer_set.find(input->Name()) != current_scope_initializer_set.end()) { - continue; + const auto& node = src_graph.GetNode(node_idx); + for (const auto& input : node->InputDefs()) { + if (current_scope_initializer_set.find(input->Name()) != current_scope_initializer_set.end()) { + continue; + } + + if (src_graph.IsConstantInitializer(input->Name(), true)) { + const auto* initializer_tensor = src_graph.GetConstantInitializer(input->Name(), true); + + // Check if the initializer has external data + if (initializer_tensor->has_data_location() && + initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) { + // Add initializer as input if it has external data + AddInitializerAsInput(dst_graph, src_graph, input->Name()); + 
} else { + // Add as an initialized tensor if it does not have external data + if (initializers_to_keep.count(input->Name())) { + dst_graph.AddInitializedTensor(*initializer_tensor); + } + } + + current_scope_initializer_set.insert(input->Name()); + } } - if (src_graph.IsConstantInitializer(input->Name(), true)) { - if (initializers_to_keep.count(input->Name())) - dst_graph.AddInitializedTensor(*(src_graph.GetConstantInitializer(input->Name(), true))); - current_scope_initializer_set.insert(input->Name()); - } - } } + accumulated_inputs.insert(accumulated_inputs.end(), dst_graph_inputs.begin(), dst_graph_inputs.end()); + + // Set all inputs (original inputs amnd initializers as inputs) of the destination Graph + dst_graph.SetInputs(accumulated_inputs); // Validate graph, remove unnecessary initializers, and run type/shape inference. ORT_RETURN_IF_ERROR(dst_graph.Resolve()); From 28c928a50c6c04d50745fb3c3490e8d87927c069 Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Thu, 2 Jan 2025 15:54:31 +0530 Subject: [PATCH 07/35] create, store and export metadata for ovep weight sharing --- .../qdq_transformations/qdq_stripping.cc | 112 +++++++++++++++++- 1 file changed, 107 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 894f418b93482..cca49311c7b15 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -668,6 +668,55 @@ static void AddInitializerAsInput(onnxruntime::Graph& dst_graph, } } +bool writeString(std::ofstream& outfile, const std::string& str) { + size_t size = str.size(); + outfile.write(reinterpret_cast(&size), sizeof(size)); + if (!outfile.good()) return false; + + outfile.write(str.c_str(), size); + return outfile.good(); +} + +bool writeStringVector(std::ofstream& outfile, const std::vector& vec) { + 
size_t size = vec.size(); + outfile.write(reinterpret_cast(&size), sizeof(size)); + if (!outfile.good()) return false; + + for (const auto& str : vec) { + if (!writeString(outfile, str)) { + return false; + } + } + return true; +} + +// Main function to dump the map to a binary file +bool dumpMetaDataMapToBinary(const std::unordered_map>& map, const std::string& filename) { + + std::ofstream outfile(filename, std::ios::binary); + if (!outfile.is_open()) { + ORT_THROW("Error: Could not open file for writing metadata."); + return false; + } + + // Write the size of the map + size_t map_size = map.size(); + outfile.write(reinterpret_cast(&map_size), sizeof(map_size)); + if (!outfile.good()) { + ORT_THROW("Error: Failed to write map size."); + return false; + } + + // Write each key-value pair + for (const auto& pair : map) { + if (!writeString(outfile, pair.first) || !writeStringVector(outfile, pair.second)) { + ORT_THROW("Error: Failed to write map data."); + return false; + } + } + + return true; +} // Creates a new model without the DQ/Q operators in the src graph. 
Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, @@ -782,14 +831,40 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, } std::sort(const_inits.begin(), const_inits.end()); + // initialize map for creating metadata for initilizers with external weights + std::unordered_map> metadata_map; + + // metadata structure: initializer_name as key + // and [location, offset, length] as value + for (auto& it : const_inits) { const auto* initializer_tensor = initializers.at(it); // Check if the initializer has external data if (initializer_tensor->has_data_location() && initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) { - // Add initializer with external data as input - AddInitializerAsInput(dst_graph, src_graph, it); + if (enable_ovep_weight_sharing) { + + // Cast away const to access mutable_external_data + struct ONNX_NAMESPACE::TensorProto* non_const_initializer_tensor = const_cast(initializer_tensor); + + // get meta data about the initilizers with external data + struct ONNX_NAMESPACE::StringStringEntryProtos* external_data = non_const_initializer_tensor->mutable_external_data(); + + std::vector init_info; + // init_info structure: [location, offset, length] + + for (int i = 0 ; i < external_data->size() ; i++) { + init_info.push_back(*external_data->at(i).mutable_value()); + } + + metadata_map.emplace(initializer_tensor->name(), init_info); + // Add initializer with external data as input + AddInitializerAsInput(dst_graph, src_graph, it); + } else if (initializers_to_keep.count(it)) { + dst_graph.AddInitializedTensor(*initializer_tensor); + } + } else { // Add as an initialized tensor if it does not have external data if (initializers_to_keep.count(it)) { @@ -810,12 +885,30 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, if (src_graph.IsConstantInitializer(input->Name(), true)) { const auto* initializer_tensor = src_graph.GetConstantInitializer(input->Name(), true); 
- // Check if the initializer has external data if (initializer_tensor->has_data_location() && initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) { - // Add initializer as input if it has external data - AddInitializerAsInput(dst_graph, src_graph, input->Name()); + if (enable_ovep_weight_sharing) { + + // Cast away const to access mutable_external_data + struct ONNX_NAMESPACE::TensorProto* non_const_initializer_tensor = const_cast(initializer_tensor); + + // get meta data about the initilizers with external data + struct ONNX_NAMESPACE::StringStringEntryProtos* external_data = non_const_initializer_tensor->mutable_external_data(); + + std::vector init_info; + for (int i = 0 ; i < external_data->size() ; i++) { + init_info.push_back(*external_data->at(i).mutable_value()); + } + + metadata_map.emplace(initializer_tensor->name(), init_info); + + // Add initializer as input if it has external data + AddInitializerAsInput(dst_graph, src_graph, input->Name()); + } else if (initializers_to_keep.count(input->Name())) { + dst_graph.AddInitializedTensor(*initializer_tensor); + } + } else { // Add as an initialized tensor if it does not have external data if (initializers_to_keep.count(input->Name())) { @@ -827,6 +920,15 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, } } } + + if (enable_ovep_weight_sharing) { + // creating bin file of metadata_map and dumping the bin file + dumpMetaDataMapToBinary(metadata_map, "metadata.bin"); + LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Metadata for external initializer dumped."; + } else{ + ORT_THROW("Unable to write metadata to file."); + } + accumulated_inputs.insert(accumulated_inputs.end(), dst_graph_inputs.begin(), dst_graph_inputs.end()); // Set all inputs (original inputs amnd initializers as inputs) of the destination Graph From 7a89c5a0c92871629fdd8d7b096736ef25b2d11e Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Fri, 3 Jan 2025 15:11:01 +0530 Subject: [PATCH 08/35] 
fix error handling in weight sharing --- .../openvino/qdq_transformations/qdq_stripping.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index cca49311c7b15..54cc15fa1ed5b 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -920,13 +920,13 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, } } } - if (enable_ovep_weight_sharing) { // creating bin file of metadata_map and dumping the bin file - dumpMetaDataMapToBinary(metadata_map, "metadata.bin"); - LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Metadata for external initializer dumped."; - } else{ - ORT_THROW("Unable to write metadata to file."); + if (dumpMetaDataMapToBinary(metadata_map, "metadata.bin")) { + LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Metadata for external initializer dumped."; + } else { + ORT_THROW("Error: Unable to write metadat to file."); + } } accumulated_inputs.insert(accumulated_inputs.end(), dst_graph_inputs.begin(), dst_graph_inputs.end()); From ad66ae09c2251bbfebeecd05c401e0443ae064a0 Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Thu, 9 Jan 2025 23:49:10 -0800 Subject: [PATCH 09/35] fix crash issue while setting up inputs for wai model --- .../qdq_transformations/qdq_stripping.cc | 33 ++++++++----------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 54cc15fa1ed5b..def51f34fe2ed 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -625,9 +625,8 @@ static void AddQDQNodeUnit(onnxruntime::Graph& dst_graph, 
KeepInitsInDstGraph(initializers_to_keep, src_graph, &target_node); } -// Keep track of inputs across multiple calls -static std::vector accumulated_inputs; -static void AddInitializerAsInput(onnxruntime::Graph& dst_graph, +static void AddInitializerAsInput (onnxruntime::Graph& dst_graph, + InlinedVector& accumulated_inputs, const onnxruntime::GraphViewer& src_graph, const std::string& initializer_name) { // Get the initializer from source graph @@ -759,6 +758,8 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, } // Will set inputs after deciding fate oif all internal and external initializers + // accumulated_inputs container will store input of the original graph and initializer with ext data + InlinedVector accumulated_inputs; // dst_graph.SetInputs(dst_graph_inputs); dst_graph.SetOutputs(dst_graph_outputs); @@ -842,8 +843,8 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, // Check if the initializer has external data if (initializer_tensor->has_data_location() && - initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) { - if (enable_ovep_weight_sharing) { + initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && + enable_ovep_weight_sharing) { // Cast away const to access mutable_external_data struct ONNX_NAMESPACE::TensorProto* non_const_initializer_tensor = const_cast(initializer_tensor); @@ -860,16 +861,13 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, metadata_map.emplace(initializer_tensor->name(), init_info); // Add initializer with external data as input - AddInitializerAsInput(dst_graph, src_graph, it); - } else if (initializers_to_keep.count(it)) { - dst_graph.AddInitializedTensor(*initializer_tensor); - } + AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, it); } else { // Add as an initialized tensor if it does not have external data - if (initializers_to_keep.count(it)) { - 
dst_graph.AddInitializedTensor(*initializer_tensor); - } + if (initializers_to_keep.count(it)) + dst_graph.AddInitializedTensor(*(initializers.at(it))); + } current_scope_initializer_set.insert(it); @@ -887,8 +885,8 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const auto* initializer_tensor = src_graph.GetConstantInitializer(input->Name(), true); // Check if the initializer has external data if (initializer_tensor->has_data_location() && - initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) { - if (enable_ovep_weight_sharing) { + initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && + enable_ovep_weight_sharing) { // Cast away const to access mutable_external_data struct ONNX_NAMESPACE::TensorProto* non_const_initializer_tensor = const_cast(initializer_tensor); @@ -904,15 +902,12 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, metadata_map.emplace(initializer_tensor->name(), init_info); // Add initializer as input if it has external data - AddInitializerAsInput(dst_graph, src_graph, input->Name()); - } else if (initializers_to_keep.count(input->Name())) { - dst_graph.AddInitializedTensor(*initializer_tensor); - } + AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, input->Name()); } else { // Add as an initialized tensor if it does not have external data if (initializers_to_keep.count(input->Name())) { - dst_graph.AddInitializedTensor(*initializer_tensor); + dst_graph.AddInitializedTensor(*(src_graph.GetConstantInitializer(input->Name(), true))); } } From c17b27647141cb53d8a9e9f7f5df8aecc95e1a31 Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Fri, 10 Jan 2025 00:54:31 -0800 Subject: [PATCH 10/35] pass weight sharing option to OVEP qdq stripping pass --- onnxruntime/core/providers/openvino/backend_manager.cc | 2 +- .../providers/openvino/qdq_transformations/qdq_stripping.cc | 1 + 
.../core/providers/openvino/qdq_transformations/qdq_stripping.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 04c1ffebb838d..cbba5aa7152ba 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -362,7 +362,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, IsQDQGraph(subgraph)) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] QDQ optimization pass status: 1"; std::unique_ptr model; - Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, model); + Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.enable_ovep_weight_sharing, model); auto model_proto = model->ToProto(); model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); print_model_proto_duration(); diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index def51f34fe2ed..7c1e850b0b7a0 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -720,6 +720,7 @@ bool dumpMetaDataMapToBinary(const std::unordered_map& model) { // NOTE: This function is a re-implementation of GraphViewerToProto() in core/graph/graph_proto_serializer.cc // with the following differences: diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h index 94a8eb4d5da17..5b777a388adda 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h @@ -12,6 +12,7 @@ namespace openvino_ep { // Creates a new model without the DQ/Q operators in 
the src graph as per pre-defined rulesets Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, + bool enable_ovep_weight_sharing, /*out*/ std::unique_ptr& model); } // namespace openvino_ep From 5e734f173ab72ed8cc93e7b045e1415b3139cc93 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Thu, 9 Jan 2025 14:52:52 -0800 Subject: [PATCH 11/35] Aligning OVEP variable names to match the session option value they hold --- .../providers/openvino/backend_manager.cc | 10 +-- .../openvino/backends/basic_backend.cc | 74 +++++++++---------- .../core/providers/openvino/contexts.h | 8 +- .../openvino/onnx_ctx_model_helper.cc | 15 ++-- .../openvino/onnx_ctx_model_helper.h | 2 +- .../openvino/openvino_execution_provider.cc | 8 +- .../openvino/openvino_execution_provider.h | 22 +++--- .../openvino/openvino_provider_factory.cc | 32 ++++---- 8 files changed, 85 insertions(+), 86 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index cbba5aa7152ba..790e5f9c4e445 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -140,7 +140,7 @@ BackendManager::BackendManager(const SessionContext& session_context, } catch (const OnnxRuntimeException& ex) { std::string exception_str = ex.what(); bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos && - !session_context_.disable_cpu_fallback && + !session_context_.so_disable_cpu_ep_fallback && !subgraph_context_.is_ep_ctx_graph; #if defined(OPENVINO_DISABLE_NPU_FALLBACK) eligible_for_cpu_fallback = false; @@ -187,7 +187,7 @@ BackendManager::BackendManager(const SessionContext& session_context, } } } - if (session_context_.export_ep_ctx_blob && !subgraph_context_.is_ep_ctx_graph) { + if (session_context_.so_context_enable && !subgraph_context_.is_ep_ctx_graph) { auto status = 
onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph, logger); if ((!status.IsOK())) { @@ -214,7 +214,7 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie // If not embed_mode, dump the blob here and only pass on the path to the blob std::string model_blob_str; auto compiled_model = concrete_backend_->GetOVCompiledModel(); - if (session_context_.ep_context_embed_mode) { + if (session_context_.so_context_embed_mode) { // Internal blob std::ostringstream model_blob_stream; compiled_model.export_model(model_blob_stream); @@ -245,7 +245,7 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie ORT_RETURN_IF_ERROR(ep_ctx_handle_.AddOVEPCtxNodeToGraph(graph_body_viewer, subgraph_context_.subgraph_name, - session_context_.ep_context_embed_mode, + session_context_.so_context_embed_mode, std::move(model_blob_str))); return Status::OK(); @@ -496,7 +496,7 @@ void BackendManager::Compute(OrtKernelContext* context) { ORT_THROW(ex.what()); #else if (session_context_.device_type.find("NPU") != std::string::npos && - !session_context_.disable_cpu_fallback) { + !session_context_.so_disable_cpu_ep_fallback) { LOGS_DEFAULT(WARNING) << ex.what(); LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." 
<< "Falling back to OV CPU for execution"; diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index e9882caa1372b..1c3a3f9e425d4 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -64,9 +64,9 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr remote_context_ = new ov::intel_gpu::ocl::ClContext(session_context_.ie_core.Get(), ctx); if (subgraph_context_.is_ep_ctx_graph) { exe_network_ = session_context_.ie_core.ImportModel(*model_stream, - remote_context_, - subgraph_context_.subgraph_name); - model_stream.reset(); // Delete stream after it is no longer needed + remote_context_, + subgraph_context_.subgraph_name); + model_stream.reset(); // Delete stream after it is no longer needed } else { std::shared_ptr ov_model; { @@ -79,45 +79,45 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled"; exe_network_ = session_context_.ie_core.CompileModel( ov_model, remote_context_, subgraph_context_.subgraph_name); - } + } #else // !IO_BUFFER_ENABLED - auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) || - (session_context_.OpenVINO_Version.at(0) >= 2024 && - session_context_.OpenVINO_Version.at(1) > 2)); - if (subgraph_context_.is_ep_ctx_graph) { - // If the blob is held in an EPContext node, then skip FE+Compile - // and directly move on to creating a backend with the executable blob - exe_network_ = session_context_.ie_core.ImportModel(*model_stream, + auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) || + (session_context_.OpenVINO_Version.at(0) >= 2024 && + session_context_.OpenVINO_Version.at(1) > 2)); + if (subgraph_context_.is_ep_ctx_graph) { + // If the blob is held in an EPContext node, then skip FE+Compile + // and directly move on to creating a backend with the executable blob 
+ exe_network_ = session_context_.ie_core.ImportModel(*model_stream, + hw_target, + device_config, + subgraph_context_.subgraph_name); + model_stream.reset(); // Delete stream after it is no longer needed + } else if (!subgraph_context_.has_external_weights && + !subgraph_context_.has_dynamic_input_shape && + !session_context_.so_context_enable && + auto_unified_compile) { + // Unified OV compile_model is efficient when ov model caching is enabled + // Unified OV compile_model API is supported with AUTO from version 2024.3 and above + // Inputs with static dimenstions + // Not enabled for models with external weights and when ep context is set. + const std::string model = model_proto->SerializeAsString(); + exe_network_ = session_context_.ie_core.CompileModel(model, hw_target, device_config, subgraph_context_.subgraph_name); - model_stream.reset(); // Delete stream after it is no longer needed - } else if (!subgraph_context_.has_external_weights && - !subgraph_context_.has_dynamic_input_shape && - !session_context_.export_ep_ctx_blob && - auto_unified_compile){ - // Unified OV compile_model is efficient when ov model caching is enabled - // Unified OV compile_model API is supported with AUTO from version 2024.3 and above - // Inputs with static dimenstions - // Not enabled for models with external weights and when ep context is set. 
+ } else { // For all other types use ov::core read_model() to generate OV IR + // followed by ov::core compile_model() + std::shared_ptr ov_model; + { const std::string model = model_proto->SerializeAsString(); - exe_network_ = session_context_.ie_core.CompileModel(model, - hw_target, - device_config, - subgraph_context_.subgraph_name); - } else { // For all other types use ov::core read_model() to generate OV IR - // followed by ov::core compile_model() - std::shared_ptr ov_model; - { - const std::string model = model_proto->SerializeAsString(); - if (!subgraph_context.has_dynamic_input_shape) { - delete model_proto.release(); - } - ov_model = CreateOVModel(model, session_context_, subgraph_context_, const_outputs_map_); + if (!subgraph_context.has_dynamic_input_shape) { + delete model_proto.release(); } - exe_network_ = session_context_.ie_core.CompileModel( - ov_model, hw_target, device_config, subgraph_context_.subgraph_name); + ov_model = CreateOVModel(model, session_context_, subgraph_context_, const_outputs_map_); } + exe_network_ = session_context_.ie_core.CompileModel( + ov_model, hw_target, device_config, subgraph_context_.subgraph_name); + } #endif LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin"; } catch (const char* msg) { @@ -178,7 +178,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { } device_config.emplace(ov::device::properties("NPU", device_property)); #if (((OPENVINO_VERSION_MAJOR == 2024) && (OPENVINO_VERSION_MINOR > 3)) || (OPENVINO_VERSION_MAJOR > 2024)) - if (session_context_.export_ep_ctx_blob) { + if (session_context_.so_context_enable) { session_context_.ie_core.Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true)); } #endif @@ -287,7 +287,7 @@ void BasicBackend::EnableCaching(ov::AnyMap& device_config) { // cache_dir argument has no effect when working with an embed-mode EPContext Graph if (subgraph_context_.is_ep_ctx_graph) return; - if (!session_context_.cache_dir.empty() && 
!session_context_.export_ep_ctx_blob) { + if (!session_context_.cache_dir.empty() && !session_context_.so_context_enable) { LOGS_DEFAULT(INFO) << log_tag << "Enables Caching"; if (session_context_.device_type.find("AUTO:GPU") != std::string::npos) { std::pair device_property; diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 95954ae204047..62e2cfcaa9d98 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -18,11 +18,11 @@ struct SessionContext { OVCore ie_core; bool enable_opencl_throttling = false; bool disable_dynamic_shapes = false; - bool ep_context_embed_mode = false; - bool enable_ovep_weight_sharing = false; - bool export_ep_ctx_blob = false; + bool so_context_embed_mode = false; + bool so_share_ep_contexts = false; + bool so_context_enable = false; bool enable_qdq_optimizer = false; - bool disable_cpu_fallback = false; + bool so_disable_cpu_ep_fallback = false; size_t num_of_threads; std::string device_type; std::string precision_str; diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc index 907650257c3f2..1c6b0a0467836 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc @@ -41,7 +41,7 @@ Status EPCtxHandler::ExportEPCtxModel(const std::string& model_name) { Status EPCtxHandler::AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, const std::string& graph_name, - const bool ep_context_embed_mode, + const bool embed_mode, std::string&& model_blob_str) const { auto& graph = epctx_model_->MainGraph(); @@ -66,7 +66,7 @@ Status EPCtxHandler::AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, auto embed_mode_attr = ONNX_NAMESPACE::AttributeProto::Create(); embed_mode_attr->set_name(EMBED_MODE); embed_mode_attr->set_type(onnx::AttributeProto_AttributeType_INT); 
- embed_mode_attr->set_i(ep_context_embed_mode); + embed_mode_attr->set_i(embed_mode); node_attributes->emplace(EMBED_MODE, std::move(*embed_mode_attr)); // ep context @@ -106,17 +106,16 @@ std::unique_ptr EPCtxHandler::GetModelBlobStream(const GraphViewer auto& attrs = node->GetAttributes(); ORT_ENFORCE(attrs.count(EP_CACHE_CONTEXT) == 1); - const auto& ep_cache_context_attribute = attrs.at(EP_CACHE_CONTEXT); - const auto& cache_context = ep_cache_context_attribute.s(); + const auto& ep_cache_context = attrs.at(EP_CACHE_CONTEXT).s(); ORT_ENFORCE(attrs.count(EMBED_MODE) == 1); - bool ep_context_embed_mode = static_cast(attrs.at(EMBED_MODE).i()); + bool embed_mode = static_cast(attrs.at(EMBED_MODE).i()); std::unique_ptr result; - if (ep_context_embed_mode) { - result.reset((std::istream*)new std::istringstream(cache_context)); + if (embed_mode) { + result.reset((std::istream*)new std::istringstream(ep_cache_context)); } else { - result.reset((std::istream*)new std::ifstream(cache_context, std::ios_base::binary | std::ios_base::in)); + result.reset((std::istream*)new std::ifstream(ep_cache_context, std::ios_base::binary | std::ios_base::in)); } LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; return result; diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h index 7e5d5180b363b..f644e2607904d 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h @@ -29,7 +29,7 @@ class EPCtxHandler { bool CheckForOVEPCtxNode(const Node& node) const; Status AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, const std::string& graph_name, - const bool ep_context_embed_mode, + const bool embed_mode, std::string&& model_blob_str) const; std::unique_ptr GetModelBlobStream(const GraphViewer& graph_viewer) const; InlinedVector GetEPCtxNodes() const; diff --git 
a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 817d0817cbfc6..50eed5443b8df 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -25,11 +25,11 @@ openvino_ep::SessionContext GetSessionContext(const OpenVINOExecutionProviderInf openvino_ep::SessionContext result = { .enable_opencl_throttling = info.enable_opencl_throttling_, .disable_dynamic_shapes = info.disable_dynamic_shapes_, - .ep_context_embed_mode = info.so_epctx_embed_mode_, - .enable_ovep_weight_sharing = info.so_enable_ovep_weight_sharing_, - .export_ep_ctx_blob = info.export_ep_ctx_blob_, + .so_context_embed_mode = info.so_context_embed_mode_, + .so_share_ep_contexts = info.so_share_ep_contexts_, + .so_context_enable = info.so_context_enable_, .enable_qdq_optimizer = info.enable_qdq_optimizer_, - .disable_cpu_fallback = info.disable_cpu_fallback_, + .so_disable_cpu_ep_fallback = info.so_disable_cpu_ep_fallback_, .num_of_threads = info.num_of_threads_, .device_type = info.device_type_, .precision_str = info.precision_, diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 1b3990310fc61..5644639c705f8 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -87,11 +87,11 @@ struct OpenVINOExecutionProviderInfo { void* context_{NULL}; bool enable_opencl_throttling_{false}; bool disable_dynamic_shapes_{false}; - bool export_ep_ctx_blob_{false}; + bool so_context_enable_{false}; bool enable_qdq_optimizer_{false}; - bool disable_cpu_fallback_{false}; - bool so_epctx_embed_mode_{false}; - bool so_enable_ovep_weight_sharing_{false}; + bool so_disable_cpu_ep_fallback_{false}; + bool so_context_embed_mode_{false}; + bool 
so_share_ep_contexts_{false}; OpenVINOExecutionProviderInfo() = delete; @@ -101,9 +101,9 @@ struct OpenVINOExecutionProviderInfo { const std::string& cache_dir, const std::string& model_priority, int num_streams, void* context, bool enable_opencl_throttling, - bool disable_dynamic_shapes, bool export_ep_ctx_blob, - bool enable_qdq_optimizer, bool disable_cpu_fallback, - bool so_epctx_embed_mode, bool so_enable_ovep_weight_sharing) + bool disable_dynamic_shapes, bool so_context_enable, + bool enable_qdq_optimizer, bool so_disable_cpu_ep_fallback, + bool so_context_embed_mode, bool so_share_ep_contexts) : precision_(std::move(precision)), num_of_threads_(num_of_threads), load_config_(std::move(load_config)), @@ -113,11 +113,11 @@ struct OpenVINOExecutionProviderInfo { context_(context), enable_opencl_throttling_(enable_opencl_throttling), disable_dynamic_shapes_(disable_dynamic_shapes), - export_ep_ctx_blob_(export_ep_ctx_blob), + so_context_enable_(so_context_enable), enable_qdq_optimizer_(enable_qdq_optimizer), - disable_cpu_fallback_(disable_cpu_fallback), - so_epctx_embed_mode_{so_epctx_embed_mode}, - so_enable_ovep_weight_sharing_{so_enable_ovep_weight_sharing} { + so_disable_cpu_ep_fallback_(so_disable_cpu_ep_fallback), + so_context_embed_mode_{so_context_embed_mode}, + so_share_ep_contexts_{so_share_ep_contexts} { std::set ov_supported_device_types = {"CPU", "GPU", "GPU.0", "GPU.1", "NPU"}; diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 09ee83d3a7cc4..92c4948565a1d 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -51,14 +51,14 @@ struct OpenVINOProviderFactory : IExecutionProviderFactory { }; std::unique_ptr OpenVINOProviderFactory::CreateProvider() { - bool so_disable_cpu_fallback = config_options_.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, 
"0") == "1"; - bool so_export_ep_ctx_blob = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; - bool so_epctx_embed_mode = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; - std::string so_cache_path = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").c_str(); - bool so_enable_ovep_weight_sharing = config_options_.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; - - if (so_export_ep_ctx_blob && !so_cache_path.empty()) { - cache_dir_ = std::move(so_cache_path); + bool so_disable_cpu_ep_fallback = config_options_.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; + bool so_context_enable = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; + bool so_context_embed_mode = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; + std::string so_context_file_path = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").data(); + bool so_share_ep_contexts = config_options_.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; + + if (so_context_enable && !so_context_file_path.empty()) { + cache_dir_ = std::move(so_context_file_path); auto file_path = std::filesystem::path(cache_dir_); // ep_context_file_path_ file extension must be .onnx if (file_path.extension().generic_string() == ".onnx") { @@ -76,8 +76,8 @@ std::unique_ptr OpenVINOProviderFactory::CreateProvider() { OpenVINOExecutionProviderInfo info(device_type_, precision_, num_of_threads_, load_config_, cache_dir_, model_priority_, num_streams_, context_, enable_opencl_throttling_, - disable_dynamic_shapes_, so_export_ep_ctx_blob, enable_qdq_optimizer_, - so_disable_cpu_fallback, so_epctx_embed_mode, so_enable_ovep_weight_sharing); + disable_dynamic_shapes_, so_context_enable, enable_qdq_optimizer_, + so_disable_cpu_ep_fallback, so_context_embed_mode, so_share_ep_contexts); return 
std::make_unique(info); } @@ -272,12 +272,12 @@ struct OpenVINO_Provider : Provider { context = reinterpret_cast(number); } #if defined(IO_BUFFER_ENABLED) - // a valid context must be provided to enable IO Buffer optimizations - if(context==nullptr){ - #undef IO_BUFFER_ENABLED - #define IO_BUFFER_ENABLED=0 - LOGS_DEFAULT(WARNING) << "Context is not set. Disabling IO Buffer optimization"; - } + // a valid context must be provided to enable IO Buffer optimizations + if (context == nullptr) { +#undef IO_BUFFER_ENABLED +#define IO_BUFFER_ENABLED = 0 + LOGS_DEFAULT(WARNING) << "Context is not set. Disabling IO Buffer optimization"; + } #endif if (provider_options_map.find("num_of_threads") != provider_options_map.end()) { From c9fb7577ffe8cc9fa7e05feed438f665fc309a80 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Fri, 10 Jan 2025 12:08:07 -0800 Subject: [PATCH 12/35] Add plumbing for context sharing plus refactoring around option handling --- .../providers/openvino/backend_manager.cc | 14 +- .../core/providers/openvino/backend_manager.h | 3 +- .../openvino/backends/basic_backend.cc | 6 +- .../core/providers/openvino/contexts.h | 76 ++++-- .../openvino/openvino_execution_provider.cc | 153 ++++++++---- .../openvino/openvino_execution_provider.h | 144 +---------- .../openvino/openvino_provider_factory.cc | 228 ++++++------------ 7 files changed, 263 insertions(+), 361 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 790e5f9c4e445..6e308d78ca066 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -46,7 +46,7 @@ BackendManager::BackendManager(const SessionContext& session_context, return ""; } else { auto input_type = graph_viewer.GetInputs()[0]->TypeAsProto()->tensor_type().elem_type(); - if (session_context_.precision_str == "ACCURACY" && + if (session_context_.precision == "ACCURACY" && 
session_context_.device_type.find("GPU") != std::string::npos) { if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) { return "FP32"; @@ -150,7 +150,7 @@ BackendManager::BackendManager(const SessionContext& session_context, LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." << "Falling back to OV CPU for execution"; session_context_.device_type = "CPU"; - session_context_.precision_str = "FP32"; + session_context_.precision = "FP32"; try { concrete_backend_ = BackendFactory::MakeBackend(model_proto, session_context_, @@ -188,8 +188,7 @@ BackendManager::BackendManager(const SessionContext& session_context, } } if (session_context_.so_context_enable && !subgraph_context_.is_ep_ctx_graph) { - auto status = onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph, - logger); + auto status = onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph); if ((!status.IsOK())) { ORT_THROW(status); } @@ -200,8 +199,7 @@ BackendManager::BackendManager(const SessionContext& session_context, // precompiled blob is set. If that's the case: // By default, create model in embed mode where the blob stream is exported as data within // the EPContext node. -Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& graph_body_viewer, - const logging::Logger& logger) { +Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& graph_body_viewer) { if (session_context_.disable_dynamic_shapes && subgraph_context_.has_dynamic_input_shape) { std::string exception_str = "Exporting dynamically compiled models at runtime is not supported. 
" @@ -362,7 +360,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, IsQDQGraph(subgraph)) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] QDQ optimization pass status: 1"; std::unique_ptr model; - Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.enable_ovep_weight_sharing, model); + Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.so_share_ep_contexts, model); auto model_proto = model->ToProto(); model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); print_model_proto_duration(); @@ -501,7 +499,7 @@ void BackendManager::Compute(OrtKernelContext* context) { LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." << "Falling back to OV CPU for execution"; session_context_.device_type = "CPU"; - session_context_.precision_str = "FP32"; + session_context_.precision = "FP32"; key = MakeMapKeyString(tensor_shapes, session_context_.device_type); try { dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes, diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index f77f303c70991..43dc9ceaa558e 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -27,8 +27,7 @@ class BackendManager { void Compute(OrtKernelContext* context); void ShutdownBackendManager(); SessionContext& GetSessionContext(); - Status ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph, - const logging::Logger& logger); + Status ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph); ov::CompiledModel& GetOVCompiledModel(); private: diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 1c3a3f9e425d4..fb0fdc9b5e85b 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ 
b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -140,14 +140,14 @@ bool BasicBackend::ValidateSubgraph(std::map= 2024) { device_config.emplace(ov::hint::inference_precision(ov::element::undefined)); diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 62e2cfcaa9d98..2947d43b4600b 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -13,31 +13,71 @@ namespace onnxruntime { namespace openvino_ep { +namespace fs = std::filesystem; + +struct SharedContext { + struct shared_weight_key { + std::string_view name; + std::string location; + }; + struct shared_weight_value { + unsigned int data_offset; + unsigned int size; + ov::Tensor* tensor; + }; + std::map shared_weight_map; + fs::path bin_pathname; +}; + +using config_t = std::map; + +struct ProviderInfo { + std::string device_type{""}; // [device_type]: Overrides the accelerator hardware type and + // precision with these values at runtime. + std::string precision{""}; // [precision]: Sets the inference precision for execution. + // Supported precision for devices are + // CPU=FP32, GPU=FP32,FP16, NPU=FP16. + // Not setting precision will execute with optimized precision for + // best inference latency. set Precision=ACCURACY for executing + // models with input precision for best accuracy. + uint32_t num_of_threads{0}; // [num_of_threads]: Overrides the accelerator default value of + // number of threads with this value at runtime. + config_t load_config{}; // JSON config map to load custom OV parameters. + fs::path cache_dir{""}; // [cache_dir]: specify the path to + // dump and load the blobs for the model caching/kernel caching + // (GPU) feature. If blob files are already present, + // it will be directly loaded. 
+ std::string model_priority{"DEFAULT"}; // High-level OpenVINO model priority hint + // Defines what model should be provided with more performant + // bounded resource first + uint32_t num_streams{1}; // [num_streams]: Option that specifies the number of parallel + // inference requests to be processed on a given `device_type`. + // Overrides the accelerator default value of number of streams + // with this value at runtime. + void* context{nullptr}; // OpenCL context + bool enable_opencl_throttling{false}; // [enable_opencl_throttling]: Enables OpenCL queue throttling for + // GPU device (Reduces CPU Utilization when using GPU) + bool disable_dynamic_shapes{false}; // [disable_dynamic_shapes]: Rewrite dynamic shaped models to + // static shape at runtime and execute. + bool enable_qdq_optimizer{false}; // Enables QDQ pruning for efficient inference latency with NPU + bool so_context_enable{false}; // ORT session option + bool so_disable_cpu_ep_fallback{false}; // ORT session option + bool so_context_embed_mode{false}; // ORT session option + bool so_share_ep_contexts{false}; // ORT session option +}; + // Holds context applicable to the entire EP instance. 
-struct SessionContext { +struct SessionContext : ProviderInfo { + SessionContext(const ProviderInfo& info) : ProviderInfo{info} {} + OVCore ie_core; - bool enable_opencl_throttling = false; - bool disable_dynamic_shapes = false; - bool so_context_embed_mode = false; - bool so_share_ep_contexts = false; - bool so_context_enable = false; - bool enable_qdq_optimizer = false; - bool so_disable_cpu_ep_fallback = false; - size_t num_of_threads; - std::string device_type; - std::string precision_str; - std::filesystem::path cache_dir; - std::map load_config; - std::string model_priority = "DEFAULT"; - int num_streams; std::vector deviceAvailableList = {true, true, true, true, true, true, true, true}; std::string onnx_model_name; std::string onnx_model_path_name; int onnx_opset_version; - void* context = 0; bool use_api_2; - std::vector OpenVINO_Version = {}; // Ov Major and OV minor version from OV headers - std::string openvino_sdk_version; + const std::vector OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; + const std::string openvino_sdk_version = std::format("{}.{}", OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR); }; // Holds context specific to subgraph. 
diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 50eed5443b8df..ab7604e1344f2 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -18,63 +18,123 @@ #include "core/providers/openvino/ov_allocator.h" #endif -#define MEMCPY_S(dest, src, destsz, srcsz) memcpy(dest, src, std::min(destsz, srcsz)) - namespace onnxruntime { -openvino_ep::SessionContext GetSessionContext(const OpenVINOExecutionProviderInfo& info) { - openvino_ep::SessionContext result = { - .enable_opencl_throttling = info.enable_opencl_throttling_, - .disable_dynamic_shapes = info.disable_dynamic_shapes_, - .so_context_embed_mode = info.so_context_embed_mode_, - .so_share_ep_contexts = info.so_share_ep_contexts_, - .so_context_enable = info.so_context_enable_, - .enable_qdq_optimizer = info.enable_qdq_optimizer_, - .so_disable_cpu_ep_fallback = info.so_disable_cpu_ep_fallback_, - .num_of_threads = info.num_of_threads_, - .device_type = info.device_type_, - .precision_str = info.precision_, - .cache_dir = info.cache_dir_, - .load_config = info.load_config_, - .model_priority = info.model_priority_, - .num_streams = info.num_streams_, - .context = info.context_, - .OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}, - .openvino_sdk_version = std::format("{}.{}", OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR), - }; - return result; +namespace openvino_ep { + +// Parking this code here for now before it's moved to the factory +static std::vector parseDevices(const std::string& device_string, + const std::vector& available_devices) { + std::string comma_separated_devices = device_string; + if (comma_separated_devices.find(":") != std::string::npos) { + comma_separated_devices = comma_separated_devices.substr(comma_separated_devices.find(":") + 1); + } + auto devices = 
split(comma_separated_devices, ',');
+  if (devices.size() < 2) {
+    print_build_options();
+    ORT_THROW("Invalid device string: " + device_string);
+  }
+  std::set<std::string> dev_options = {"CPU", "GPU", "NPU"};
+
+  for (auto& device : available_devices) {
+    if (dev_options.find(device) == dev_options.end()) {
+      dev_options.emplace(device);
+    }
+  }
+
+  for (const std::string& dev : devices) {
+    if (!std::count(dev_options.begin(), dev_options.end(), dev)) {
+      print_build_options();
+      ORT_THROW("Invalid device string: " + device_string);
+    }
+  }
+  return devices;
+}
+
+// Parking this code here for now before it's moved to the factory
+void AdjustProviderInfo(ProviderInfo& info) {
+  std::set<std::string> ov_supported_device_types = {"CPU", "GPU",
+                                                     "GPU.0", "GPU.1", "NPU"};
+
+  OVDevices devices;
+  std::vector<std::string> available_devices = devices.get_ov_devices();
+
+  for (auto& device : available_devices) {
+    if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) {
+      ov_supported_device_types.emplace(device);
+    }
+  }
+
+  if (info.device_type == "") {
+    LOGS_DEFAULT(INFO) << "[OpenVINO-EP]"
+                       << "No runtime device selection option provided.";
+#if defined OPENVINO_CONFIG_CPU
+    info.device_type = "CPU";
+    info.precision = "FP32";
+#elif defined OPENVINO_CONFIG_GPU
+    info.device_type = "GPU";
+    info.precision = "FP16";
+#elif defined OPENVINO_CONFIG_NPU
+    info.device_type = "NPU";
+    info.precision = "FP16";
+#elif defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO
+#ifdef DEVICE_NAME
+#define DEVICE DEVICE_NAME
+#endif
+    std::string dev_type = DEVICE;
+
+    if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) {
+      std::vector<std::string> devices = parseDevices(dev_type, available_devices);
+      info.precision = "FP16";
+      if (devices[0] == "CPU") {
+        info.precision = "FP32";
+      }
+      info.device_type = std::move(dev_type);
+    }
+#endif
+  } else if 
(ov_supported_device_types.find(info.device_type) != ov_supported_device_types.end()) {
+    // device_type is already a supported plain device string; nothing to adjust
+  } else if (info.device_type.find("HETERO") == 0 || info.device_type.find("MULTI") == 0 || info.device_type.find("AUTO") == 0) {
+    std::ignore = parseDevices(info.device_type, available_devices);  // validate combined device string
+  } else {
+    ORT_THROW("Invalid device string: " + info.device_type);
+  }
+  LOGS_DEFAULT(INFO) << "[OpenVINO-EP]"
+                     << "Choosing Device: " << info.device_type << " , Precision: " << info.precision;
 }
 
-OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProviderInfo& info)
+OpenVINOExecutionProvider::OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext* shared_context)
     : IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider},
-      session_context_{GetSessionContext(info)},
+      session_context_(info),
+      shared_context_{shared_context},
       ep_ctx_handle_{session_context_.openvino_sdk_version, *GetLogger()} {
   InitProviderOrtApi();
   // to check if target device is available
   // using ie_core capability GetAvailableDevices to fetch list of devices plugged in
-  if (info.cache_dir_.empty()) {
+  if (info.cache_dir.empty()) {
     bool device_found = false;
     std::vector<std::string> available_devices = session_context_.ie_core.GetAvailableDevices();
     // Checking for device_type configuration
-    if (info.device_type_ != "") {
-      if (info.device_type_.find("HETERO") != std::string::npos ||
-          info.device_type_.find("MULTI") != std::string::npos ||
-          info.device_type_.find("AUTO") != std::string::npos) {
+    if (info.device_type != "") {
+      if (info.device_type.find("HETERO") != std::string::npos ||
+          info.device_type.find("MULTI") != std::string::npos ||
+          info.device_type.find("AUTO") != std::string::npos) {
         device_found = true;
       } else {
         for (const std::string& device : available_devices) {
-          if (device.rfind(info.device_type_, 0) == 0) {
-            if (info.device_type_.find("GPU") != 
std::string::npos && (info.precision_ == "FP32" || - info.precision_ == "FP16" || - info.precision_ == "ACCURACY")) { + if (device.rfind(info.device_type, 0) == 0) { + if (info.device_type.find("GPU") != std::string::npos && (info.precision == "FP32" || + info.precision == "FP16" || + info.precision == "ACCURACY")) { device_found = true; break; } - if (info.device_type_ == "CPU" && (info.precision_ == "FP32")) { + if (info.device_type == "CPU" && (info.precision == "FP32")) { device_found = true; break; } - if (info.device_type_.find("NPU") != std::string::npos) { + if (info.device_type.find("NPU") != std::string::npos) { device_found = true; break; } @@ -83,7 +143,7 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv } } if (!device_found) { - ORT_THROW("[ERROR] [OpenVINO] Specified device - " + info.device_type_ + " is not available"); + ORT_THROW("[ERROR] [OpenVINO] Specified device - " + info.device_type + " is not available"); } } } @@ -118,6 +178,13 @@ common::Status OpenVINOExecutionProvider::Compile( session_context_.onnx_opset_version = fused_nodes[0].filtered_graph.get().DomainToVersionMap().at(kOnnxDomain); + struct OpenVINOEPFunctionState { + AllocateFunc allocate_func = nullptr; + DestroyFunc destroy_func = nullptr; + AllocatorHandle allocator_handle = nullptr; + BackendManager& backend_manager; + }; + for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) { const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; const Node& fused_node = fused_node_graph.fused_node; @@ -138,13 +205,15 @@ common::Status OpenVINOExecutionProvider::Compile( compute_info.create_state_func = [&backend_manager](ComputeContext* context, FunctionState* state) { - OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState(backend_manager); - p->allocate_func = context->allocate_func; - p->destroy_func = context->release_func; - p->allocator_handle = context->allocator_handle; + OpenVINOEPFunctionState* p = new 
OpenVINOEPFunctionState{ + .allocate_func = context->allocate_func, + .destroy_func = context->release_func, + .allocator_handle = context->allocator_handle, + .backend_manager = backend_manager}; *state = static_cast(p); return 0; }; + compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) { auto function_state = static_cast(state); try { @@ -162,6 +231,7 @@ common::Status OpenVINOExecutionProvider::Compile( delete function_state; } }; + node_compute_funcs.push_back(compute_info); if (!status.IsOK()) { @@ -234,4 +304,5 @@ const InlinedVector OpenVINOExecutionProvider::GetEpContextNodes() return ep_ctx_handle_.GetEPCtxNodes(); } +} // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 5644639c705f8..d35dc5513ed1d 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -13,8 +13,10 @@ #include #include "core/providers/openvino/backend_manager.h" +#include "core/providers/openvino/contexts.h" namespace onnxruntime { +namespace openvino_ep { struct OVDevices { ov::Core core; @@ -47,141 +49,10 @@ static std::vector split(const std::string& s, char delim) { return result; } -static std::vector parseDevices(const std::string& device_string, - const std::vector& available_devices) { - std::string comma_separated_devices = device_string; - if (comma_separated_devices.find(":") != std::string::npos) { - comma_separated_devices = comma_separated_devices.substr(comma_separated_devices.find(":") + 1); - } - auto devices = split(comma_separated_devices, ','); - if (devices.size() < 2) { - print_build_options(); - ORT_THROW("Invalid device string: " + device_string); - } - std::set dev_options = {"CPU", "GPU", "NPU"}; - - for (auto& device : available_devices) { - if 
(dev_options.find(device) == dev_options.end()) { - auto dev_options_update = dev_options.emplace(device); - } - } - - for (const std::string& dev : devices) { - if (!std::count(dev_options.begin(), dev_options.end(), dev)) { - print_build_options(); - ORT_THROW("Invalid device string: " + device_string); - } - } - return devices; -} - -// Information needed to construct OpenVINO execution providers. -struct OpenVINOExecutionProviderInfo { - std::string device_type_{""}; - std::string precision_{""}; - size_t num_of_threads_{0}; - std::map load_config_{}; - std::string cache_dir_{""}; - std::string model_priority_{""}; - int num_streams_{1}; - void* context_{NULL}; - bool enable_opencl_throttling_{false}; - bool disable_dynamic_shapes_{false}; - bool so_context_enable_{false}; - bool enable_qdq_optimizer_{false}; - bool so_disable_cpu_ep_fallback_{false}; - bool so_context_embed_mode_{false}; - bool so_share_ep_contexts_{false}; - - OpenVINOExecutionProviderInfo() = delete; - - explicit OpenVINOExecutionProviderInfo(std::string dev_type, const std::string& precision, - size_t num_of_threads, - const std::map& load_config, - const std::string& cache_dir, - const std::string& model_priority, int num_streams, - void* context, bool enable_opencl_throttling, - bool disable_dynamic_shapes, bool so_context_enable, - bool enable_qdq_optimizer, bool so_disable_cpu_ep_fallback, - bool so_context_embed_mode, bool so_share_ep_contexts) - : precision_(std::move(precision)), - num_of_threads_(num_of_threads), - load_config_(std::move(load_config)), - cache_dir_(std::move(cache_dir)), - model_priority_(std::move(model_priority)), - num_streams_(num_streams), - context_(context), - enable_opencl_throttling_(enable_opencl_throttling), - disable_dynamic_shapes_(disable_dynamic_shapes), - so_context_enable_(so_context_enable), - enable_qdq_optimizer_(enable_qdq_optimizer), - so_disable_cpu_ep_fallback_(so_disable_cpu_ep_fallback), - so_context_embed_mode_{so_context_embed_mode}, - 
so_share_ep_contexts_{so_share_ep_contexts} { - std::set ov_supported_device_types = {"CPU", "GPU", - "GPU.0", "GPU.1", "NPU"}; - - OVDevices devices; - std::vector available_devices = devices.get_ov_devices(); - - for (auto& device : available_devices) { - if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) { - ov_supported_device_types.emplace(device); - } - } - - if (dev_type == "") { - LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" - << "No runtime device selection option provided."; -#if defined OPENVINO_CONFIG_CPU - device_type_ = "CPU"; - precision_ = "FP32"; -#elif defined OPENVINO_CONFIG_GPU - device_type_ = "GPU"; - precision_ = "FP16"; -#elif defined OPENVINO_CONFIG_NPU - device_type_ = "NPU"; - precision_ = "FP16"; -#elif defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO -#ifdef DEVICE_NAME -#define DEVICE DEVICE_NAME -#endif - dev_type = DEVICE; - - if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) { - std::vector devices = parseDevices(dev_type, available_devices); - precision_ = "FP16"; - if (devices[0] == "CPU") { - precision_ = "FP32"; - } - device_type_ = std::move(dev_type); - } -#endif - } else if (ov_supported_device_types.find(dev_type) != ov_supported_device_types.end()) { - device_type_ = std::move(dev_type); - } else if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) { - std::vector devices = parseDevices(dev_type, available_devices); - device_type_ = std::move(dev_type); - } else { - ORT_THROW("Invalid device string: " + dev_type); - } - LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" - << "Choosing Device: " << device_type_ << " , Precision: " << precision_; - } -}; - -struct OpenVINOEPFunctionState { - OpenVINOEPFunctionState(openvino_ep::BackendManager& bm) : backend_manager(bm) {} - AllocateFunc allocate_func = nullptr; - DestroyFunc destroy_func = nullptr; - AllocatorHandle allocator_handle = 
nullptr; - openvino_ep::BackendManager& backend_manager; -}; - // Logical device representation. class OpenVINOExecutionProvider : public IExecutionProvider { public: - explicit OpenVINOExecutionProvider(const OpenVINOExecutionProviderInfo& info); + explicit OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext* shared_context = nullptr); ~OpenVINOExecutionProvider() = default; std::vector> @@ -204,10 +75,11 @@ class OpenVINOExecutionProvider : public IExecutionProvider { std::vector CreatePreferredAllocators() override; #endif private: - openvino_ep::SessionContext session_context_; - std::list backend_managers_; // EP session owns the backend objects - - openvino_ep::EPCtxHandler ep_ctx_handle_; + SessionContext session_context_; + SharedContext* shared_context_{nullptr}; + std::list backend_managers_; // EP session owns the backend objects + EPCtxHandler ep_ctx_handle_; }; +} // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 92c4948565a1d..2028979c1c87d 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -7,83 +7,29 @@ #include "core/providers/openvino/openvino_provider_factory.h" #include "core/providers/openvino/openvino_execution_provider.h" #include "core/providers/openvino/openvino_provider_factory_creator.h" +#include "core/providers/openvino/contexts.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "nlohmann/json.hpp" namespace onnxruntime { +namespace openvino_ep { struct OpenVINOProviderFactory : IExecutionProviderFactory { - OpenVINOProviderFactory(const std::string& device_type, const std::string& precision, - size_t num_of_threads, - const std::map& load_config, const std::string& cache_dir, - const std::string& model_priority, int num_streams, void* context, - 
bool enable_opencl_throttling, bool disable_dynamic_shapes, - bool enable_qdq_optimizer, const ConfigOptions& config_options) - : device_type_(device_type), - precision_(precision), - num_of_threads_(num_of_threads), - load_config_(load_config), - cache_dir_(cache_dir), - model_priority_(model_priority), - num_streams_(num_streams), - context_(context), - enable_opencl_throttling_(enable_opencl_throttling), - disable_dynamic_shapes_(disable_dynamic_shapes), - enable_qdq_optimizer_(enable_qdq_optimizer), - config_options_(config_options) {} + OpenVINOProviderFactory(ProviderInfo provider_info, SharedContext* shared_context) + : provider_info_(provider_info), shared_context_(shared_context) {} ~OpenVINOProviderFactory() override {} std::unique_ptr CreateProvider() override; private: - std::string device_type_; - std::string precision_; - size_t num_of_threads_; - const std::map load_config_; - std::string cache_dir_; - std::string model_priority_; - int num_streams_; - void* context_; - bool enable_opencl_throttling_; - bool disable_dynamic_shapes_; - bool enable_qdq_optimizer_; - const ConfigOptions& config_options_; + ProviderInfo provider_info_; + SharedContext* shared_context_; }; std::unique_ptr OpenVINOProviderFactory::CreateProvider() { - bool so_disable_cpu_ep_fallback = config_options_.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; - bool so_context_enable = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; - bool so_context_embed_mode = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; - std::string so_context_file_path = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").data(); - bool so_share_ep_contexts = config_options_.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; - - if (so_context_enable && !so_context_file_path.empty()) { - cache_dir_ = std::move(so_context_file_path); - auto file_path = 
std::filesystem::path(cache_dir_); - // ep_context_file_path_ file extension must be .onnx - if (file_path.extension().generic_string() == ".onnx") { - // ep_context_file_path_ must be provided as a directory, create it if doesn't exist - auto parent_path = file_path.parent_path(); - if (!parent_path.empty() && !std::filesystem::is_directory(parent_path) && - !std::filesystem::create_directory(parent_path)) { - ORT_THROW("[ERROR] [OpenVINO] Failed to create directory : " + - file_path.parent_path().generic_string() + " \n"); - } - } else { - ORT_THROW("[ERROR] [OpenVINO] Invalid ep_ctx_file_path" + cache_dir_ + " \n"); - } - } - - OpenVINOExecutionProviderInfo info(device_type_, precision_, num_of_threads_, load_config_, - cache_dir_, model_priority_, num_streams_, context_, enable_opencl_throttling_, - disable_dynamic_shapes_, so_context_enable, enable_qdq_optimizer_, - so_disable_cpu_ep_fallback, so_context_embed_mode, so_share_ep_contexts); - return std::make_unique(info); + return std::make_unique(provider_info_, shared_context_); } -} // namespace onnxruntime - -namespace onnxruntime { struct ProviderInfo_OpenVINO_Impl : ProviderInfo_OpenVINO { std::vector GetAvailableDevices() const override { openvino_ep::OVCore ie_core; @@ -96,43 +42,16 @@ struct OpenVINO_Provider : Provider { std::shared_ptr CreateExecutionProviderFactory(const void* void_params) override { // Extract the void_params into ProviderOptions and ConfigOptions - typedef std::pair ConfigBuffer; + using ConfigBuffer = std::pair; const ConfigBuffer* buffer = reinterpret_cast(void_params); - auto& provider_options_map = *buffer->first; - const ConfigOptions& config_options = buffer->second; - - std::string device_type = ""; // [device_type]: Overrides the accelerator hardware type and - // precision with these values at runtime. - std::string precision = ""; // [precision]: Sets the inference precision for execution. - // Supported precision for devices are - // CPU=FP32, GPU=FP32,FP16, NPU=FP16. 
- // Not setting precision will execute with optimized precision for - // best inference latency. set Precision=ACCURACY for executing - // models with input precision for best accuracy. - int num_of_threads = 0; // [num_of_threads]: Overrides the accelerator default value of - // number of threads with this value at runtime. - std::map load_config; // JSON config map to load custom OV parameters. - std::string cache_dir = ""; // [cache_dir]: specify the path to - // dump and load the blobs for the model caching/kernel caching - // (GPU) feature. If blob files are already present, - // it will be directly loaded. - std::string model_priority = "DEFAULT"; // High-level OpenVINO model priority hint - // Defines what model should be provided with more performant - // bounded resource first - int num_streams = 1; // [num_streams]: Option that specifies the number of parallel - // inference requests to be processed on a given `device_type`. - // Overrides the accelerator default value of number of streams - // with this value at runtime. 
- bool enable_opencl_throttling = false; // [enable_opencl_throttling]: Enables OpenCL queue throttling for - // GPU device (Reduces CPU Utilization when using GPU) - - bool enable_qdq_optimizer = false; // Enables QDQ pruning for efficient inference latency with NPU - - void* context = nullptr; + const auto& provider_options_map = *buffer->first; + const auto& config_options = buffer->second; + + ProviderInfo pi; std::string bool_flag = ""; if (provider_options_map.find("device_type") != provider_options_map.end()) { - device_type = provider_options_map.at("device_type").c_str(); + pi.device_type = provider_options_map.at("device_type").c_str(); std::set ov_supported_device_types = {"CPU", "GPU", "GPU.0", "GPU.1", "NPU"}; @@ -147,20 +66,20 @@ struct OpenVINO_Provider : Provider { ov_supported_device_types.emplace(device); } } - if (deprecated_device_types.find(device_type) != deprecated_device_types.end()) { - std::string deprecated_device = device_type; - int delimit = device_type.find("_"); - device_type = deprecated_device.substr(0, delimit); - precision = deprecated_device.substr(delimit + 1); + if (deprecated_device_types.find(pi.device_type) != deprecated_device_types.end()) { + std::string deprecated_device = pi.device_type; + int delimit = pi.device_type.find("_"); + pi.device_type = deprecated_device.substr(0, delimit); + pi.precision = deprecated_device.substr(delimit + 1); LOGS_DEFAULT(WARNING) << "[OpenVINO] Selected 'device_type' " + deprecated_device + " is deprecated. \n" << "Update the 'device_type' to specified types 'CPU', 'GPU', 'GPU.0', " << "'GPU.1', 'NPU' or from" << " HETERO/MULTI/AUTO options and set 'precision' separately. 
\n"; } - if (!((ov_supported_device_types.find(device_type) != ov_supported_device_types.end()) || - (device_type.find("HETERO:") == 0) || - (device_type.find("MULTI:") == 0) || - (device_type.find("AUTO:") == 0))) { + if (!((ov_supported_device_types.find(pi.device_type) != ov_supported_device_types.end()) || + (pi.device_type.find("HETERO:") == 0) || + (pi.device_type.find("MULTI:") == 0) || + (pi.device_type.find("AUTO:") == 0))) { ORT_THROW( "[ERROR] [OpenVINO] You have selected wrong configuration value for the key 'device_type'. " "Select from 'CPU', 'GPU', 'NPU', 'GPU.x' where x = 0,1,2 and so on or from" @@ -172,36 +91,36 @@ struct OpenVINO_Provider : Provider { LOGS_DEFAULT(WARNING) << "[OpenVINO] The options 'device_id' is deprecated. " << "Upgrade to set deice_type and precision session options.\n"; if (dev_id == "CPU" || dev_id == "GPU" || dev_id == "NPU") { - device_type = std::move(dev_id); + pi.device_type = std::move(dev_id); } else { ORT_THROW("[ERROR] [OpenVINO] Unsupported device_id is selected. Select from available options."); } } if (provider_options_map.find("precision") != provider_options_map.end()) { - precision = provider_options_map.at("precision").c_str(); + pi.precision = provider_options_map.at("precision").c_str(); } - if (device_type.find("GPU") != std::string::npos) { - if (precision == "") { - precision = "FP16"; - } else if (precision != "ACCURACY" && precision != "FP16" && precision != "FP32") { + if (pi.device_type.find("GPU") != std::string::npos) { + if (pi.precision == "") { + pi.precision = "FP16"; + } else if (pi.precision != "ACCURACY" && pi.precision != "FP16" && pi.precision != "FP32") { ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. GPU only supports FP32 / FP16. 
\n"); } - } else if (device_type.find("NPU") != std::string::npos) { - if (precision == "" || precision == "ACCURACY" || precision == "FP16") { - precision = "FP16"; + } else if (pi.device_type.find("NPU") != std::string::npos) { + if (pi.precision == "" || pi.precision == "ACCURACY" || pi.precision == "FP16") { + pi.precision = "FP16"; } else { ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. NPU only supported FP16. \n"); } - } else if (device_type.find("CPU") != std::string::npos) { - if (precision == "" || precision == "ACCURACY" || precision == "FP32") { - precision = "FP32"; + } else if (pi.device_type.find("CPU") != std::string::npos) { + if (pi.precision == "" || pi.precision == "ACCURACY" || pi.precision == "FP32") { + pi.precision = "FP32"; } else { ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. CPU only supports FP32 . \n"); } } if (provider_options_map.find("cache_dir") != provider_options_map.end()) { - cache_dir = provider_options_map.at("cache_dir"); + pi.cache_dir = provider_options_map.at("cache_dir"); } if (provider_options_map.find("load_config") != provider_options_map.end()) { @@ -263,13 +182,13 @@ struct OpenVINO_Provider : Provider { return target_map; }; - load_config = parse_config(provider_options_map.at("load_config")); + pi.load_config = parse_config(provider_options_map.at("load_config")); } if (provider_options_map.find("context") != provider_options_map.end()) { std::string str = provider_options_map.at("context"); uint64_t number = std::strtoull(str.c_str(), nullptr, 16); - context = reinterpret_cast(number); + pi.context = reinterpret_cast(number); } #if defined(IO_BUFFER_ENABLED) // a valid context must be provided to enable IO Buffer optimizations @@ -285,20 +204,20 @@ struct OpenVINO_Provider : Provider { provider_options_map.at("num_of_threads").end(), ::isdigit)) { ORT_THROW("[ERROR] [OpenVINO-EP] Number of threads should be a number. 
\n"); } - num_of_threads = std::stoi(provider_options_map.at("num_of_threads")); - if (num_of_threads <= 0) { - num_of_threads = 1; + pi.num_of_threads = std::stoi(provider_options_map.at("num_of_threads")); + if (pi.num_of_threads <= 0) { + pi.num_of_threads = 1; LOGS_DEFAULT(WARNING) << "[OpenVINO-EP] The value for the key 'num_threads' should be in the positive range.\n " << "Executing with num_threads=1"; } } if (provider_options_map.find("model_priority") != provider_options_map.end()) { - model_priority = provider_options_map.at("model_priority").c_str(); + pi.model_priority = provider_options_map.at("model_priority").c_str(); std::vector supported_priorities({"LOW", "MEDIUM", "HIGH", "DEFAULT"}); if (std::find(supported_priorities.begin(), supported_priorities.end(), - model_priority) == supported_priorities.end()) { - model_priority = "DEFAULT"; + pi.model_priority) == supported_priorities.end()) { + pi.model_priority = "DEFAULT"; LOGS_DEFAULT(WARNING) << "[OpenVINO-EP] The value for the key 'model_priority' " << "is not one of LOW, MEDIUM, HIGH, DEFAULT. 
" << "Executing with model_priorty=DEFAULT"; @@ -306,9 +225,9 @@ struct OpenVINO_Provider : Provider { } if (provider_options_map.find("num_streams") != provider_options_map.end()) { - num_streams = std::stoi(provider_options_map.at("num_streams")); - if (num_streams <= 0) { - num_streams = 1; + pi.num_streams = std::stoi(provider_options_map.at("num_streams")); + if (pi.num_streams <= 0) { + pi.num_streams = 1; LOGS_DEFAULT(WARNING) << "[OpenVINO-EP] The value for the key 'num_streams' should be in the range of 1-8.\n " << "Executing with num_streams=1"; } @@ -316,57 +235,56 @@ struct OpenVINO_Provider : Provider { if (provider_options_map.find("enable_opencl_throttling") != provider_options_map.end()) { bool_flag = provider_options_map.at("enable_opencl_throttling"); if (bool_flag == "true" || bool_flag == "True") - enable_opencl_throttling = true; + pi.enable_opencl_throttling = true; else if (bool_flag == "false" || bool_flag == "False") - enable_opencl_throttling = false; + pi.enable_opencl_throttling = false; bool_flag = ""; } if (provider_options_map.find("enable_qdq_optimizer") != provider_options_map.end()) { bool_flag = provider_options_map.at("enable_qdq_optimizer"); if (bool_flag == "true" || bool_flag == "True") - enable_qdq_optimizer = true; + pi.enable_qdq_optimizer = true; else if (bool_flag == "false" || bool_flag == "False") - enable_qdq_optimizer = false; + pi.enable_qdq_optimizer = false; else ORT_THROW("[ERROR] [OpenVINO-EP] enable_qdq_optimiser should be a boolean.\n"); bool_flag = ""; } - // [disable_dynamic_shapes]: Rewrite dynamic shaped models to static shape at runtime and execute. - // Always true for NPU plugin. - bool disable_dynamic_shapes = false; - if (device_type.find("NPU") != std::string::npos) { - disable_dynamic_shapes = true; + // Always true for NPU plugin or when passed . 
+ if (pi.device_type.find("NPU") != std::string::npos) { + pi.disable_dynamic_shapes = true; } if (provider_options_map.find("disable_dynamic_shapes") != provider_options_map.end()) { bool_flag = provider_options_map.at("disable_dynamic_shapes"); if (bool_flag == "true" || bool_flag == "True") { - disable_dynamic_shapes = true; + pi.disable_dynamic_shapes = true; } else if (bool_flag == "false" || bool_flag == "False") { - if (device_type.find("NPU") != std::string::npos) { - disable_dynamic_shapes = true; + if (pi.device_type.find("NPU") != std::string::npos) { + pi.disable_dynamic_shapes = true; LOGS_DEFAULT(INFO) << "[OpenVINO-EP] The value for the key 'disable_dynamic_shapes' will be set to " << "TRUE for NPU backend.\n "; } else { - disable_dynamic_shapes = false; + pi.disable_dynamic_shapes = false; } } bool_flag = ""; } - return std::make_shared(device_type, - precision, - num_of_threads, - load_config, - cache_dir, - model_priority, - num_streams, - context, - enable_opencl_throttling, - disable_dynamic_shapes, - enable_qdq_optimizer, - config_options); + pi.so_disable_cpu_ep_fallback = config_options.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; + pi.so_context_enable = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; + pi.so_context_embed_mode = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; + pi.so_share_ep_contexts = config_options.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; + std::string so_context_file_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").data(); + + if (pi.so_context_enable && !so_context_file_path.empty()) { + pi.cache_dir = std::move(so_context_file_path); + } + + SharedContext* shared_context = pi.so_share_ep_contexts ? 
&shared_context_ : nullptr; + + return std::make_shared(pi, shared_context); } void Initialize() override { @@ -374,13 +292,17 @@ struct OpenVINO_Provider : Provider { void Shutdown() override { } + + private: + SharedContext shared_context_; } g_provider; +} // namespace openvino_ep } // namespace onnxruntime extern "C" { ORT_API(onnxruntime::Provider*, GetProvider) { - return &onnxruntime::g_provider; + return &onnxruntime::openvino_ep::g_provider; } } From a78166d591532b518d1d29ff9c7c23789d7fe2d7 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Sun, 12 Jan 2025 22:06:13 -0800 Subject: [PATCH 13/35] Store metadata in shared context --- .../providers/openvino/backend_manager.cc | 9 +- .../core/providers/openvino/backend_manager.h | 6 +- .../core/providers/openvino/backend_utils.cc | 2 +- .../core/providers/openvino/contexts.h | 35 ++- .../openvino/openvino_execution_provider.cc | 14 +- .../openvino/openvino_execution_provider.h | 4 +- .../openvino/openvino_provider_factory.cc | 13 +- .../qdq_transformations/qdq_stripping.cc | 257 +++++++++--------- .../qdq_transformations/qdq_stripping.h | 7 +- 9 files changed, 191 insertions(+), 156 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 6e308d78ca066..c06e00272a8c8 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -32,11 +32,14 @@ ov::CompiledModel& BackendManager::GetOVCompiledModel() { return (ov_ptr); } -BackendManager::BackendManager(const SessionContext& session_context, +BackendManager::BackendManager(SessionContext& session_context, + SharedContext& shared_context, const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger, - EPCtxHandler& ep_ctx_handle) : ep_ctx_handle_(ep_ctx_handle), session_context_(session_context) { + EPCtxHandler& ep_ctx_handle) : 
ep_ctx_handle_(ep_ctx_handle), + session_context_(session_context), + shared_context_{shared_context} { subgraph_context_.is_ep_ctx_graph = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(subgraph); subgraph_context_.model_precision = [&](const GraphViewer& graph_viewer) { @@ -360,7 +363,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, IsQDQGraph(subgraph)) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] QDQ optimization pass status: 1"; std::unique_ptr model; - Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.so_share_ep_contexts, model); + Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.so_share_ep_contexts, model, shared_context_.shared_weights); auto model_proto = model->ToProto(); model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); print_model_proto_duration(); diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index 43dc9ceaa558e..cdc27701ec2e6 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -19,7 +19,8 @@ namespace openvino_ep { // Singleton class that manages all the backends class BackendManager { public: - BackendManager(const SessionContext& session_context, + BackendManager(SessionContext& session_context, + SharedContext& shared_context, const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger, @@ -51,7 +52,8 @@ class BackendManager { std::map> backend_map_; SubGraphContext subgraph_context_; EPCtxHandler& ep_ctx_handle_; - SessionContext session_context_; + SessionContext& session_context_; + SharedContext& shared_context_; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index c447a7847434a..4adf9f5b89833 100644 --- 
a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -48,7 +48,7 @@ CreateOVModel(const std::string model, std::cout << "CreateNgraphFunc" << std::endl; } try { - auto ov_model = session_context.ie_core.ReadModel(model, session_context.onnx_model_path_name); + auto ov_model = session_context.ie_core.ReadModel(model, session_context.onnx_model_path_name.string()); // Check for Constant Folding if ((session_context.device_type != "NPU") && !subgraph_context.is_wholly_supported_graph) { diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 2947d43b4600b..f96fec345eef1 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -16,17 +16,28 @@ namespace openvino_ep { namespace fs = std::filesystem; struct SharedContext { - struct shared_weight_key { - std::string_view name; - std::string location; - }; - struct shared_weight_value { - unsigned int data_offset; - unsigned int size; - ov::Tensor* tensor; - }; - std::map shared_weight_map; - fs::path bin_pathname; + struct SharedWeights { + struct Metadata { + struct Key { + std::string name; + bool operator==(const Key&) const = default; + }; + struct KeyHash { + std::size_t operator()(const Key& key) const noexcept { + return std::hash()(key.name); + } + }; + struct Value { + std::string location; + unsigned int data_offset; + unsigned int size; + ov::Tensor* tensor; + }; + using Map = std::unordered_map; + }; + Metadata::Map metadata; + fs::path external_weight_filename; + } shared_weights; }; using config_t = std::map; @@ -73,7 +84,7 @@ struct SessionContext : ProviderInfo { OVCore ie_core; std::vector deviceAvailableList = {true, true, true, true, true, true, true, true}; std::string onnx_model_name; - std::string onnx_model_path_name; + std::filesystem::path onnx_model_path_name; int onnx_opset_version; bool use_api_2; const std::vector 
OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index ab7604e1344f2..684a2c64237b8 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -12,6 +12,7 @@ #include "core/providers/openvino/backend_manager.h" #include "core/providers/openvino/onnx_ctx_model_helper.h" #include "core/providers/openvino/ov_versions/capability.h" +#include "core/providers/openvino/qdq_transformations/qdq_stripping.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "openvino/core/version.hpp" #ifdef USE_OVEP_NPU_MEMORY @@ -103,7 +104,7 @@ void AdjustProviderInfo(ProviderInfo& info) { << "Choosing Device: " << info.device_type << " , Precision: " << info.precision; } -OpenVINOExecutionProvider::OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext* shared_context) +OpenVINOExecutionProvider::OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext& shared_context) : IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider}, session_context_(info), shared_context_{shared_context}, @@ -198,6 +199,7 @@ common::Status OpenVINOExecutionProvider::Compile( // For original model, check if the user wants to export a model with pre-compiled blob auto& backend_manager = backend_managers_.emplace_back(session_context_, + shared_context_, fused_node, graph_body_viewer, logger, @@ -239,6 +241,16 @@ common::Status OpenVINOExecutionProvider::Compile( } } + if (session_context_.so_share_ep_contexts && session_context_.so_context_enable && !session_context_.cache_dir.empty()) { + // Metadata is generated only for shared contexts + // If metadata is generated then only save it if also saving epcontext (so_context_enable) + // If saving metadata then save it to the provided path + 
std::filesystem::path metadata_name = session_context_.cache_dir.parent_path(); + metadata_name /= session_context_.cache_dir.stem().string() + "_metadata"; + metadata_name.replace_extension("bin"); + dumpMetaDataMapToBinary(shared_context_.shared_weights.metadata, metadata_name.string()); + } + return status; } diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index d35dc5513ed1d..95d7027fd70e3 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -52,7 +52,7 @@ static std::vector split(const std::string& s, char delim) { // Logical device representation. class OpenVINOExecutionProvider : public IExecutionProvider { public: - explicit OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext* shared_context = nullptr); + explicit OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext& shared_context); ~OpenVINOExecutionProvider() = default; std::vector> @@ -76,7 +76,7 @@ class OpenVINOExecutionProvider : public IExecutionProvider { #endif private: SessionContext session_context_; - SharedContext* shared_context_{nullptr}; + SharedContext& shared_context_; std::list backend_managers_; // EP session owns the backend objects EPCtxHandler ep_ctx_handle_; }; diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 2028979c1c87d..7b0d6c6751120 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -14,7 +14,7 @@ namespace onnxruntime { namespace openvino_ep { struct OpenVINOProviderFactory : IExecutionProviderFactory { - OpenVINOProviderFactory(ProviderInfo provider_info, SharedContext* shared_context) + OpenVINOProviderFactory(ProviderInfo provider_info, SharedContext& 
shared_context) : provider_info_(provider_info), shared_context_(shared_context) {} ~OpenVINOProviderFactory() override {} @@ -23,7 +23,7 @@ struct OpenVINOProviderFactory : IExecutionProviderFactory { private: ProviderInfo provider_info_; - SharedContext* shared_context_; + SharedContext& shared_context_; }; std::unique_ptr OpenVINOProviderFactory::CreateProvider() { @@ -282,9 +282,14 @@ struct OpenVINO_Provider : Provider { pi.cache_dir = std::move(so_context_file_path); } - SharedContext* shared_context = pi.so_share_ep_contexts ? &shared_context_ : nullptr; + // Append values to config to support weight-as-inputs conversion for shared contexts + if (pi.so_share_ep_contexts) { + ov::AnyMap map; + map["NPU_COMPILATION_MODE_PARAMS"] = "enable-wd-blockarg-input=true compute-layers-with-higher-precision=Sqrt,Power,ReduceSum"; + pi.load_config["NPU"] = map; + } - return std::make_shared(pi, shared_context); + return std::make_shared(pi, shared_context_); } void Initialize() override { diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 7c1e850b0b7a0..2ba6e03dd4d8e 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -56,7 +56,7 @@ static NodeArg& ProcessNodeUnitIO(onnxruntime::Graph& dst_graph, std::set& initializers_to_keep, const NodeUnitIODef& io_def) { const std::string& name = io_def.node_arg.Name(); - const ONNX_NAMESPACE::TypeProto* orig_type_proto = io_def.node_arg.TypeAsProto(); + const auto* orig_type_proto = io_def.node_arg.TypeAsProto(); // Handle quantized input or output. Convert to float type. 
if (io_def.quant_param.has_value()) { @@ -68,11 +68,11 @@ static NodeArg& ProcessNodeUnitIO(onnxruntime::Graph& dst_graph, ORT_ENFORCE(tensor_proto_iter != src_initializers.end(), "Unable to find scale initializer ", scale_initializer_name); - const ONNX_NAMESPACE::TensorProto* scale_tensor_proto = tensor_proto_iter->second; + const auto* scale_tensor_proto = tensor_proto_iter->second; int32_t float_type = scale_tensor_proto->data_type(); // Noe set the arg type to the float type of scale. Could be one of float/float16/bfloat16 - std::unique_ptr type_proto = ONNX_NAMESPACE::TypeProto::Create(); + auto type_proto = ONNX_NAMESPACE::TypeProto::Create(); type_proto->copy_from(orig_type_proto); type_proto->mutable_tensor_type()->set_elem_type(float_type); @@ -457,7 +457,7 @@ static void AddStandaloneNodeUnit(onnxruntime::Graph& dst_graph, const onnxrunti if (duplicate_dq && GetQDQDataType(&node_unit.GetNode()) != DT_UINT16 && GetQDQDataType(&node_unit.GetNode()) != DT_INT16) { std::string orig_dq_name = node_unit.Outputs()[0].node_arg.Name(); // ex: dql_output/duplicated - std::unique_ptr type_proto = ONNX_NAMESPACE::TypeProto::Create(); + auto type_proto = ONNX_NAMESPACE::TypeProto::Create(); type_proto->copy_from(node_unit.Inputs()[0].node_arg.TypeAsProto()); type_proto->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); orig_dq_name.erase(orig_dq_name.find(DuplicateDQ), std::string::npos); // ex: dql_output @@ -625,93 +625,93 @@ static void AddQDQNodeUnit(onnxruntime::Graph& dst_graph, KeepInitsInDstGraph(initializers_to_keep, src_graph, &target_node); } -static void AddInitializerAsInput (onnxruntime::Graph& dst_graph, +static void AddInitializerAsInput(onnxruntime::Graph& dst_graph, InlinedVector& accumulated_inputs, const onnxruntime::GraphViewer& src_graph, const std::string& initializer_name) { - // Get the initializer from source graph - const auto& src_initializers = src_graph.GetAllInitializedTensors(); - auto init_iter = 
src_initializers.find(initializer_name); + // Get the initializer from source graph + const auto& src_initializers = src_graph.GetAllInitializedTensors(); + auto init_iter = src_initializers.find(initializer_name); - if (init_iter == src_initializers.end()) { - // Initializer not found - return; - } + if (init_iter == src_initializers.end()) { + // Initializer not found + return; + } - const ONNX_NAMESPACE::TensorProto* tensor_proto = init_iter->second; + const auto* tensor_proto = init_iter->second; - // Create TypeProto for the initializer - std::unique_ptr type_proto = ONNX_NAMESPACE::TypeProto::Create(); - auto* tensor_type = type_proto->mutable_tensor_type(); - tensor_type->set_elem_type(tensor_proto->data_type()); + // Create TypeProto for the initializer + auto type_proto = ONNX_NAMESPACE::TypeProto::Create(); + auto* tensor_type = type_proto->mutable_tensor_type(); + tensor_type->set_elem_type(tensor_proto->data_type()); - for (int i = 0; i < tensor_proto->dims_size(); ++i) { - tensor_type->mutable_shape()->add_dim()->set_dim_value(tensor_proto->dims().Get(i)); - } + for (int i = 0; i < tensor_proto->dims_size(); ++i) { + tensor_type->mutable_shape()->add_dim()->set_dim_value(tensor_proto->dims().Get(i)); + } - // Create NodeArg for the initializer - auto& input_arg = dst_graph.GetOrCreateNodeArg(initializer_name, type_proto.get()); + // Create NodeArg for the initializer + auto& input_arg = dst_graph.GetOrCreateNodeArg(initializer_name, type_proto.get()); - // Check if input already exists in accumulated inputs - bool input_exists = false; - for (const auto* existing_input : accumulated_inputs) { - if (existing_input->Name() == initializer_name) { - input_exists = true; - break; - } + // Check if input already exists in accumulated inputs + bool input_exists = false; + for (const auto* existing_input : accumulated_inputs) { + if (existing_input->Name() == initializer_name) { + input_exists = true; + break; } + } - if (!input_exists) { - // Add to 
accumulated inputs - accumulated_inputs.push_back(&input_arg); - } + if (!input_exists) { + // Add to accumulated inputs + accumulated_inputs.push_back(&input_arg); + } } -bool writeString(std::ofstream& outfile, const std::string& str) { - size_t size = str.size(); - outfile.write(reinterpret_cast(&size), sizeof(size)); - if (!outfile.good()) return false; +template +bool writeScalar(std::ofstream& outfile, const T& scalar) { + auto size = sizeof(T); + outfile.write(reinterpret_cast(&size), sizeof(size)); + if (!outfile.good()) return false; - outfile.write(str.c_str(), size); - return outfile.good(); + outfile.write(reinterpret_cast(&scalar), size); + return outfile.good(); } -bool writeStringVector(std::ofstream& outfile, const std::vector& vec) { - size_t size = vec.size(); - outfile.write(reinterpret_cast(&size), sizeof(size)); - if (!outfile.good()) return false; +template <> +bool writeScalar(std::ofstream& outfile, const std::string& text) { + auto size = text.size() * sizeof(std::string::value_type); + outfile.write(reinterpret_cast(&size), size); + if (!outfile.good()) return false; - for (const auto& str : vec) { - if (!writeString(outfile, str)) { - return false; - } - } - return true; + outfile.write(text.data(), size); + return outfile.good(); } // Main function to dump the map to a binary file -bool dumpMetaDataMapToBinary(const std::unordered_map>& map, const std::string& filename) { - +bool dumpMetaDataMapToBinary(const sw::Metadata::Map& metadata, const std::string& filename) { std::ofstream outfile(filename, std::ios::binary); if (!outfile.is_open()) { - ORT_THROW("Error: Could not open file for writing metadata."); - return false; + ORT_THROW("Error: Could not open file for writing metadata."); + return false; } // Write the size of the map - size_t map_size = map.size(); + size_t map_size = metadata.size(); outfile.write(reinterpret_cast(&map_size), sizeof(map_size)); if (!outfile.good()) { - ORT_THROW("Error: Failed to write map size."); - 
return false; + ORT_THROW("Error: Failed to write map size."); + return false; } // Write each key-value pair - for (const auto& pair : map) { - if (!writeString(outfile, pair.first) || !writeStringVector(outfile, pair.second)) { - ORT_THROW("Error: Failed to write map data."); - return false; - } + for (const auto& [key, value] : metadata) { + bool result = true; + result &= writeScalar(outfile, key.name); + result &= writeScalar(outfile, value.location); + result &= writeScalar(outfile, value.data_offset); + result &= writeScalar(outfile, value.size); + + ORT_ENFORCE(result, "Error: Failed to write map data."); } return true; @@ -721,7 +721,8 @@ bool dumpMetaDataMapToBinary(const std::unordered_map& model) { + /*out*/ std::unique_ptr& model, + /*out*/ sw& shared_weights) { // NOTE: This function is a re-implementation of GraphViewerToProto() in core/graph/graph_proto_serializer.cc // with the following differences: // - Uses onnxruntime::Graph APIs instead of onnx::GraphProto APIs. @@ -819,7 +820,6 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, seen_node_units.insert(node_unit); } - // Copy initializers to dst graph. 
std::unordered_set current_scope_initializer_set; @@ -834,97 +834,94 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, std::sort(const_inits.begin(), const_inits.end()); // initialize map for creating metadata for initilizers with external weights - std::unordered_map> metadata_map; + auto& metadata = shared_weights.metadata; + + const auto& insert_metadata = [&metadata](const std::string& name, ONNX_NAMESPACE::StringStringEntryProtos* entry_protos) { + // key: [name], value: [location, offset, length] + sw::Metadata::Map::key_type key{name}; + sw::Metadata::Map::mapped_type value{}; + + for (int i = 0; i < entry_protos->size(); i++) { + auto& string_entry_proto{entry_protos->at(i)}; + const auto& pb_key{*(string_entry_proto.mutable_key())}; + const auto& pb_value{*(string_entry_proto.mutable_value())}; + if (pb_key == "location") { + value.location = pb_value; + } else if (pb_key == "offset") { + value.data_offset = std::stoul(pb_value); + } else if (pb_key == "length") { + value.size = std::stoul(pb_value); + } + } + metadata.emplace(key, value); + }; // metadata structure: initializer_name as key // and [location, offset, length] as value for (auto& it : const_inits) { - const auto* initializer_tensor = initializers.at(it); - - // Check if the initializer has external data - if (initializer_tensor->has_data_location() && - initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && - enable_ovep_weight_sharing) { + const auto* initializer_tensor = initializers.at(it); - // Cast away const to access mutable_external_data - struct ONNX_NAMESPACE::TensorProto* non_const_initializer_tensor = const_cast(initializer_tensor); + // Check if the initializer has external data + if (initializer_tensor->has_data_location() && + initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && + enable_ovep_weight_sharing) { + // Cast away const to access mutable_external_data + auto* 
non_const_initializer_tensor = const_cast(initializer_tensor); - // get meta data about the initilizers with external data - struct ONNX_NAMESPACE::StringStringEntryProtos* external_data = non_const_initializer_tensor->mutable_external_data(); + // get meta data about the initilizers with external data + auto* external_data = non_const_initializer_tensor->mutable_external_data(); - std::vector init_info; - // init_info structure: [location, offset, length] + insert_metadata(initializer_tensor->name(), external_data); - for (int i = 0 ; i < external_data->size() ; i++) { - init_info.push_back(*external_data->at(i).mutable_value()); - } + // Add initializer with external data as input + AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, it); - metadata_map.emplace(initializer_tensor->name(), init_info); - // Add initializer with external data as input - AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, it); - - } else { - // Add as an initialized tensor if it does not have external data - if (initializers_to_keep.count(it)) - dst_graph.AddInitializedTensor(*(initializers.at(it))); - - } + } else { + // Add as an initialized tensor if it does not have external data + if (initializers_to_keep.count(it)) + dst_graph.AddInitializedTensor(*(initializers.at(it))); + } - current_scope_initializer_set.insert(it); + current_scope_initializer_set.insert(it); } // Handle outer-scope constant initializers for (auto& node_idx : src_graph.GetNodesInTopologicalOrder()) { - const auto& node = src_graph.GetNode(node_idx); - for (const auto& input : node->InputDefs()) { - if (current_scope_initializer_set.find(input->Name()) != current_scope_initializer_set.end()) { - continue; - } - - if (src_graph.IsConstantInitializer(input->Name(), true)) { - const auto* initializer_tensor = src_graph.GetConstantInitializer(input->Name(), true); - // Check if the initializer has external data - if (initializer_tensor->has_data_location() && - 
initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && - enable_ovep_weight_sharing) { - - // Cast away const to access mutable_external_data - struct ONNX_NAMESPACE::TensorProto* non_const_initializer_tensor = const_cast(initializer_tensor); - - // get meta data about the initilizers with external data - struct ONNX_NAMESPACE::StringStringEntryProtos* external_data = non_const_initializer_tensor->mutable_external_data(); + const auto& node = src_graph.GetNode(node_idx); + for (const auto& input : node->InputDefs()) { + if (current_scope_initializer_set.find(input->Name()) != current_scope_initializer_set.end()) { + continue; + } - std::vector init_info; - for (int i = 0 ; i < external_data->size() ; i++) { - init_info.push_back(*external_data->at(i).mutable_value()); - } + if (src_graph.IsConstantInitializer(input->Name(), true)) { + const auto* initializer_tensor = src_graph.GetConstantInitializer(input->Name(), true); + // Check if the initializer has external data + if (initializer_tensor->has_data_location() && + initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && + enable_ovep_weight_sharing) { + // Cast away const to access mutable_external_data + auto* non_const_initializer_tensor = const_cast(initializer_tensor); - metadata_map.emplace(initializer_tensor->name(), init_info); + // get meta data about the initilizers with external data + auto* external_data = non_const_initializer_tensor->mutable_external_data(); - // Add initializer as input if it has external data - AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, input->Name()); + insert_metadata(initializer_tensor->name(), external_data); - } else { - // Add as an initialized tensor if it does not have external data - if (initializers_to_keep.count(input->Name())) { - dst_graph.AddInitializedTensor(*(src_graph.GetConstantInitializer(input->Name(), true))); - } - } + // Add initializer as input if it has external 
data + AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, input->Name()); - current_scope_initializer_set.insert(input->Name()); + } else { + // Add as an initialized tensor if it does not have external data + if (initializers_to_keep.count(input->Name())) { + dst_graph.AddInitializedTensor(*(src_graph.GetConstantInitializer(input->Name(), true))); } + } + + current_scope_initializer_set.insert(input->Name()); } - } - if (enable_ovep_weight_sharing) { - // creating bin file of metadata_map and dumping the bin file - if (dumpMetaDataMapToBinary(metadata_map, "metadata.bin")) { - LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Metadata for external initializer dumped."; - } else { - ORT_THROW("Error: Unable to write metadat to file."); } } - accumulated_inputs.insert(accumulated_inputs.end(), dst_graph_inputs.begin(), dst_graph_inputs.end()); // Set all inputs (original inputs amnd initializers as inputs) of the destination Graph diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h index 5b777a388adda..02831525cba32 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h @@ -5,15 +5,20 @@ #include #include "core/providers/shared_library/provider_api.h" +#include "core/providers/openvino/contexts.h" namespace onnxruntime { namespace openvino_ep { +using sw = SharedContext::SharedWeights; + // Creates a new model without the DQ/Q operators in the src graph as per pre-defined rulesets Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, bool enable_ovep_weight_sharing, - /*out*/ std::unique_ptr& model); + /*out*/ std::unique_ptr& model, + /*out*/ sw& shared_weights); +bool dumpMetaDataMapToBinary(const sw::Metadata::Map& shared_weights, const std::string& filename); } // namespace openvino_ep } // namespace 
onnxruntime From 01ac259a838732fae0cb5125fa5cbed9c61f95a2 Mon Sep 17 00:00:00 2001 From: ankitm3k Date: Mon, 13 Jan 2025 12:21:17 +0530 Subject: [PATCH 14/35] fix: fix provider options --- .../providers/openvino/openvino_execution_provider.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 684a2c64237b8..d02a642699a82 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -69,11 +69,11 @@ void AdjustProviderInfo(ProviderInfo& info) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" << "No runtime device selection option provided."; #if defined OPENVINO_CONFIG_CPU - device_type_ = "CPU"; - precision_ = "FP32"; + info.device_type = "CPU"; + info.precision = "FP32"; #elif defined OPENVINO_CONFIG_GPU - device_type_ = "GPU"; - precision_ = "FP16"; + info.device_type = "GPU"; + info.precision = "FP16"; #elif defined OPENVINO_CONFIG_NPU info.device_type = "NPU"; info.precision = "FP16"; From db075cd1475356c1e54cf37480e7ba089b80fd3d Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Mon, 13 Jan 2025 08:23:04 -0800 Subject: [PATCH 15/35] create ov tensor from meta data and external data --- .../qdq_transformations/qdq_stripping.cc | 70 ++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 2ba6e03dd4d8e..79529a8586be0 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -717,6 +717,68 @@ bool dumpMetaDataMapToBinary(const sw::Metadata::Map& metadata, const std::strin return true; } +// Helper function to read binary data from a 
file +std::vector readBinaryData(const std::string& filePath, size_t offset, size_t length) { + std::vector data(length / sizeof(float), 0); + std::ifstream file(filePath, std::ios::binary); + if (!file) { + throw std::runtime_error("Failed to open file: " + filePath); + } + + file.seekg(offset, std::ios::beg); + file.read(reinterpret_cast(data.data()), length); + + if (!file) { + throw std::runtime_error("Error reading from file: " + filePath); + } + return data; +} + +// Function to handle tensor creation from external data +void CreateOVTensor(const ONNX_NAMESPACE::TensorProto* initializer_tensor, + onnxruntime::openvino_ep::SharedContext::SharedWeights::Metadata::Map& metadata_map) { + + for (auto itr: metadata_map) { + if (initializer_tensor->name() == itr.first.name) { + std::string filePath = itr.second.location; + std::uint32_t offset = itr.second.data_offset; + std::uint32_t length = itr.second.size; + + // Read binary data + auto rawData = readBinaryData(filePath, offset, length); + + // Get dimensions + std::vector shape; + for (auto itt = 0 ; itt < initializer_tensor->dims().size() ; itt++) { + shape.push_back(initializer_tensor->dims()[itt]); + } + + // Create OpenVINO Tensor + ov::element::Type elementType = ov::element::f32; + ov::Tensor tensor(elementType, shape, rawData.data()); + } + } +} + +ov::element::Type GetOpenVINOElementType(int onnx_data_type) { + switch (onnx_data_type) { + case 1: return ov::element::f32; // FLOAT + case 2: return ov::element::u8; // UINT8 + case 3: return ov::element::i8; // INT8 + case 4: return ov::element::u16; // UINT16 + case 5: return ov::element::i16; // INT16 + case 6: return ov::element::i32; // INT32 + case 7: return ov::element::i64; // INT64 + case 9: return ov::element::boolean; // BOOL + case 10: return ov::element::f16; // FLOAT16 + case 11: return ov::element::f64; // DOUBLE + case 12: return ov::element::u32; // UINT32 + case 13: return ov::element::u64; // UINT64 + default: + throw 
std::runtime_error("Unsupported ONNX data type: " + std::to_string(onnx_data_type)); + } +} + // Creates a new model without the DQ/Q operators in the src graph. Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, @@ -858,7 +920,7 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, }; // metadata structure: initializer_name as key // and [location, offset, length] as value - + std::cout << typeid(metadata).name(); for (auto& it : const_inits) { const auto* initializer_tensor = initializers.at(it); @@ -866,6 +928,10 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, if (initializer_tensor->has_data_location() && initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && enable_ovep_weight_sharing) { + + int onnx_data_type = initializer_tensor->data_type(); // Get ONNX data type + ov::element::Type elementType = GetOpenVINOElementType(onnx_data_type); // Map to OpenVINO data type + // Cast away const to access mutable_external_data auto* non_const_initializer_tensor = const_cast(initializer_tensor); @@ -877,6 +943,8 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, // Add initializer with external data as input AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, it); + // Create OV tensor based on external data and metadata + CreateOVTensor(initializer_tensor, metadata); } else { // Add as an initialized tensor if it does not have external data if (initializers_to_keep.count(it)) From 8209162949715e54acb672b696da61d8b0467030 Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Mon, 13 Jan 2025 08:25:48 -0800 Subject: [PATCH 16/35] create ov tensor --- .../core/providers/openvino/qdq_transformations/qdq_stripping.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 
79529a8586be0..4bdc72f643018 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -920,7 +920,6 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, }; // metadata structure: initializer_name as key // and [location, offset, length] as value - std::cout << typeid(metadata).name(); for (auto& it : const_inits) { const auto* initializer_tensor = initializers.at(it); From 89ebe8d6765ede635514d6560c0cd011496ff13f Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Tue, 14 Jan 2025 21:53:02 -0800 Subject: [PATCH 17/35] Add support for binding weight as input tensors --- .../providers/openvino/backend_manager.cc | 23 ++++ .../core/providers/openvino/backend_utils.cc | 92 +++++++++++++- .../core/providers/openvino/backend_utils.h | 4 + .../openvino/backends/backend_factory.cc | 3 +- .../openvino/backends/basic_backend.cc | 21 +++- .../openvino/backends/basic_backend.h | 6 +- .../core/providers/openvino/contexts.h | 23 +++- .../core/providers/openvino/ibackend.h | 1 + .../core/providers/openvino/ov_interface.cc | 17 ++- .../core/providers/openvino/ov_interface.h | 4 +- .../qdq_transformations/qdq_stripping.cc | 115 ++++-------------- 11 files changed, 202 insertions(+), 107 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index c06e00272a8c8..d9ef25cefbf59 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -104,6 +104,24 @@ BackendManager::BackendManager(SessionContext& session_context, } std::string device_type = session_context_.device_type; + auto& sw = shared_context_.shared_weights; + if (session_context_.so_share_ep_contexts) { + std::filesystem::path weight_filename = session_context_.cache_dir.parent_path(); + if (sw.external_weight_filename.empty()) + { + 
sw.external_weight_filename = sw.metadata.begin()->second.location; + } + weight_filename /= sw.external_weight_filename; + std::ifstream weight_file(weight_filename); + + if (weight_file) { + if (!sw.mapped_weights) { + sw.mapped_weights = std::make_unique(weight_filename); + } + backend_utils::CreateOVTensors(sw.metadata, sw.mapped_weights->weight_data); + } + } + if (ModelHasSymbolicInputDims(subgraph)) { subgraph_context_.has_dynamic_input_shape = true; LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims"; @@ -116,6 +134,7 @@ BackendManager::BackendManager(SessionContext& session_context, concrete_backend_ = BackendFactory::MakeBackend(model_proto, session_context_, subgraph_context_, + shared_context_, model_stream); } catch (std::string const& msg) { ORT_THROW(msg); @@ -139,6 +158,7 @@ BackendManager::BackendManager(SessionContext& session_context, concrete_backend_ = BackendFactory::MakeBackend(model_proto, session_context_, subgraph_context_, + shared_context_, model_stream); } catch (const OnnxRuntimeException& ex) { std::string exception_str = ex.what(); @@ -158,6 +178,7 @@ BackendManager::BackendManager(SessionContext& session_context, concrete_backend_ = BackendFactory::MakeBackend(model_proto, session_context_, subgraph_context_, + shared_context_, model_stream); } catch (std::string const& msg) { ORT_THROW(msg); @@ -489,6 +510,7 @@ void BackendManager::Compute(OrtKernelContext* context) { dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes, session_context_, subgraph_context_, + shared_context_, model_stream); } catch (const OnnxRuntimeException& ex) { // Build option disables fallback to CPU on compilation failures with NPU. 
@@ -508,6 +530,7 @@ void BackendManager::Compute(OrtKernelContext* context) { dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes, session_context_, subgraph_context_, + shared_context_, model_stream); } catch (std::string const& msg) { ORT_THROW(msg); diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 4adf9f5b89833..440b2e9bc5019 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -12,10 +12,46 @@ #include "core/providers/openvino/backend_utils.h" #include "core/providers/openvino/ov_interface.h" +#include "Windows.h" + using Exception = ov::Exception; namespace onnxruntime { namespace openvino_ep { + +SharedContext::SharedWeights::MappedWeights::MappedWeights(std::filesystem::path filename) { + file_ = CreateFile(filename.string().data(), + GENERIC_READ, + FILE_SHARE_READ, + 0, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, + 0); + ORT_ENFORCE(file_ != nullptr, "Unable to open weight file at ", filename.string()); + + mapping_ = CreateFileMapping(file_, 0, PAGE_READONLY, 0, 0, 0); + ORT_ENFORCE(mapping_ != nullptr, "Unable to create mapping of weight file at ", filename.string()); + + const char* raw_data = static_cast(MapViewOfFile(mapping_, FILE_MAP_READ, 0, 0, 0)); + ORT_ENFORCE(raw_data != nullptr, "Unable to map weight file at ", filename.string()); + + weight_data = std::string_view(raw_data, std::filesystem::file_size(filename)); +} + +SharedContext::SharedWeights::MappedWeights::~MappedWeights() { + if (!weight_data.empty()) { + UnmapViewOfFile(weight_data.data()); + } + if (mapping_ != nullptr) { + CloseHandle(mapping_); + mapping_ = nullptr; + } + if (file_ != nullptr) { + CloseHandle(file_); + file_ = nullptr; + } +} + namespace backend_utils { bool IsDebugEnabled() { @@ -34,11 +70,6 @@ bool IsCILogEnabled() { return false; } -struct static_cast_int64 { - template // T1 models type 
statically convertible to T - int64_t operator()(const T1& x) const { return static_cast(x); } -}; - std::shared_ptr CreateOVModel(const std::string model, const SessionContext& session_context, @@ -268,6 +299,57 @@ void printPerformanceCounts(OVInferRequestPtr request, std::ostream& stream, std printPerformanceCounts(performanceMap, stream, std::move(deviceName)); } +ov::element::Type GetOpenVINOElementType(ONNX_NAMESPACE::TensorProto_DataType dt) { + static std::unordered_map map{ + {ONNX_NAMESPACE::TensorProto_DataType_FLOAT, ov::element::f32}, + {ONNX_NAMESPACE::TensorProto_DataType_UINT8, ov::element::u8}, + {ONNX_NAMESPACE::TensorProto_DataType_INT8, ov::element::i8}, + {ONNX_NAMESPACE::TensorProto_DataType_UINT16, ov::element::u16}, + {ONNX_NAMESPACE::TensorProto_DataType_INT16, ov::element::i16}, + {ONNX_NAMESPACE::TensorProto_DataType_INT32, ov::element::i32}, + {ONNX_NAMESPACE::TensorProto_DataType_INT64, ov::element::i64}, + {ONNX_NAMESPACE::TensorProto_DataType_STRING, ov::element::string}, + {ONNX_NAMESPACE::TensorProto_DataType_BOOL, ov::element::boolean}, + {ONNX_NAMESPACE::TensorProto_DataType_FLOAT16, ov::element::f16}, + {ONNX_NAMESPACE::TensorProto_DataType_DOUBLE, ov::element::f64}, + {ONNX_NAMESPACE::TensorProto_DataType_UINT32, ov::element::u32}, + {ONNX_NAMESPACE::TensorProto_DataType_UINT64, ov::element::u64}, + //{ONNX_NAMESPACE::TensorProto_DataType_COMPLEX64, ov::element::undefined}, + //{ONNX_NAMESPACE::TensorProto_DataType_COMPLEX128, ov::element::undefined}, + {ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16, ov::element::bf16}, + //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E4M3FN, ov::element::undefined}, + //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E4M3FNUZ, ov::element::undefined}, + {ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2, ov::element::f8e5m2}, + //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2FNUZ, ov::element::undefined}, + {ONNX_NAMESPACE::TensorProto_DataType_UINT4, ov::element::u4}, + 
{ONNX_NAMESPACE::TensorProto_DataType_INT4, ov::element::i4}, + }; + + if (auto result = map.find(dt); result != map.end()) { + return result->second; + } else { + throw std::runtime_error("Unsupported ONNX data type: " + std::to_string(dt)); + } +} + +// Function to handle tensor creation from external data +void CreateOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map, std::string_view weights) { + for (auto& [key, value] : metadata_map) { + if (value.tensor) continue; + + // Get tensor data + const auto* tensor_data = weights.data() + value.data_offset; + + // Get element data type + auto onnx_element_type = (ONNX_NAMESPACE::TensorProto_DataType)value.element_type; + ov::element::Type ov_elementType = GetOpenVINOElementType(onnx_element_type); // Map to OpenVINO data type + + // Create OpenVINO Tensor + value.tensor = std::make_shared(ov_elementType, value.dimensions, (void*)tensor_data); + ORT_ENFORCE(value.tensor->get_byte_size() == value.size, "Unexpected tensor size mismatch"); + } +} + } // namespace backend_utils } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 0d7378072cb1b..a2e16f5dbbfa9 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -10,6 +10,7 @@ #include #include #include +#include #include "core/session/onnxruntime_cxx_api.h" #include "core/providers/openvino/contexts.h" @@ -66,6 +67,9 @@ CreateOVModel(const std::string model, const SubGraphContext& subgraph_context, std::map>& const_outputs_map); +void CreateOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map, + std::string_view weights); + void printPerformanceCounts(const std::vector& performanceMap, std::ostream& stream, std::string deviceName); diff --git a/onnxruntime/core/providers/openvino/backends/backend_factory.cc 
b/onnxruntime/core/providers/openvino/backends/backend_factory.cc index 2fd9a7fa0a537..fedc3f21c8e33 100644 --- a/onnxruntime/core/providers/openvino/backends/backend_factory.cc +++ b/onnxruntime/core/providers/openvino/backends/backend_factory.cc @@ -14,6 +14,7 @@ std::shared_ptr BackendFactory::MakeBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, + SharedContext &shared_context, ptr_stream_t& model_stream) { std::string type = session_context.device_type; if (type == "CPU" || type.find("GPU") != std::string::npos || @@ -23,7 +24,7 @@ BackendFactory::MakeBackend(std::unique_ptr& model_p type.find("AUTO") != std::string::npos) { std::shared_ptr concrete_backend_; try { - concrete_backend_ = std::make_shared(model_proto, session_context, subgraph_context, model_stream); + concrete_backend_ = std::make_shared(model_proto, session_context, subgraph_context, shared_context, model_stream); } catch (std::string const& msg) { ORT_THROW(msg); } diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index fb0fdc9b5e85b..6202f9cd95f85 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -23,8 +23,9 @@ using namespace backend_utils; BasicBackend::BasicBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, + SharedContext& shared_context, ptr_stream_t& model_stream) - : session_context_(session_context), subgraph_context_(subgraph_context) { + : session_context_{session_context}, subgraph_context_{subgraph_context}, shared_context_{shared_context} { std::string& hw_target = session_context_.device_type; if (ValidateSubgraph(const_outputs_map_)) @@ -123,8 +124,24 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr } catch (const char* msg) { ORT_THROW(msg); } + int num_infer_req = 
(session_context_.num_of_threads > 0) ? session_context_.num_of_threads : 1; - inferRequestsQueue_ = std::unique_ptr(new InferRequestsQueue(exe_network_, num_infer_req)); + std::function initializer = [](OVInferRequestPtr) {}; + auto metadata = shared_context_.shared_weights.metadata; + if (session_context_.so_share_ep_contexts) { + initializer = [&metadata](OVInferRequestPtr ir_ptr) { + const auto input_count = ir_ptr->GetNumInputs(); + for (auto i = 0; i < input_count; i++) { + using Key = SharedContext::SharedWeights::Metadata::Key; + const auto tensor_key = Key{ir_ptr->GetInputTensorName(i)}; + if (metadata.contains(tensor_key)) { + auto& value = metadata.at(tensor_key); + ir_ptr->SetTensor(tensor_key.name, value.tensor); + } + } + }; + } + inferRequestsQueue_ = std::unique_ptr(new InferRequestsQueue(exe_network_, num_infer_req, initializer)); } bool BasicBackend::ValidateSubgraph(std::map>& const_outputs_map) { diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 177784a71f575..22bcc4c1da40e 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -12,6 +12,7 @@ #include #include #include +#include #include "core/session/onnxruntime_cxx_api.h" #include "core/providers/openvino/contexts.h" @@ -32,6 +33,7 @@ class BasicBackend : public IBackend { BasicBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, + SharedContext& shared_context, ptr_stream_t& model_stream); void Infer(OrtKernelContext* context) override; @@ -57,6 +59,7 @@ class BasicBackend : public IBackend { SessionContext& session_context_; SubGraphContext subgraph_context_; + SharedContext& shared_context_; mutable std::mutex compute_lock_; OVExeNetwork exe_network_; std::map> const_outputs_map_; @@ -71,10 +74,11 @@ class BasicBackend : public IBackend { class 
InferRequestsQueue { public: - InferRequestsQueue(OVExeNetwork& net, size_t nireq) { + InferRequestsQueue(OVExeNetwork& net, size_t nireq, std::function initializer) { OVInferRequestPtr infer_request; for (size_t id = 0; id < nireq; id++) { infer_request = std::make_shared(net.CreateInferRequest()); + initializer(infer_request); infer_requests_.push_back(infer_request); } } diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index f96fec345eef1..68c0ecf87004b 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include "core/common/common.h" #include "core/providers/openvino/ov_interface.h" namespace onnxruntime { @@ -31,12 +33,29 @@ struct SharedContext { std::string location; unsigned int data_offset; unsigned int size; - ov::Tensor* tensor; + std::vector dimensions; + std::int32_t element_type; + std::shared_ptr tensor; }; using Map = std::unordered_map; }; - Metadata::Map metadata; + + struct MappedWeights { + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(MappedWeights); + ~MappedWeights(); + MappedWeights() = delete; + explicit MappedWeights(std::filesystem::path filename); + + std::string_view weight_data; + + private: + void* file_; + void* mapping_; + }; + fs::path external_weight_filename; + std::unique_ptr mapped_weights; + Metadata::Map metadata; } shared_weights; }; diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 0d440eee598d3..dfa669aace875 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -24,6 +24,7 @@ class BackendFactory { MakeBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, + SharedContext &shared_context, ptr_stream_t& model_stream); }; diff --git 
a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 804db5b726fc5..5b853539c31ea 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -197,7 +197,18 @@ OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { } } -void OVInferRequest::SetTensor(std::string name, OVTensorPtr& blob) { +std::string OVInferRequest::GetInputTensorName(uint32_t index) { + try { + const auto &model = ovInfReq.get_compiled_model(); + return *model.input(index).get_names().begin(); + } catch (const Exception& e) { + ORT_THROW(log_tag + " Cannot access IE Blob for input number: ", index, e.what()); + } catch (...) { + ORT_THROW(log_tag + " Cannot access IE Blob for input number: ", index); + } +} + +void OVInferRequest::SetTensor(const std::string &name, OVTensorPtr& blob) { try { ovInfReq.set_tensor(name, *(blob.get())); } catch (const Exception& e) { @@ -207,6 +218,10 @@ void OVInferRequest::SetTensor(std::string name, OVTensorPtr& blob) { } } +uint32_t OVInferRequest::GetNumInputs() { + return ovInfReq.get_compiled_model().inputs().size(); +} + void OVInferRequest::StartAsync() { try { ovInfReq.start_async(); diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 550c7962cca13..5d88994dbabb0 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -86,8 +86,10 @@ class OVInferRequest { ov::InferRequest ovInfReq; public: + uint32_t GetNumInputs(); OVTensorPtr GetTensor(const std::string& name); - void SetTensor(std::string name, OVTensorPtr& blob); + std::string GetInputTensorName(uint32_t index); + void SetTensor(const std::string& name, OVTensorPtr& blob); void StartAsync(); void Infer(); void WaitRequest(); diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc 
b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 4bdc72f643018..019e121b4f575 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -717,68 +717,6 @@ bool dumpMetaDataMapToBinary(const sw::Metadata::Map& metadata, const std::strin return true; } -// Helper function to read binary data from a file -std::vector readBinaryData(const std::string& filePath, size_t offset, size_t length) { - std::vector data(length / sizeof(float), 0); - std::ifstream file(filePath, std::ios::binary); - if (!file) { - throw std::runtime_error("Failed to open file: " + filePath); - } - - file.seekg(offset, std::ios::beg); - file.read(reinterpret_cast(data.data()), length); - - if (!file) { - throw std::runtime_error("Error reading from file: " + filePath); - } - return data; -} - -// Function to handle tensor creation from external data -void CreateOVTensor(const ONNX_NAMESPACE::TensorProto* initializer_tensor, - onnxruntime::openvino_ep::SharedContext::SharedWeights::Metadata::Map& metadata_map) { - - for (auto itr: metadata_map) { - if (initializer_tensor->name() == itr.first.name) { - std::string filePath = itr.second.location; - std::uint32_t offset = itr.second.data_offset; - std::uint32_t length = itr.second.size; - - // Read binary data - auto rawData = readBinaryData(filePath, offset, length); - - // Get dimensions - std::vector shape; - for (auto itt = 0 ; itt < initializer_tensor->dims().size() ; itt++) { - shape.push_back(initializer_tensor->dims()[itt]); - } - - // Create OpenVINO Tensor - ov::element::Type elementType = ov::element::f32; - ov::Tensor tensor(elementType, shape, rawData.data()); - } - } -} - -ov::element::Type GetOpenVINOElementType(int onnx_data_type) { - switch (onnx_data_type) { - case 1: return ov::element::f32; // FLOAT - case 2: return ov::element::u8; // UINT8 - case 3: return ov::element::i8; // INT8 - case 4: 
return ov::element::u16; // UINT16 - case 5: return ov::element::i16; // INT16 - case 6: return ov::element::i32; // INT32 - case 7: return ov::element::i64; // INT64 - case 9: return ov::element::boolean; // BOOL - case 10: return ov::element::f16; // FLOAT16 - case 11: return ov::element::f64; // DOUBLE - case 12: return ov::element::u32; // UINT32 - case 13: return ov::element::u64; // UINT64 - default: - throw std::runtime_error("Unsupported ONNX data type: " + std::to_string(onnx_data_type)); - } -} - // Creates a new model without the DQ/Q operators in the src graph. Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, @@ -898,11 +836,13 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, // initialize map for creating metadata for initilizers with external weights auto& metadata = shared_weights.metadata; - const auto& insert_metadata = [&metadata](const std::string& name, ONNX_NAMESPACE::StringStringEntryProtos* entry_protos) { - // key: [name], value: [location, offset, length] - sw::Metadata::Map::key_type key{name}; + const auto& insert_metadata = [&metadata](const ONNX_NAMESPACE::TensorProto& proto) { + sw::Metadata::Map::key_type key{proto.name()}; sw::Metadata::Map::mapped_type value{}; + using mutable_proto_t = ONNX_NAMESPACE::TensorProto*; + auto& mutable_proto = *const_cast(&proto); + auto* entry_protos = mutable_proto.mutable_external_data(); for (int i = 0; i < entry_protos->size(); i++) { auto& string_entry_proto{entry_protos->at(i)}; const auto& pb_key{*(string_entry_proto.mutable_key())}; @@ -915,35 +855,28 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, value.size = std::stoul(pb_value); } } + value.element_type = proto.data_type(); + value.dimensions.resize(proto.dims_size()); + for (uint32_t index = 0; auto& dim : value.dimensions) { + dim = proto.dims()[index++]; + } - metadata.emplace(key, value); + metadata.emplace(key, std::move(value)); }; - // 
metadata structure: initializer_name as key - // and [location, offset, length] as value + + // Handle constant initializers for (auto& it : const_inits) { - const auto* initializer_tensor = initializers.at(it); + const auto& initializer_tensor = *initializers.at(it); // Check if the initializer has external data - if (initializer_tensor->has_data_location() && - initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && + if (initializer_tensor.has_data_location() && + initializer_tensor.data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && enable_ovep_weight_sharing) { - - int onnx_data_type = initializer_tensor->data_type(); // Get ONNX data type - ov::element::Type elementType = GetOpenVINOElementType(onnx_data_type); // Map to OpenVINO data type - - // Cast away const to access mutable_external_data - auto* non_const_initializer_tensor = const_cast(initializer_tensor); - - // get meta data about the initilizers with external data - auto* external_data = non_const_initializer_tensor->mutable_external_data(); - - insert_metadata(initializer_tensor->name(), external_data); + insert_metadata(initializer_tensor); // Add initializer with external data as input AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, it); - // Create OV tensor based on external data and metadata - CreateOVTensor(initializer_tensor, metadata); } else { // Add as an initialized tensor if it does not have external data if (initializers_to_keep.count(it)) @@ -962,18 +895,12 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, } if (src_graph.IsConstantInitializer(input->Name(), true)) { - const auto* initializer_tensor = src_graph.GetConstantInitializer(input->Name(), true); + const auto& initializer_tensor = *src_graph.GetConstantInitializer(input->Name(), true); // Check if the initializer has external data - if (initializer_tensor->has_data_location() && - initializer_tensor->data_location() == 
ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && + if (initializer_tensor.has_data_location() && + initializer_tensor.data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && enable_ovep_weight_sharing) { - // Cast away const to access mutable_external_data - auto* non_const_initializer_tensor = const_cast(initializer_tensor); - - // get meta data about the initilizers with external data - auto* external_data = non_const_initializer_tensor->mutable_external_data(); - - insert_metadata(initializer_tensor->name(), external_data); + insert_metadata(initializer_tensor); // Add initializer as input if it has external data AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, input->Name()); From ae408afe0e09bbf341095d09442ebe8a0015fb89 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Wed, 15 Jan 2025 21:14:03 -0800 Subject: [PATCH 18/35] Fix for mapping subgraph to ov compiled network arguments --- .../providers/openvino/backend_manager.cc | 35 +++++----------- .../core/providers/openvino/backend_utils.cc | 4 +- .../core/providers/openvino/backend_utils.h | 4 +- .../openvino/backends/backend_factory.cc | 2 +- .../openvino/backends/basic_backend.cc | 41 ++++++++----------- .../core/providers/openvino/contexts.h | 6 +-- .../core/providers/openvino/ibackend.h | 2 +- .../core/providers/openvino/ov_interface.cc | 4 +- 8 files changed, 39 insertions(+), 59 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index d9ef25cefbf59..d1564836cb247 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -71,30 +71,16 @@ BackendManager::BackendManager(SessionContext& session_context, // Save the indexes of graph inputs among fused_node's inputDefs // (which also contains initializers). 
- auto node_input_defs = fused_node.InputDefs(); - int i = 0; - for (auto idef : node_input_defs) { - subgraph_context_.input_names.insert({idef->Name(), i}); - i++; + for (uint32_t index = 0; const auto& node : subgraph.GetInputs()) { + subgraph_context_.input_names.insert({node->Name(), index++}); } - const std::vector& graph_inputs = subgraph.GetInputs(); - for (auto input : graph_inputs) { - auto it = subgraph_context_.input_names.find(input->Name()); - if (it == subgraph_context_.input_names.end()) { - ORT_THROW("Input not found in the input defs list"); - } - int index = it->second; - subgraph_context_.input_indexes.push_back(index); + for (uint32_t index = 0; const auto& node : subgraph.GetOutputs()) { + subgraph_context_.output_names.insert({node->Name(), index++}); } - auto graph_outputs_defs = fused_node.OutputDefs(); - i = 0; - for (auto output_def : graph_outputs_defs) { - subgraph_context_.output_names.insert({output_def->Name(), i}); - i++; - } subgraph_context_.subgraph_name = fused_node.Name(); + ptr_stream_t model_stream; std::unique_ptr model_proto; if (subgraph_context_.is_ep_ctx_graph) { @@ -107,8 +93,7 @@ BackendManager::BackendManager(SessionContext& session_context, auto& sw = shared_context_.shared_weights; if (session_context_.so_share_ep_contexts) { std::filesystem::path weight_filename = session_context_.cache_dir.parent_path(); - if (sw.external_weight_filename.empty()) - { + if (sw.external_weight_filename.empty()) { sw.external_weight_filename = sw.metadata.begin()->second.location; } weight_filename /= sw.external_weight_filename; @@ -276,8 +261,8 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie bool BackendManager::ModelHasBatchedInputs(const ONNX_NAMESPACE::ModelProto& model_proto) const { bool has_batched_inputs = true; - for (int i = 0; i < static_cast(subgraph_context_.input_indexes.size()); i++) { - auto& input = model_proto.graph().input(subgraph_context_.input_indexes[i]); + for (const auto& 
[name, index] : subgraph_context_.input_names) { + auto& input = model_proto.graph().input(index); // Batch-process only raw image inputs (NCHW or NHWC layouts) auto& shape = input.type().tensor_type().shape(); @@ -291,8 +276,8 @@ bool BackendManager::ModelHasBatchedInputs(const ONNX_NAMESPACE::ModelProto& mod break; } - for (int index = 1; index < 4; index++) { - if (shape.dim(index).value_case() != shape.dim(0).kDimValue) { + for (int dim_index = 1; dim_index < 4; dim_index++) { + if (shape.dim(dim_index).value_case() != shape.dim(0).kDimValue) { has_batched_inputs = false; break; } diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 440b2e9bc5019..e37254b34b9fd 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -114,7 +114,7 @@ Ort::UnownedValue GetOutputTensor(Ort::KernelContext& context, size_t batch_size, OVInferRequestPtr infer_request, std::string output_name, - std::unordered_map output_names) { + const SubGraphContext::string_index_map_t& output_names) { auto graph_output_blob = infer_request->GetTensor(output_name); auto graph_output_dims = graph_output_blob->get_shape(); @@ -139,7 +139,7 @@ GetOutputTensor(Ort::KernelContext& context, size_t batch_size, Ort::UnownedValue GetOutputTensor(Ort::KernelContext& context, std::string output_name, - std::unordered_map output_names, + const SubGraphContext::string_index_map_t& output_names, std::shared_ptr node) { // Find position of '/' in the output_name int pos = output_name.find("/"); diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index a2e16f5dbbfa9..cfb6adc8fbd3d 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -45,14 +45,14 @@ void FillOutputHelper(Ort::UnownedValue& out_tensor, std::shared_ptr n 
Ort::UnownedValue GetOutputTensor(Ort::KernelContext& context, std::string output_name, - std::unordered_map output_names, + const SubGraphContext::string_index_map_t& output_names, std::shared_ptr node); Ort::UnownedValue GetOutputTensor(Ort::KernelContext& context, size_t batch_size, OVInferRequestPtr infer_request, std::string output_name, - std::unordered_map output_names); + const SubGraphContext::string_index_map_t& output_names); void FillInputBlob(OVTensorPtr inputBlob, size_t batch_slice_idx, std::string input_name, Ort::KernelContext& context, diff --git a/onnxruntime/core/providers/openvino/backends/backend_factory.cc b/onnxruntime/core/providers/openvino/backends/backend_factory.cc index fedc3f21c8e33..99955da539ae7 100644 --- a/onnxruntime/core/providers/openvino/backends/backend_factory.cc +++ b/onnxruntime/core/providers/openvino/backends/backend_factory.cc @@ -14,7 +14,7 @@ std::shared_ptr BackendFactory::MakeBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, - SharedContext &shared_context, + SharedContext& shared_context, ptr_stream_t& model_stream) { std::string type = session_context.device_type; if (type == "CPU" || type.find("GPU") != std::string::npos || diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 6202f9cd95f85..a8026d710827b 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -131,7 +131,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr if (session_context_.so_share_ep_contexts) { initializer = [&metadata](OVInferRequestPtr ir_ptr) { const auto input_count = ir_ptr->GetNumInputs(); - for (auto i = 0; i < input_count; i++) { + for (auto i = 0u; i < input_count; i++) { using Key = SharedContext::SharedWeights::Metadata::Key; const auto tensor_key = Key{ir_ptr->GetInputTensorName(i)}; 
if (metadata.contains(tensor_key)) { @@ -357,28 +357,23 @@ void BasicBackend::SetNumThreads(ov::AnyMap& device_config) { // an Infer Request indexed by infer_req_idx void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) { try { - auto graph_input_info = exe_network_.Get().inputs(); - int input_idx = 0; - for (auto input_info_iter = graph_input_info.begin(); - input_info_iter != graph_input_info.end(); ++input_info_iter) { - auto input_names = input_info_iter->get_names(); - std::string onnx_input_name; - std::string input_name; - // use names retrieved from original ONNX model to assign the right onnx input name for the graph - for (auto it = subgraph_context_.input_names.begin(); it != subgraph_context_.input_names.end(); ++it) { - if (it->second == input_idx) { - onnx_input_name = it->first; + auto ov_input_info = exe_network_.Get().inputs(); + + // Loop over subgraph original input names to find the correspondent OV input name + for (const auto& [onnx_input_name, onnx_input_index] : subgraph_context_.input_names) { + std::string input_name{}; + uint32_t input_idx = 0; + for (uint32_t index = 0; const auto& ov_input : ov_input_info) { + if (ov_input.get_names().contains(onnx_input_name)) { + input_name = onnx_input_name; + input_idx = index; break; } + index++; } - // using the input name retrieved from ONNX original to match with the input names returned by OV tensors - if (input_names.find(onnx_input_name) != input_names.end()) { - input_name = std::move(onnx_input_name); - } else { - ORT_THROW(log_tag + - "Input names mismatch between OpenVINO and ONNX. " + onnx_input_name + + ORT_ENFORCE(!input_name.empty(), log_tag, + "Input names mismatch between OpenVINO and ONNX. 
", onnx_input_name, " doesn't exist in the list of OpenVINO input tensor names"); - } size_t batch_slice_idx = 0; if (subgraph_context_.has_dynamic_input_shape && !session_context_.disable_dynamic_shapes && @@ -395,7 +390,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque input_tensor_shape[tensor_iter] = *i; tensor_iter += 1; } - const auto& input = graph_input_info.at(input_idx); + const auto& input = ov_input_info.at(input_idx); OVTensorPtr tensor_ptr; // avoid input copies on the CPU device if (session_context_.device_type.find("CPU") != std::string::npos) { @@ -428,7 +423,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque if ((it == ort_ov_tensor_map.end()) || (it != ort_ov_tensor_map.end() && (it->second.ort_ptr != tensor.GetTensorRawData()))) { ov_tensor_data_t ov_tensor_data; - const auto& input = graph_input_info.at(input_idx); + const auto& input = ov_input_info.at(input_idx); ov_tensor_data.tensor_ptr = std::make_shared(input.get_element_type(), input.get_shape(), const_cast(tensor.GetTensorRawData())); @@ -443,8 +438,8 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque } } } - input_idx++; - } + } // Loop subgraph original input names + if (session_context_.device_type.find("NPU") != std::string::npos) { // Set the output blob as remote blob auto graph_output_info = exe_network_.Get().outputs(); diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 68c0ecf87004b..1e8b8fb1127ce 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -112,15 +112,15 @@ struct SessionContext : ProviderInfo { // Holds context specific to subgraph. 
struct SubGraphContext { + using string_index_map_t = std::unordered_map; bool has_dynamic_input_shape = false; bool enable_batching = false; bool set_npu_config = false; bool is_constant = false; void* context = 0; std::string subgraph_name; - std::vector input_indexes; - std::unordered_map input_names; - std::unordered_map output_names; + string_index_map_t input_names; + string_index_map_t output_names; bool is_wholly_supported_graph = false; bool has_external_weights = false; std::string model_precision; diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index dfa669aace875..d2f91cacb6c4d 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -24,7 +24,7 @@ class BackendFactory { MakeBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, - SharedContext &shared_context, + SharedContext& shared_context, ptr_stream_t& model_stream); }; diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 5b853539c31ea..9b0e9c94c0f6e 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -199,7 +199,7 @@ OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { std::string OVInferRequest::GetInputTensorName(uint32_t index) { try { - const auto &model = ovInfReq.get_compiled_model(); + const auto& model = ovInfReq.get_compiled_model(); return *model.input(index).get_names().begin(); } catch (const Exception& e) { ORT_THROW(log_tag + " Cannot access IE Blob for input number: ", index, e.what()); @@ -208,7 +208,7 @@ std::string OVInferRequest::GetInputTensorName(uint32_t index) { } } -void OVInferRequest::SetTensor(const std::string &name, OVTensorPtr& blob) { +void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) { try { 
ovInfReq.set_tensor(name, *(blob.get())); } catch (const Exception& e) { From ac9c998c2fbded89f597cb5506a52579ad251688 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Thu, 16 Jan 2025 13:51:20 -0800 Subject: [PATCH 19/35] Fix for using so_share_ep_contexts without ep.context* flags --- .../providers/openvino/backend_manager.cc | 5 +++-- .../core/providers/openvino/contexts.h | 1 - .../openvino/openvino_execution_provider.cc | 19 ++++++++++++++----- .../openvino/openvino_provider_factory.cc | 2 +- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index d1564836cb247..589bee61e5200 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -92,8 +92,9 @@ BackendManager::BackendManager(SessionContext& session_context, auto& sw = shared_context_.shared_weights; if (session_context_.so_share_ep_contexts) { - std::filesystem::path weight_filename = session_context_.cache_dir.parent_path(); - if (sw.external_weight_filename.empty()) { + std::filesystem::path weight_filename = session_context_.onnx_model_path_name.parent_path(); + if (sw.external_weight_filename.empty() && !sw.metadata.empty()) { + // Reasonable assumption that all metadata entries have the same external file location sw.external_weight_filename = sw.metadata.begin()->second.location; } weight_filename /= sw.external_weight_filename; diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 1e8b8fb1127ce..7945f96c51138 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -102,7 +102,6 @@ struct SessionContext : ProviderInfo { OVCore ie_core; std::vector deviceAvailableList = {true, true, true, true, true, true, true, true}; - std::string onnx_model_name; std::filesystem::path 
onnx_model_path_name; int onnx_opset_version; bool use_api_2; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index d02a642699a82..a53bcf5cdbf6f 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -174,10 +174,13 @@ common::Status OpenVINOExecutionProvider::Compile( auto& logger = *GetLogger(); Status status = Status::OK(); - // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext - session_context_.onnx_model_path_name = fused_nodes[0].filtered_graph.get().ModelPath().string(); - session_context_.onnx_opset_version = - fused_nodes[0].filtered_graph.get().DomainToVersionMap().at(kOnnxDomain); + if (!fused_nodes.empty()) { + // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext + const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get(); + session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string(); + session_context_.onnx_opset_version = + graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain); + } struct OpenVINOEPFunctionState { AllocateFunc allocate_func = nullptr; @@ -242,10 +245,16 @@ common::Status OpenVINOExecutionProvider::Compile( } if (session_context_.so_share_ep_contexts && session_context_.so_context_enable && !session_context_.cache_dir.empty()) { + std::filesystem::path metadata_name = session_context_.cache_dir.parent_path(); + + // If cache_dir hasn't been set use the model path to dump files + if (metadata_name.empty()) { + metadata_name = session_context_.onnx_model_path_name.parent_path(); + } + // Metadata is generated only for shared contexts // If metadata is generated then only save it if also saving epcontext (so_context_enable) // If saving metadata then save it to the provided path - std::filesystem::path 
metadata_name = session_context_.cache_dir.parent_path(); metadata_name /= session_context_.cache_dir.stem().string() + "_metadata"; metadata_name.replace_extension("bin"); dumpMetaDataMapToBinary(shared_context_.shared_weights.metadata, metadata_name.string()); diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 7b0d6c6751120..06187573a7346 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -279,7 +279,7 @@ struct OpenVINO_Provider : Provider { std::string so_context_file_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").data(); if (pi.so_context_enable && !so_context_file_path.empty()) { - pi.cache_dir = std::move(so_context_file_path); + pi.cache_dir = so_context_file_path; } // Append values to config to support weight-as-inputs conversion for shared contexts From 6512ec634673ea222c41a6e7c5fec6917db91567 Mon Sep 17 00:00:00 2001 From: "Javier E. 
Martinez" Date: Thu, 16 Jan 2025 16:41:00 -0800 Subject: [PATCH 20/35] Add remote tensor support for NPU weight sharing --- .../providers/openvino/backend_manager.cc | 2 +- .../core/providers/openvino/backend_utils.cc | 20 +++++++++++++++++-- .../core/providers/openvino/backend_utils.h | 4 +++- .../core/providers/openvino/contexts.h | 4 ++-- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 589bee61e5200..d7c55ad1ac84c 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -104,7 +104,7 @@ BackendManager::BackendManager(SessionContext& session_context, if (!sw.mapped_weights) { sw.mapped_weights = std::make_unique(weight_filename); } - backend_utils::CreateOVTensors(sw.metadata, sw.mapped_weights->weight_data); + backend_utils::CreateOVTensors(session_context_.device_type, session_context_.ie_core, sw.metadata, sw.mapped_weights->weight_data); } } diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index e37254b34b9fd..576718c10481a 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -8,6 +8,7 @@ #include "openvino/pass/convert_fp32_to_fp16.hpp" #include "openvino/pass/constant_folding.hpp" +#include "openvino/runtime/intel_npu/level_zero/level_zero.hpp" #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/backend_utils.h" #include "core/providers/openvino/ov_interface.h" @@ -333,7 +334,10 @@ ov::element::Type GetOpenVINOElementType(ONNX_NAMESPACE::TensorProto_DataType dt } // Function to handle tensor creation from external data -void CreateOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map, std::string_view weights) { +void CreateOVTensors(const std::string& 
device_name, + OVCore& ov_core, + SharedContext::SharedWeights::Metadata::Map& metadata_map, + std::string_view weights) { for (auto& [key, value] : metadata_map) { if (value.tensor) continue; @@ -342,10 +346,22 @@ void CreateOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map, // Get element data type auto onnx_element_type = (ONNX_NAMESPACE::TensorProto_DataType)value.element_type; + ov::element::Type ov_elementType = GetOpenVINOElementType(onnx_element_type); // Map to OpenVINO data type // Create OpenVINO Tensor - value.tensor = std::make_shared(ov_elementType, value.dimensions, (void*)tensor_data); + if (device_name == "NPU") { + // Use remote tensors + auto npu_context = ov_core.Get().get_default_context("NPU").as(); + auto&& remote_tensor = npu_context.create_host_tensor(ov_elementType, value.dimensions); + + // Copy data to remote tensor + std::memcpy(remote_tensor.data(), (void*)tensor_data, value.size); + value.tensor = std::make_shared(remote_tensor); + } else { + // Use vanilla tensors + value.tensor = std::make_shared(ov_elementType, value.dimensions, (void*)tensor_data); + } ORT_ENFORCE(value.tensor->get_byte_size() == value.size, "Unexpected tensor size mismatch"); } } diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index cfb6adc8fbd3d..4fb54507ad31c 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -67,7 +67,9 @@ CreateOVModel(const std::string model, const SubGraphContext& subgraph_context, std::map>& const_outputs_map); -void CreateOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map, +void CreateOVTensors(const std::string& device_name, + OVCore& ov_core, + SharedContext::SharedWeights::Metadata::Map& metadata_map, std::string_view weights); void printPerformanceCounts(const std::vector& performanceMap, diff --git a/onnxruntime/core/providers/openvino/contexts.h 
b/onnxruntime/core/providers/openvino/contexts.h index 7945f96c51138..d7b76f2a9e0de 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -24,7 +24,7 @@ struct SharedContext { std::string name; bool operator==(const Key&) const = default; }; - struct KeyHash { + struct Hash { std::size_t operator()(const Key& key) const noexcept { return std::hash()(key.name); } @@ -37,7 +37,7 @@ struct SharedContext { std::int32_t element_type; std::shared_ptr tensor; }; - using Map = std::unordered_map; + using Map = std::unordered_map; }; struct MappedWeights { From 5594817209d247ed150daa176738aee8829723fb Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Thu, 16 Jan 2025 20:46:38 -0800 Subject: [PATCH 21/35] Use a single ov::Core copy across OVEP --- .../providers/openvino/backend_manager.cc | 2 +- .../core/providers/openvino/backend_utils.cc | 5 +- .../core/providers/openvino/backend_utils.h | 1 - .../openvino/backends/basic_backend.cc | 28 +++++----- .../core/providers/openvino/contexts.h | 1 - .../openvino/openvino_execution_provider.cc | 9 ++-- .../openvino/openvino_execution_provider.h | 7 --- .../openvino/openvino_provider_factory.cc | 6 +-- .../core/providers/openvino/ov_interface.cc | 21 +++++--- .../core/providers/openvino/ov_interface.h | 52 +++++++++---------- 10 files changed, 61 insertions(+), 71 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index d7c55ad1ac84c..5a4bf791b4760 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -104,7 +104,7 @@ BackendManager::BackendManager(SessionContext& session_context, if (!sw.mapped_weights) { sw.mapped_weights = std::make_unique(weight_filename); } - backend_utils::CreateOVTensors(session_context_.device_type, session_context_.ie_core, sw.metadata, sw.mapped_weights->weight_data); + 
backend_utils::CreateOVTensors(session_context_.device_type, sw.metadata, sw.mapped_weights->weight_data); } } diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 576718c10481a..05084fe8f838d 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -80,7 +80,7 @@ CreateOVModel(const std::string model, std::cout << "CreateNgraphFunc" << std::endl; } try { - auto ov_model = session_context.ie_core.ReadModel(model, session_context.onnx_model_path_name.string()); + auto ov_model = OVCore::ReadModel(model, session_context.onnx_model_path_name.string()); // Check for Constant Folding if ((session_context.device_type != "NPU") && !subgraph_context.is_wholly_supported_graph) { @@ -335,7 +335,6 @@ ov::element::Type GetOpenVINOElementType(ONNX_NAMESPACE::TensorProto_DataType dt // Function to handle tensor creation from external data void CreateOVTensors(const std::string& device_name, - OVCore& ov_core, SharedContext::SharedWeights::Metadata::Map& metadata_map, std::string_view weights) { for (auto& [key, value] : metadata_map) { @@ -352,7 +351,7 @@ void CreateOVTensors(const std::string& device_name, // Create OpenVINO Tensor if (device_name == "NPU") { // Use remote tensors - auto npu_context = ov_core.Get().get_default_context("NPU").as(); + auto npu_context = OVCore::Get().get_default_context("NPU").as(); auto&& remote_tensor = npu_context.create_host_tensor(ov_elementType, value.dimensions); // Copy data to remote tensor diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 4fb54507ad31c..e27a6e277a1a3 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -68,7 +68,6 @@ CreateOVModel(const std::string model, std::map>& const_outputs_map); void CreateOVTensors(const std::string& 
device_name, - OVCore& ov_core, SharedContext::SharedWeights::Metadata::Map& metadata_map, std::string_view weights); diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index a8026d710827b..6f8ab00956fef 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -62,9 +62,9 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr // Pre-requisite is provider_option "context" must be set #if defined(IO_BUFFER_ENABLED) cl_context ctx = static_cast(session_context_.context); - remote_context_ = new ov::intel_gpu::ocl::ClContext(session_context_.ie_core.Get(), ctx); + remote_context_ = new ov::intel_gpu::ocl::ClContext(OVCore::Get(), ctx); if (subgraph_context_.is_ep_ctx_graph) { - exe_network_ = session_context_.ie_core.ImportModel(*model_stream, + exe_network_ = OVCore::ImportModel(*model_stream, remote_context_, subgraph_context_.subgraph_name); model_stream.reset(); // Delete stream after it is no longer needed @@ -78,7 +78,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr ov_model = CreateOVModel(model, session_context_, subgraph_context_, const_outputs_map_); } LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled"; - exe_network_ = session_context_.ie_core.CompileModel( + exe_network_ = OVCore::CompileModel( ov_model, remote_context_, subgraph_context_.subgraph_name); } #else // !IO_BUFFER_ENABLED @@ -88,7 +88,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr if (subgraph_context_.is_ep_ctx_graph) { // If the blob is held in an EPContext node, then skip FE+Compile // and directly move on to creating a backend with the executable blob - exe_network_ = session_context_.ie_core.ImportModel(*model_stream, + exe_network_ = OVCore::ImportModel(*model_stream, hw_target, device_config, subgraph_context_.subgraph_name); @@ -102,12 +102,12 @@ 
BasicBackend::BasicBackend(std::unique_ptr& model_pr // Inputs with static dimenstions // Not enabled for models with external weights and when ep context is set. const std::string model = model_proto->SerializeAsString(); - exe_network_ = session_context_.ie_core.CompileModel(model, + exe_network_ = OVCore::CompileModel(model, hw_target, device_config, subgraph_context_.subgraph_name); - } else { // For all other types use ov::core read_model() to generate OV IR - // followed by ov::core compile_model() + } else { // For all other types use ov::ov_core read_model() to generate OV IR + // followed by ov::ov_core compile_model() std::shared_ptr ov_model; { const std::string model = model_proto->SerializeAsString(); @@ -116,7 +116,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr } ov_model = CreateOVModel(model, session_context_, subgraph_context_, const_outputs_map_); } - exe_network_ = session_context_.ie_core.CompileModel( + exe_network_ = OVCore::CompileModel( ov_model, hw_target, device_config, subgraph_context_.subgraph_name); } #endif @@ -196,7 +196,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { device_config.emplace(ov::device::properties("NPU", device_property)); #if (((OPENVINO_VERSION_MAJOR == 2024) && (OPENVINO_VERSION_MINOR > 3)) || (OPENVINO_VERSION_MAJOR > 2024)) if (session_context_.so_context_enable) { - session_context_.ie_core.Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true)); + OVCore::Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true)); } #endif } @@ -264,7 +264,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { continue; } if (is_supported_and_mutable(key, supported_properties)) { - session_context_.ie_core.Get().set_property(device, ov::AnyMap{{key, value}}); + OVCore::Get().set_property(device, ov::AnyMap{{key, value}}); } else { LOGS_DEFAULT(WARNING) << "WARNING: Property \"" << key << "\" is either unsupported in current OpenVINO version" @@ -284,14 
+284,14 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { for (const std::string& device : individual_devices) { if (target_config.count(device)) { // Get supported properties for each individual device - auto device_properties = session_context_.ie_core.Get().get_property(device, ov::supported_properties); + auto device_properties = OVCore::Get().get_property(device, ov::supported_properties); // Set properties for the device set_target_properties(device, target_config.at(device), device_properties); } } } else { if (target_config.count(session_context_.device_type)) { - auto supported_properties = session_context_.ie_core.Get().get_property(session_context_.device_type, + auto supported_properties = OVCore::Get().get_property(session_context_.device_type, ov::supported_properties); set_target_properties(session_context_.device_type, target_config.at(session_context_.device_type), supported_properties); @@ -311,7 +311,7 @@ void BasicBackend::EnableCaching(ov::AnyMap& device_config) { device_property = std::make_pair("CACHE_DIR", session_context_.cache_dir); device_config.emplace(ov::device::properties("GPU", device_property)); } else { - session_context_.ie_core.SetCache(session_context_.cache_dir.string()); + OVCore::SetCache(session_context_.cache_dir.string()); } } } @@ -343,7 +343,7 @@ void BasicBackend::EnableStreams() { } // Do nothing } else { - session_context_.ie_core.SetStreams(session_context_.device_type, session_context_.num_streams); + OVCore::SetStreams(session_context_.device_type, session_context_.num_streams); } } diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index d7b76f2a9e0de..dc6f87520bfd3 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -100,7 +100,6 @@ struct ProviderInfo { struct SessionContext : ProviderInfo { SessionContext(const ProviderInfo& info) : ProviderInfo{info} {} - OVCore 
ie_core; std::vector deviceAvailableList = {true, true, true, true, true, true, true, true}; std::filesystem::path onnx_model_path_name; int onnx_opset_version; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index a53bcf5cdbf6f..3fd6a70e2b7aa 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -56,8 +56,7 @@ void AdjustProviderInfo(ProviderInfo& info) { std::set ov_supported_device_types = {"CPU", "GPU", "GPU.0", "GPU.1", "NPU"}; - OVDevices devices; - std::vector available_devices = devices.get_ov_devices(); + std::vector available_devices = OVCore::GetAvailableDevices(); for (auto& device : available_devices) { if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) { @@ -112,10 +111,10 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const ProviderInfo& info, S InitProviderOrtApi(); // to check if target device is available - // using ie_core capability GetAvailableDevices to fetch list of devices plugged in + // using OVCore capability GetAvailableDevices to fetch list of devices plugged in if (info.cache_dir.empty()) { bool device_found = false; - std::vector available_devices = session_context_.ie_core.GetAvailableDevices(); + std::vector available_devices = OVCore::GetAvailableDevices(); // Checking for device_type configuration if (info.device_type != "") { if (info.device_type.find("HETERO") != std::string::npos || @@ -269,7 +268,7 @@ std::vector OpenVINOExecutionProvider::CreatePreferredAllocators() AllocatorCreationInfo npu_allocator_info{ [this](OrtDevice::DeviceId device_id) { return std::make_unique( - session_context_.ie_core.Get(), + OVCore::Get(), OrtDevice::NPU, device_id, OpenVINO_RT_NPU); diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h 
b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 95d7027fd70e3..294f4d6db54a4 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -18,13 +18,6 @@ namespace onnxruntime { namespace openvino_ep { -struct OVDevices { - ov::Core core; - std::vector get_ov_devices() const { - return core.get_available_devices(); - } -}; - static void print_build_options() { std::cout << "[ERROR] INVALID DEVICE BUILD TYPE SPECIFIED" << std::endl; std::cout << "Specify the keyword HETERO (or) MULTI (or) AUTO followed by the devices in the order of priority " diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 06187573a7346..9bf3e8b040406 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -32,8 +32,7 @@ std::unique_ptr OpenVINOProviderFactory::CreateProvider() { struct ProviderInfo_OpenVINO_Impl : ProviderInfo_OpenVINO { std::vector GetAvailableDevices() const override { - openvino_ep::OVCore ie_core; - return ie_core.GetAvailableDevices(); + return OVCore::GetAvailableDevices(); } } g_info; @@ -58,8 +57,7 @@ struct OpenVINO_Provider : Provider { std::set deprecated_device_types = {"CPU_FP32", "GPU_FP32", "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", "GPU.0_FP16", "GPU.1_FP16"}; - OVDevices devices; - std::vector available_devices = devices.get_ov_devices(); + std::vector available_devices = OVCore::GetAvailableDevices(); for (auto& device : available_devices) { if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) { diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 9b0e9c94c0f6e..6ce2d506211e7 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ 
b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -13,7 +13,8 @@ using Exception = ov::Exception; namespace onnxruntime { namespace openvino_ep { -const std::string log_tag = "[OpenVINO-EP] "; +static const std::string log_tag = "[OpenVINO-EP] "; +static ov::Core g_core; #ifndef NDEBUG void printDebugInfo(const ov::CompiledModel& obj) { @@ -46,7 +47,7 @@ void printDebugInfo(const ov::CompiledModel& obj) { } #endif -std::shared_ptr OVCore::ReadModel(const std::string& model, const std::string& model_path) const { +std::shared_ptr OVCore::ReadModel(const std::string& model, const std::string& model_path) { try { std::istringstream modelStringStream(model); std::istream& modelStream = modelStringStream; @@ -77,7 +78,7 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_netwo const std::string& name) { ov::CompiledModel obj; try { - obj = oe.compile_model(ie_cnn_network, hw_target, device_config); + obj = g_core.compile_model(ie_cnn_network, hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -96,7 +97,7 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, const std::string& name) { ov::CompiledModel obj; try { - obj = oe.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); + obj = g_core.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -115,7 +116,7 @@ OVExeNetwork OVCore::ImportModel(std::istream& model_stream, std::string name) { try { ov::CompiledModel obj; - obj = oe.import_model(model_stream, hw_target, device_config); + obj = g_core.import_model(model_stream, hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -129,7 +130,11 @@ OVExeNetwork OVCore::ImportModel(std::istream& model_stream, } void OVCore::SetCache(const std::string& cache_dir_path) { - oe.set_property(ov::cache_dir(cache_dir_path)); + g_core.set_property(ov::cache_dir(cache_dir_path)); +} + +ov::Core& OVCore::Get() { + return g_core; } #ifdef 
IO_BUFFER_ENABLED @@ -165,12 +170,12 @@ OVExeNetwork OVCore::ImportModel(std::shared_ptr model_strea #endif std::vector OVCore::GetAvailableDevices() { - auto available_devices = oe.get_available_devices(); + auto available_devices = g_core.get_available_devices(); return available_devices; } void OVCore::SetStreams(const std::string& device_type, int num_streams) { - oe.set_property(device_type, {ov::num_streams(num_streams)}); + g_core.set_property(device_type, {ov::num_streams(num_streams)}); } OVInferRequest OVExeNetwork::CreateInferRequest() { diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 5d88994dbabb0..a2547ada60f34 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -37,39 +37,37 @@ typedef ov::intel_gpu::ocl::ClContext* OVRemoteContextPtr; typedef ov::RemoteContext OVRemoteContext; #endif -class OVCore { - ov::Core oe; - - public: +struct OVCore { // OV Interface For Reading Model - std::shared_ptr ReadModel(const std::string& model_stream, const std::string& model_path) const; + static std::shared_ptr ReadModel(const std::string& model_stream, const std::string& model_path); + // OV Interface for Compiling OV Model Type - OVExeNetwork CompileModel(std::shared_ptr& ie_cnn_network, - std::string& hw_target, - ov::AnyMap& device_config, - const std::string& name); + static OVExeNetwork CompileModel(std::shared_ptr& ie_cnn_network, + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name); // OV Interface for Fast Compile - OVExeNetwork CompileModel(const std::string& onnx_model, - std::string& hw_target, - ov::AnyMap& device_config, - const std::string& name); + static OVExeNetwork CompileModel(const std::string& onnx_model, + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name); // OV Interface for Import model Stream - OVExeNetwork ImportModel(std::istream& 
model_stream, - std::string hw_target, - const ov::AnyMap& device_config, - std::string name); + static OVExeNetwork ImportModel(std::istream& model_stream, + std::string hw_target, + const ov::AnyMap& device_config, + std::string name); #ifdef IO_BUFFER_ENABLED - OVExeNetwork CompileModel(std::shared_ptr& model, - OVRemoteContextPtr context, - std::string name); - OVExeNetwork ImportModel(std::shared_ptr model_stream, - OVRemoteContextPtr context, - std::string name); + static OVExeNetwork CompileModel(std::shared_ptr& model, + OVRemoteContextPtr context, + std::string name); + static OVExeNetwork ImportModel(std::shared_ptr model_stream, + OVRemoteContextPtr context, + std::string name); #endif - std::vector GetAvailableDevices(); - void SetCache(const std::string& cache_dir_path); - ov::Core& Get() { return oe; } - void SetStreams(const std::string& device_type, int num_streams); + static std::vector GetAvailableDevices(); + static void SetCache(const std::string& cache_dir_path); + static ov::Core& Get(); + static void SetStreams(const std::string& device_type, int num_streams); }; class OVExeNetwork { From f85c7b5c9dca9c9170c60473b8367e7aed909fc6 Mon Sep 17 00:00:00 2001 From: "Javier E. 
Martinez" Date: Thu, 16 Jan 2025 21:18:35 -0800 Subject: [PATCH 22/35] Decouple provider option cache_dir from session option ep.context_file_path --- .../core/providers/openvino/backend_manager.cc | 9 +++------ onnxruntime/core/providers/openvino/contexts.h | 1 + .../openvino/openvino_execution_provider.cc | 16 +++++++--------- .../openvino/openvino_provider_factory.cc | 6 +----- 4 files changed, 12 insertions(+), 20 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 5a4bf791b4760..858d7fb3f0298 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -232,12 +232,9 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie } } else { // External blob - std::filesystem::path blob_filename; - // Epctx file path from SO is mapped to cache_dir variable for OVEP for readability - if (!session_context_.cache_dir.empty()) { - blob_filename = session_context_.cache_dir; - } else { - blob_filename = graph_body_viewer.ModelPath(); + std::filesystem::path blob_filename = session_context_.so_context_file_path; + if (blob_filename.empty()) { + blob_filename = session_context_.onnx_model_path_name; } const auto name{std::format("{}_{}", graph_body_viewer.ModelPath().stem().string(), subgraph_context_.subgraph_name)}; blob_filename = blob_filename.parent_path() / name; diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index dc6f87520bfd3..4f3b22236ae0f 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -94,6 +94,7 @@ struct ProviderInfo { bool so_disable_cpu_ep_fallback{false}; // ORT session option bool so_context_embed_mode{false}; // ORT session option bool so_share_ep_contexts{false}; // ORT session option + fs::path so_context_file_path{}; // ORT session option 
}; // Holds context applicable to the entire EP instance. diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 3fd6a70e2b7aa..bc06e2e8e9a70 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -243,19 +243,17 @@ common::Status OpenVINOExecutionProvider::Compile( } } - if (session_context_.so_share_ep_contexts && session_context_.so_context_enable && !session_context_.cache_dir.empty()) { - std::filesystem::path metadata_name = session_context_.cache_dir.parent_path(); - - // If cache_dir hasn't been set use the model path to dump files + if (session_context_.so_share_ep_contexts) { + auto metadata_name = session_context_.so_context_file_path.parent_path(); if (metadata_name.empty()) { - metadata_name = session_context_.onnx_model_path_name.parent_path(); + metadata_name = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; + } else { + metadata_name /= metadata_name.stem().string() + "_metadata"; + metadata_name.replace_extension("bin"); } // Metadata is generated only for shared contexts - // If metadata is generated then only save it if also saving epcontext (so_context_enable) - // If saving metadata then save it to the provided path - metadata_name /= session_context_.cache_dir.stem().string() + "_metadata"; - metadata_name.replace_extension("bin"); + // If saving metadata then save it to the provided path or use the original model path dumpMetaDataMapToBinary(shared_context_.shared_weights.metadata, metadata_name.string()); } diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 9bf3e8b040406..86804b8961cac 100644 --- 
b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -274,11 +274,7 @@ struct OpenVINO_Provider : Provider { pi.so_context_enable = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; pi.so_context_embed_mode = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; pi.so_share_ep_contexts = config_options.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; - std::string so_context_file_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").data(); - - if (pi.so_context_enable && !so_context_file_path.empty()) { - pi.cache_dir = so_context_file_path; - } + pi.so_context_file_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); // Append values to config to support weight-as-inputs conversion for shared contexts if (pi.so_share_ep_contexts) { From f25f72c36a26b604b4671fd6af23da7bcb19b377 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Fri, 17 Jan 2025 00:43:41 -0800 Subject: [PATCH 23/35] Add support for serialization and deserialization of metadata to disk --- .../core/providers/openvino/backend_utils.cc | 66 +++++++++++++++++++ .../openvino/backends/basic_backend.cc | 18 ++--- .../core/providers/openvino/contexts.h | 7 +- .../openvino/openvino_execution_provider.cc | 29 +++++--- .../qdq_transformations/qdq_stripping.cc | 50 -------------- 5 files changed, 100 insertions(+), 70 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 05084fe8f838d..e5a335bd0bfdd 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -53,6 +53,72 @@ SharedContext::SharedWeights::MappedWeights::~MappedWeights() { } } +std::ostream& operator<<(std::ostream& stream, const SharedContext::SharedWeights::Metadata::Map& metadata) { + try { + stream << metadata.size(); + + // Write each 
key-value pair + // Put elements in separate lines to facilitate reading + for (const auto& [key, value] : metadata) { + stream << std::endl + << key.name; + stream << std::endl + << value.location; + stream << std::endl + << value.data_offset; + stream << std::endl + << value.size; + stream << std::endl + << value.dimensions.size(); + for (const auto& dim : value.dimensions) { + stream << std::endl + << dim; + } + stream << std::endl + << value.element_type; + } + } catch (const Exception& e) { + ORT_THROW("Error: Failed to write map data.", e.what()); + } catch (...) { + ORT_THROW("Error: Failed to write map data."); + } + + ORT_ENFORCE(stream.good(), "Error: Failed to write map data."); + return stream; +} + +std::istream& operator>>(std::istream& stream, SharedContext::SharedWeights::Metadata::Map& metadata) { + size_t map_size{0}; + try { + stream >> map_size; + + while (!stream.eof()) { + SharedContext::SharedWeights::Metadata::Key key; + SharedContext::SharedWeights::Metadata::Value value; + stream >> key.name; + stream >> value.location; + stream >> value.data_offset; + stream >> value.size; + size_t num_dimensions; + stream >> num_dimensions; + value.dimensions.resize(num_dimensions); + for (auto& dim : value.dimensions) { + stream >> dim; + } + stream >> value.element_type; + metadata.emplace(key, value); + } + } catch (const Exception& e) { + ORT_THROW("Error: Failed to read map data.", e.what()); + } catch (...) 
{ + ORT_THROW("Error: Failed to read map data."); + } + + ORT_ENFORCE(metadata.size() == map_size, "Error: Inconsistent map data."); + + return stream; +} + namespace backend_utils { bool IsDebugEnabled() { diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 6f8ab00956fef..a730c0b59628b 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -65,8 +65,8 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr remote_context_ = new ov::intel_gpu::ocl::ClContext(OVCore::Get(), ctx); if (subgraph_context_.is_ep_ctx_graph) { exe_network_ = OVCore::ImportModel(*model_stream, - remote_context_, - subgraph_context_.subgraph_name); + remote_context_, + subgraph_context_.subgraph_name); model_stream.reset(); // Delete stream after it is no longer needed } else { std::shared_ptr ov_model; @@ -89,9 +89,9 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr // If the blob is held in an EPContext node, then skip FE+Compile // and directly move on to creating a backend with the executable blob exe_network_ = OVCore::ImportModel(*model_stream, - hw_target, - device_config, - subgraph_context_.subgraph_name); + hw_target, + device_config, + subgraph_context_.subgraph_name); model_stream.reset(); // Delete stream after it is no longer needed } else if (!subgraph_context_.has_external_weights && !subgraph_context_.has_dynamic_input_shape && @@ -103,9 +103,9 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr // Not enabled for models with external weights and when ep context is set. 
const std::string model = model_proto->SerializeAsString(); exe_network_ = OVCore::CompileModel(model, - hw_target, - device_config, - subgraph_context_.subgraph_name); + hw_target, + device_config, + subgraph_context_.subgraph_name); } else { // For all other types use ov::ov_core read_model() to generate OV IR // followed by ov::ov_core compile_model() std::shared_ptr ov_model; @@ -292,7 +292,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { } else { if (target_config.count(session_context_.device_type)) { auto supported_properties = OVCore::Get().get_property(session_context_.device_type, - ov::supported_properties); + ov::supported_properties); set_target_properties(session_context_.device_type, target_config.at(session_context_.device_type), supported_properties); } diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 4f3b22236ae0f..c861446e5cd08 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -38,6 +38,8 @@ struct SharedContext { std::shared_ptr tensor; }; using Map = std::unordered_map; + friend std::ostream& operator<<(std::ostream& right, const Metadata::Map& metadata); + friend std::istream& operator>>(std::istream& right, Metadata::Map& metadata); }; struct MappedWeights { @@ -103,9 +105,8 @@ struct SessionContext : ProviderInfo { std::vector deviceAvailableList = {true, true, true, true, true, true, true, true}; std::filesystem::path onnx_model_path_name; - int onnx_opset_version; - bool use_api_2; - const std::vector OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; + uint32_t onnx_opset_version{0}; + const std::vector OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; const std::string openvino_sdk_version = std::format("{}.{}", OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR); }; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc 
b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index bc06e2e8e9a70..05a54c2a328bb 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -181,6 +181,17 @@ common::Status OpenVINOExecutionProvider::Compile( graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain); } + // Temporary code to read metadata before it moves to the .bin + auto& metadata = shared_context_.shared_weights.metadata; + if (session_context_.so_share_ep_contexts && metadata.empty()) { + // Metadata is always read from model location, this could be a source or epctx model + fs::path metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; + std::ifstream file(metadata_filename, std::ios::binary); + if (file) { + file >> metadata; + } + } + struct OpenVINOEPFunctionState { AllocateFunc allocate_func = nullptr; DestroyFunc destroy_func = nullptr; @@ -194,8 +205,6 @@ common::Status OpenVINOExecutionProvider::Compile( NodeComputeInfo compute_info; - session_context_.use_api_2 = true; - // During backend creation, we check if user wants to use precompiled blob onnx model or the original model // For precompiled blob, directly load the model instead of compiling the model // For original model, check if the user wants to export a model with pre-compiled blob @@ -244,17 +253,21 @@ common::Status OpenVINOExecutionProvider::Compile( } if (session_context_.so_share_ep_contexts) { - auto metadata_name = session_context_.so_context_file_path.parent_path(); - if (metadata_name.empty()) { - metadata_name = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; + fs::path metadata_filename; + if (session_context_.so_context_file_path.empty()) { + metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; } else { - metadata_name /= metadata_name.stem().string() + "_metadata"; - 
metadata_name.replace_extension("bin"); + metadata_filename = session_context_.so_context_file_path.parent_path() / "metadata.bin"; } // Metadata is generated only for shared contexts // If saving metadata then save it to the provided path or use the original model path - dumpMetaDataMapToBinary(shared_context_.shared_weights.metadata, metadata_name.string()); + // Multiple calls to Compile() will update the metadata and for the last call + // the resulting file will contain the aggregated content + std::ofstream file(metadata_filename, std::ios::binary); + if (file) { + file << metadata; + } } return status; diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 019e121b4f575..902dab8c04ed0 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -667,56 +667,6 @@ static void AddInitializerAsInput(onnxruntime::Graph& dst_graph, } } -template -bool writeScalar(std::ofstream& outfile, const T& scalar) { - auto size = sizeof(T); - outfile.write(reinterpret_cast(&size), sizeof(size)); - if (!outfile.good()) return false; - - outfile.write(reinterpret_cast(&scalar), size); - return outfile.good(); -} - -template <> -bool writeScalar(std::ofstream& outfile, const std::string& text) { - auto size = text.size() * sizeof(std::string::value_type); - outfile.write(reinterpret_cast(&size), size); - if (!outfile.good()) return false; - - outfile.write(text.data(), size); - return outfile.good(); -} - -// Main function to dump the map to a binary file -bool dumpMetaDataMapToBinary(const sw::Metadata::Map& metadata, const std::string& filename) { - std::ofstream outfile(filename, std::ios::binary); - if (!outfile.is_open()) { - ORT_THROW("Error: Could not open file for writing metadata."); - return false; - } - - // Write the size of the map - size_t map_size 
= metadata.size(); - outfile.write(reinterpret_cast(&map_size), sizeof(map_size)); - if (!outfile.good()) { - ORT_THROW("Error: Failed to write map size."); - return false; - } - - // Write each key-value pair - for (const auto& [key, value] : metadata) { - bool result = true; - result &= writeScalar(outfile, key.name); - result &= writeScalar(outfile, value.location); - result &= writeScalar(outfile, value.data_offset); - result &= writeScalar(outfile, value.size); - - ORT_ENFORCE(result, "Error: Failed to write map data."); - } - - return true; -} - // Creates a new model without the DQ/Q operators in the src graph. Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, From 225b6787d3a90a25150526bbd49037f25620e010 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Tue, 21 Jan 2025 13:45:21 -0800 Subject: [PATCH 24/35] Load blobs from relative path stored in ep_cache_context --- onnxruntime/core/providers/openvino/backend_manager.cc | 2 +- onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 858d7fb3f0298..a6efe9bc41e07 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -245,7 +245,7 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie ORT_THROW("Unable to open file for epctx model dump."); } compiled_model.export_model(blob_file); - model_blob_str = blob_filename.string(); + model_blob_str = blob_filename.filename().string(); } ORT_RETURN_IF_ERROR(ep_ctx_handle_.AddOVEPCtxNodeToGraph(graph_body_viewer, diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc index 1c6b0a0467836..9c55614633b82 100644 --- 
a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc @@ -115,7 +115,9 @@ std::unique_ptr EPCtxHandler::GetModelBlobStream(const GraphViewer if (embed_mode) { result.reset((std::istream*)new std::istringstream(ep_cache_context)); } else { - result.reset((std::istream*)new std::ifstream(ep_cache_context, std::ios_base::binary | std::ios_base::in)); + const auto& blob_filepath = graph_viewer.ModelPath().parent_path() / ep_cache_context; + ORT_ENFORCE(std::filesystem::exists(blob_filepath), "Blob file not found: ", blob_filepath.string()); + result.reset((std::istream*)new std::ifstream(blob_filepath, std::ios_base::binary | std::ios_base::in)); } LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; return result; From 532401162add1ad7fadc2b71e398cd64ce6c5844 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Tue, 21 Jan 2025 13:46:29 -0800 Subject: [PATCH 25/35] Use remote L0 tensors for shared weights --- onnxruntime/core/providers/openvino/backend_utils.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index e5a335bd0bfdd..1aa4565671c53 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -418,10 +418,10 @@ void CreateOVTensors(const std::string& device_name, if (device_name == "NPU") { // Use remote tensors auto npu_context = OVCore::Get().get_default_context("NPU").as(); - auto&& remote_tensor = npu_context.create_host_tensor(ov_elementType, value.dimensions); + auto&& remote_tensor = npu_context.create_l0_host_tensor(ov_elementType, value.dimensions, ov::intel_npu::TensorType::INPUT); // Copy data to remote tensor - std::memcpy(remote_tensor.data(), (void*)tensor_data, value.size); + std::memcpy(remote_tensor.get(), (void*)tensor_data, value.size); 
value.tensor = std::make_shared(remote_tensor); } else { // Use vanilla tensors From 241cfae6bb9fb10ff58b49deac02df9f8fa61d3c Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Tue, 21 Jan 2025 11:23:25 +0000 Subject: [PATCH 26/35] fix linux ci issues --- .../providers/openvino/backend_manager.cc | 3 +- .../core/providers/openvino/backend_utils.cc | 55 +++++++++++++++++++ .../core/providers/openvino/contexts.h | 3 +- .../openvino/openvino_execution_provider.cc | 12 +++- 4 files changed, 68 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index a6efe9bc41e07..d913daf14d6fe 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -236,7 +236,8 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie if (blob_filename.empty()) { blob_filename = session_context_.onnx_model_path_name; } - const auto name{std::format("{}_{}", graph_body_viewer.ModelPath().stem().string(), subgraph_context_.subgraph_name)}; + // const auto name{std::format("{}_{}", graph_body_viewer.ModelPath().stem().string(), subgraph_context_.subgraph_name)}; + const auto name = graph_body_viewer.ModelPath().stem().string() + "_" + subgraph_context_.subgraph_name; blob_filename = blob_filename.parent_path() / name; blob_filename.replace_extension("blob"); std::ofstream blob_file(blob_filename, diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 1aa4565671c53..ffe238a48ba7f 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -6,6 +6,14 @@ #include #include +#include // For open +#include // For mmap, munmap +#include // For fstat +#include // For close +#include +#include +#include + #include "openvino/pass/convert_fp32_to_fp16.hpp" #include 
"openvino/pass/constant_folding.hpp" #include "openvino/runtime/intel_npu/level_zero/level_zero.hpp" @@ -13,13 +21,16 @@ #include "core/providers/openvino/backend_utils.h" #include "core/providers/openvino/ov_interface.h" +#ifdef _WIN32 #include "Windows.h" +#endif using Exception = ov::Exception; namespace onnxruntime { namespace openvino_ep { +#ifdef _WIN32 SharedContext::SharedWeights::MappedWeights::MappedWeights(std::filesystem::path filename) { file_ = CreateFile(filename.string().data(), GENERIC_READ, @@ -52,6 +63,50 @@ SharedContext::SharedWeights::MappedWeights::~MappedWeights() { file_ = nullptr; } } +#else +SharedContext::SharedWeights::MappedWeights::MappedWeights(std::filesystem::path filename) + : file_(nullptr), mapping_(nullptr) { + // Open the file + int fd = open(filename.c_str(), O_RDONLY); + if (fd == -1) { + ORT_THROW("Unable to open weight file at " + filename.string()); + } + + // Get file size + struct stat file_stat; + if (fstat(fd, &file_stat) == -1) { + close(fd); + ORT_THROW("Unable to get file size for " + filename.string()); + } + size_t file_size = file_stat.st_size; + + // Map the file into memory + void* raw_data = mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (raw_data == MAP_FAILED) { + close(fd); + ORT_THROW("Unable to map weight file at " + filename.string()); + } + + // Set class members + file_ = reinterpret_cast(fd); // Store file descriptor + mapping_ = raw_data; // Store mapping address + weight_data = std::string_view(static_cast(raw_data), file_size); + + // Close the file descriptor, as mmap does not need it open + close(fd); +} + +SharedContext::SharedWeights::MappedWeights::~MappedWeights() { + // Unmap memory if it was mapped + if (mapping_ != nullptr) { + munmap(mapping_, weight_data.size()); + mapping_ = nullptr; + } + + // Clear the file descriptor, though it was already closed after mmap + file_ = nullptr; +} +#endif std::ostream& operator<<(std::ostream& stream, const 
SharedContext::SharedWeights::Metadata::Map& metadata) { try { diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index c861446e5cd08..786be49954e65 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -107,7 +107,8 @@ struct SessionContext : ProviderInfo { std::filesystem::path onnx_model_path_name; uint32_t onnx_opset_version{0}; const std::vector OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; - const std::string openvino_sdk_version = std::format("{}.{}", OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR); + // const std::string openvino_sdk_version = std::format("{}.{}", OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR); + const std::string openvino_sdk_version = std::to_string(OPENVINO_VERSION_MAJOR) + "." + std::to_string(OPENVINO_VERSION_MINOR); }; // Holds context specific to subgraph. diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 05a54c2a328bb..b2be8176e315d 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -5,7 +5,7 @@ #include #include #include -#include +// #include #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/openvino_execution_provider.h" #include "core/providers/openvino/contexts.h" @@ -23,6 +23,7 @@ namespace onnxruntime { namespace openvino_ep { // Parking this code here for now before it's moved to the factory +#if defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO static std::vector parseDevices(const std::string& device_string, const std::vector& available_devices) { std::string comma_separated_devices = device_string; @@ -50,6 +51,7 @@ static std::vector parseDevices(const std::string& device_string, } return 
devices; } +#endif // Parking this code here for now before it's moved to the factory void AdjustProviderInfo(ProviderInfo& info) { @@ -93,10 +95,14 @@ void AdjustProviderInfo(ProviderInfo& info) { #endif } else if (ov_supported_device_types.find(info.device_type) != ov_supported_device_types.end()) { info.device_type = std::move(info.device_type); - } else if (info.device_type.find("HETERO") == 0 || info.device_type.find("MULTI") == 0 || info.device_type.find("AUTO") == 0) { + } +#if defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO + else if (info.device_type.find("HETERO") == 0 || info.device_type.find("MULTI") == 0 || info.device_type.find("AUTO") == 0) { std::ignore = parseDevices(info.device_type, available_devices); info.device_type = std::move(info.device_type); - } else { + } +#endif + else { ORT_THROW("Invalid device string: " + info.device_type); } LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" From 16ddb42926a5891a733e74d00ea0fb059f1a815f Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Wed, 22 Jan 2025 20:27:00 +0530 Subject: [PATCH 27/35] fix ci issues --- onnxruntime/core/providers/openvino/backend_manager.cc | 1 - onnxruntime/core/providers/openvino/contexts.h | 1 - .../core/providers/openvino/openvino_execution_provider.cc | 1 - 3 files changed, 3 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index d913daf14d6fe..4479399f22790 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -236,7 +236,6 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie if (blob_filename.empty()) { blob_filename = session_context_.onnx_model_path_name; } - // const auto name{std::format("{}_{}", graph_body_viewer.ModelPath().stem().string(), subgraph_context_.subgraph_name)}; const auto name = 
graph_body_viewer.ModelPath().stem().string() + "_" + subgraph_context_.subgraph_name; blob_filename = blob_filename.parent_path() / name; blob_filename.replace_extension("blob"); diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 786be49954e65..b76e6d7657a45 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -107,7 +107,6 @@ struct SessionContext : ProviderInfo { std::filesystem::path onnx_model_path_name; uint32_t onnx_opset_version{0}; const std::vector OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; - // const std::string openvino_sdk_version = std::format("{}.{}", OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR); const std::string openvino_sdk_version = std::to_string(OPENVINO_VERSION_MAJOR) + "." + std::to_string(OPENVINO_VERSION_MINOR); }; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index b2be8176e315d..ae9b347b26c16 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -5,7 +5,6 @@ #include #include #include -// #include #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/openvino_execution_provider.h" #include "core/providers/openvino/contexts.h" From 10e851b31ce2888e6658bb0d2a591afad80dbfe5 Mon Sep 17 00:00:00 2001 From: "Javier E. 
Martinez" Date: Wed, 22 Jan 2025 21:32:55 -0800 Subject: [PATCH 28/35] Fix Windows build failure --- onnxruntime/core/providers/openvino/backend_utils.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index ffe238a48ba7f..ecfd2d03dfa6d 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -6,11 +6,6 @@ #include #include -#include // For open -#include // For mmap, munmap -#include // For fstat -#include // For close -#include #include #include @@ -23,6 +18,11 @@ #ifdef _WIN32 #include "Windows.h" +#else +#include // For open +#include // For mmap, munmap +#include // For fstat +#include // For close #endif using Exception = ov::Exception; From 6f7782ca36069fa460200a06d5c3bbb7dafb2d5f Mon Sep 17 00:00:00 2001 From: Eric Crawford Date: Wed, 22 Jan 2025 17:01:13 -0800 Subject: [PATCH 29/35] Use ifstream to load weights instead of mmaped file --- .../providers/openvino/backend_manager.cc | 4 +- .../core/providers/openvino/backend_utils.cc | 102 +++--------------- .../core/providers/openvino/backend_utils.h | 2 +- .../core/providers/openvino/contexts.h | 17 ++- 4 files changed, 25 insertions(+), 100 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 4479399f22790..dc4a1cf2b4ce9 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -102,9 +102,9 @@ BackendManager::BackendManager(SessionContext& session_context, if (weight_file) { if (!sw.mapped_weights) { - sw.mapped_weights = std::make_unique(weight_filename); + sw.mapped_weights = std::make_unique(weight_filename); } - backend_utils::CreateOVTensors(session_context_.device_type, sw.metadata, sw.mapped_weights->weight_data); + 
backend_utils::CreateOVTensors(session_context_.device_type, sw.metadata, *sw.mapped_weights); } } diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index ecfd2d03dfa6d..90e5fd92517f8 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -1,6 +1,5 @@ // Copyright (C) Intel Corporation // Licensed under the MIT License - #include #include #include @@ -16,97 +15,26 @@ #include "core/providers/openvino/backend_utils.h" #include "core/providers/openvino/ov_interface.h" -#ifdef _WIN32 -#include "Windows.h" -#else -#include // For open -#include // For mmap, munmap -#include // For fstat -#include // For close -#endif using Exception = ov::Exception; namespace onnxruntime { namespace openvino_ep { -#ifdef _WIN32 -SharedContext::SharedWeights::MappedWeights::MappedWeights(std::filesystem::path filename) { - file_ = CreateFile(filename.string().data(), - GENERIC_READ, - FILE_SHARE_READ, - 0, - OPEN_EXISTING, - FILE_ATTRIBUTE_NORMAL, - 0); - ORT_ENFORCE(file_ != nullptr, "Unable to open weight file at ", filename.string()); - - mapping_ = CreateFileMapping(file_, 0, PAGE_READONLY, 0, 0, 0); - ORT_ENFORCE(mapping_ != nullptr, "Unable to create mapping of weight file at ", filename.string()); - - const char* raw_data = static_cast(MapViewOfFile(mapping_, FILE_MAP_READ, 0, 0, 0)); - ORT_ENFORCE(raw_data != nullptr, "Unable to map weight file at ", filename.string()); - - weight_data = std::string_view(raw_data, std::filesystem::file_size(filename)); -} - -SharedContext::SharedWeights::MappedWeights::~MappedWeights() { - if (!weight_data.empty()) { - UnmapViewOfFile(weight_data.data()); - } - if (mapping_ != nullptr) { - CloseHandle(mapping_); - mapping_ = nullptr; - } - if (file_ != nullptr) { - CloseHandle(file_); - file_ = nullptr; +SharedContext::SharedWeights::WeightsFile::WeightsFile(std::filesystem::path filename) : 
file_(filename, std::ios::in | std::ios::binary) { + try { + file_.exceptions(std::ifstream::failbit | std::ifstream::badbit); + weights_size_ = file_.seekg(0, std::ios::end).tellg(); + } catch (std::ifstream::failure& e) { + ORT_THROW("Error: Failed to open weight file at ", filename.string(), " ", e.what()); } } -#else -SharedContext::SharedWeights::MappedWeights::MappedWeights(std::filesystem::path filename) - : file_(nullptr), mapping_(nullptr) { - // Open the file - int fd = open(filename.c_str(), O_RDONLY); - if (fd == -1) { - ORT_THROW("Unable to open weight file at " + filename.string()); - } - - // Get file size - struct stat file_stat; - if (fstat(fd, &file_stat) == -1) { - close(fd); - ORT_THROW("Unable to get file size for " + filename.string()); - } - size_t file_size = file_stat.st_size; - - // Map the file into memory - void* raw_data = mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd, 0); - if (raw_data == MAP_FAILED) { - close(fd); - ORT_THROW("Unable to map weight file at " + filename.string()); - } - - // Set class members - file_ = reinterpret_cast(fd); // Store file descriptor - mapping_ = raw_data; // Store mapping address - weight_data = std::string_view(static_cast(raw_data), file_size); - - // Close the file descriptor, as mmap does not need it open - close(fd); -} - -SharedContext::SharedWeights::MappedWeights::~MappedWeights() { - // Unmap memory if it was mapped - if (mapping_ != nullptr) { - munmap(mapping_, weight_data.size()); - mapping_ = nullptr; - } - // Clear the file descriptor, though it was already closed after mmap - file_ = nullptr; +void SharedContext::SharedWeights::WeightsFile::load_weights(size_t file_offset, void* data, size_t size) { + ORT_ENFORCE(file_offset < weights_size_ && size <= weights_size_ && (file_offset <= weights_size_ - size), "Error: File offset is out of bounds."); + file_.seekg(file_offset); + file_.read(reinterpret_cast(data), size); } -#endif std::ostream& operator<<(std::ostream& stream, const 
SharedContext::SharedWeights::Metadata::Map& metadata) { try { @@ -457,13 +385,10 @@ ov::element::Type GetOpenVINOElementType(ONNX_NAMESPACE::TensorProto_DataType dt // Function to handle tensor creation from external data void CreateOVTensors(const std::string& device_name, SharedContext::SharedWeights::Metadata::Map& metadata_map, - std::string_view weights) { + SharedContext::SharedWeights::WeightsFile &weights) { for (auto& [key, value] : metadata_map) { if (value.tensor) continue; - // Get tensor data - const auto* tensor_data = weights.data() + value.data_offset; - // Get element data type auto onnx_element_type = (ONNX_NAMESPACE::TensorProto_DataType)value.element_type; @@ -476,11 +401,12 @@ void CreateOVTensors(const std::string& device_name, auto&& remote_tensor = npu_context.create_l0_host_tensor(ov_elementType, value.dimensions, ov::intel_npu::TensorType::INPUT); // Copy data to remote tensor - std::memcpy(remote_tensor.get(), (void*)tensor_data, value.size); + weights.load_weights(value.data_offset, remote_tensor.get(), value.size); value.tensor = std::make_shared(remote_tensor); } else { // Use vanilla tensors - value.tensor = std::make_shared(ov_elementType, value.dimensions, (void*)tensor_data); + value.tensor = std::make_shared(ov_elementType, value.dimensions); + weights.load_weights(value.data_offset, value.tensor->data(), value.size); } ORT_ENFORCE(value.tensor->get_byte_size() == value.size, "Unexpected tensor size mismatch"); } diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index e27a6e277a1a3..d406daa4e24e4 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -69,7 +69,7 @@ CreateOVModel(const std::string model, void CreateOVTensors(const std::string& device_name, SharedContext::SharedWeights::Metadata::Map& metadata_map, - std::string_view weights); + SharedContext::SharedWeights::WeightsFile& 
weights); void printPerformanceCounts(const std::vector& performanceMap, std::ostream& stream, std::string deviceName); diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index b76e6d7657a45..a0462e5be35f3 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -42,21 +42,20 @@ struct SharedContext { friend std::istream& operator>>(std::istream& right, Metadata::Map& metadata); }; - struct MappedWeights { - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(MappedWeights); - ~MappedWeights(); - MappedWeights() = delete; - explicit MappedWeights(std::filesystem::path filename); + struct WeightsFile { + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WeightsFile); + WeightsFile() = delete; + explicit WeightsFile(std::filesystem::path filename); - std::string_view weight_data; + void load_weights(size_t file_offset, void* data, size_t size); private: - void* file_; - void* mapping_; + std::ifstream file_; + size_t weights_size_; }; fs::path external_weight_filename; - std::unique_ptr mapped_weights; + std::unique_ptr mapped_weights; Metadata::Map metadata; } shared_weights; }; From f3e4e078aadedffc33ce209dc9768d4863e4fe9e Mon Sep 17 00:00:00 2001 From: "Javier E. 
Martinez" Date: Thu, 23 Jan 2025 15:48:24 -0800 Subject: [PATCH 30/35] Fix for epctx models made up entirely of OVEP epctx nodes --- .../core/providers/openvino/ov_versions/capability.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index b9f01cc261f52..cb538c84441fa 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -95,6 +95,12 @@ std::vector> GetCapability::Execute() { } } + // If all the nodes have been accounted for then no more processing is needed + if (result.size() == nodes.size()) { + is_wholly_supported_graph_ = true; + return result; + } + // This is a list of initializers that nGraph considers as constants. Example weights, reshape shape etc. std::unordered_set ng_required_initializers; From 1d4c16ef2d39cbab9a7a2e0a907d5fe063d379cb Mon Sep 17 00:00:00 2001 From: "Javier E. 
Martinez" Date: Fri, 24 Jan 2025 12:01:23 -0800 Subject: [PATCH 31/35] Limit ov::Core lifetime to that of provider object --- .../openvino/openvino_provider_factory.cc | 20 +++++++------ .../core/providers/openvino/ov_interface.cc | 29 +++++++++++++------ .../core/providers/openvino/ov_interface.h | 5 +++- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 86804b8961cac..a80c250a75bf7 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -19,25 +19,23 @@ struct OpenVINOProviderFactory : IExecutionProviderFactory { ~OpenVINOProviderFactory() override {} - std::unique_ptr CreateProvider() override; + std::unique_ptr CreateProvider() override { + return std::make_unique(provider_info_, shared_context_); + } private: ProviderInfo provider_info_; SharedContext& shared_context_; }; -std::unique_ptr OpenVINOProviderFactory::CreateProvider() { - return std::make_unique(provider_info_, shared_context_); -} - struct ProviderInfo_OpenVINO_Impl : ProviderInfo_OpenVINO { std::vector GetAvailableDevices() const override { return OVCore::GetAvailableDevices(); } -} g_info; +}; struct OpenVINO_Provider : Provider { - void* GetInfo() override { return &g_info; } + void* GetInfo() override { return &info_; } std::shared_ptr CreateExecutionProviderFactory(const void* void_params) override { // Extract the void_params into ProviderOptions and ConfigOptions @@ -287,14 +285,17 @@ struct OpenVINO_Provider : Provider { } void Initialize() override { + OVCore::Initialize(); } void Shutdown() override { + OVCore::Teardown(); } private: SharedContext shared_context_; -} g_provider; + ProviderInfo_OpenVINO_Impl info_; +}; // OpenVINO_Provider } // namespace openvino_ep } // namespace onnxruntime @@ -302,6 +303,7 @@ struct OpenVINO_Provider : 
Provider { extern "C" { ORT_API(onnxruntime::Provider*, GetProvider) { - return &onnxruntime::openvino_ep::g_provider; + static onnxruntime::openvino_ep::OpenVINO_Provider g_provider; + return &g_provider; } } diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 6ce2d506211e7..e12a560809519 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -14,7 +14,17 @@ namespace onnxruntime { namespace openvino_ep { static const std::string log_tag = "[OpenVINO-EP] "; -static ov::Core g_core; +static std::unique_ptr g_core; + +void OVCore::Initialize() +{ + g_core = std::make_unique(); +} + +void OVCore::Teardown() +{ + g_core.reset(); +} #ifndef NDEBUG void printDebugInfo(const ov::CompiledModel& obj) { @@ -78,7 +88,7 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_netwo const std::string& name) { ov::CompiledModel obj; try { - obj = g_core.compile_model(ie_cnn_network, hw_target, device_config); + obj = Get().compile_model(ie_cnn_network, hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -97,7 +107,7 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, const std::string& name) { ov::CompiledModel obj; try { - obj = g_core.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); + obj = Get().compile_model(onnx_model, ov::Tensor(), hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -116,7 +126,7 @@ OVExeNetwork OVCore::ImportModel(std::istream& model_stream, std::string name) { try { ov::CompiledModel obj; - obj = g_core.import_model(model_stream, hw_target, device_config); + obj = Get().import_model(model_stream, hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -130,11 +140,12 @@ OVExeNetwork OVCore::ImportModel(std::istream& model_stream, } void OVCore::SetCache(const std::string& cache_dir_path) { - 
g_core.set_property(ov::cache_dir(cache_dir_path)); + Get().set_property(ov::cache_dir(cache_dir_path)); } -ov::Core& OVCore::Get() { - return g_core; +inline ov::Core& OVCore::Get() { + ORT_ENFORCE(g_core); + return *g_core; } #ifdef IO_BUFFER_ENABLED @@ -170,12 +181,12 @@ OVExeNetwork OVCore::ImportModel(std::shared_ptr model_strea #endif std::vector OVCore::GetAvailableDevices() { - auto available_devices = g_core.get_available_devices(); + auto available_devices = Get().get_available_devices(); return available_devices; } void OVCore::SetStreams(const std::string& device_type, int num_streams) { - g_core.set_property(device_type, {ov::num_streams(num_streams)}); + Get().set_property(device_type, {ov::num_streams(num_streams)}); } OVInferRequest OVExeNetwork::CreateInferRequest() { diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index a2547ada60f34..53b814094438e 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -38,6 +38,9 @@ typedef ov::RemoteContext OVRemoteContext; #endif struct OVCore { + static void Initialize(); + static void Teardown(); + // OV Interface For Reading Model static std::shared_ptr ReadModel(const std::string& model_stream, const std::string& model_path); @@ -66,7 +69,7 @@ struct OVCore { #endif static std::vector GetAvailableDevices(); static void SetCache(const std::string& cache_dir_path); - static ov::Core& Get(); + inline static ov::Core& Get(); static void SetStreams(const std::string& device_type, int num_streams); }; From 6ee1e162bb103b19ca09dd88b54514812dbb90e5 Mon Sep 17 00:00:00 2001 From: "Javier E. 
Martinez" Date: Fri, 24 Jan 2025 13:24:27 -0800 Subject: [PATCH 32/35] Enforce shared tensors cleanup on shutdown --- .../core/providers/openvino/backend_manager.cc | 3 +++ .../core/providers/openvino/backend_utils.cc | 9 +++++++++ onnxruntime/core/providers/openvino/backend_utils.h | 1 + .../providers/openvino/backends/backend_factory.cc | 6 ++++++ onnxruntime/core/providers/openvino/ibackend.h | 1 + .../openvino/openvino_execution_provider.cc | 13 ++++++++++--- .../openvino/openvino_execution_provider.h | 2 +- .../providers/openvino/openvino_provider_factory.cc | 2 ++ 8 files changed, 33 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index dc4a1cf2b4ce9..3ae4677a86375 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -544,6 +544,9 @@ void BackendManager::Compute(OrtKernelContext* context) { } void BackendManager::ShutdownBackendManager() { + backend_map_.clear(); + BackendFactory::DestroyBackend(concrete_backend_.get()); + concrete_backend_.reset(); } } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 90e5fd92517f8..dfc094267f905 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -412,6 +412,15 @@ void CreateOVTensors(const std::string& device_name, } } +void DestroyOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map) { + for (auto& [key, value] : metadata_map) { + if (value.tensor) { + value.tensor.reset(); + } + } + metadata_map.clear(); +} + } // namespace backend_utils } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index d406daa4e24e4..06fdfe9cd5eca 100644 
--- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -70,6 +70,7 @@ CreateOVModel(const std::string model, void CreateOVTensors(const std::string& device_name, SharedContext::SharedWeights::Metadata::Map& metadata_map, SharedContext::SharedWeights::WeightsFile& weights); +void DestroyOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map); void printPerformanceCounts(const std::vector& performanceMap, std::ostream& stream, std::string deviceName); diff --git a/onnxruntime/core/providers/openvino/backends/backend_factory.cc b/onnxruntime/core/providers/openvino/backends/backend_factory.cc index 99955da539ae7..78c38ba882512 100644 --- a/onnxruntime/core/providers/openvino/backends/backend_factory.cc +++ b/onnxruntime/core/providers/openvino/backends/backend_factory.cc @@ -33,5 +33,11 @@ BackendFactory::MakeBackend(std::unique_ptr& model_p ORT_THROW("[OpenVINO-EP] Backend factory error: Unknown backend type: " + type); } } + +void BackendFactory::DestroyBackend(IBackend* backend) { + BasicBackend* backend_ptr = (BasicBackend*)backend; + delete backend_ptr; + backend_ptr = nullptr; +} } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index d2f91cacb6c4d..2e01dc00faa6a 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -26,6 +26,7 @@ class BackendFactory { const SubGraphContext& subgraph_context, SharedContext& shared_context, ptr_stream_t& model_stream); + static void DestroyBackend(IBackend* backend); }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index ae9b347b26c16..68ee37097cc84 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ 
b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -95,11 +95,11 @@ void AdjustProviderInfo(ProviderInfo& info) { } else if (ov_supported_device_types.find(info.device_type) != ov_supported_device_types.end()) { info.device_type = std::move(info.device_type); } -#if defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO - else if (info.device_type.find("HETERO") == 0 || info.device_type.find("MULTI") == 0 || info.device_type.find("AUTO") == 0) { +#if defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO + else if (info.device_type.find("HETERO") == 0 || info.device_type.find("MULTI") == 0 || info.device_type.find("AUTO") == 0) { std::ignore = parseDevices(info.device_type, available_devices); info.device_type = std::move(info.device_type); - } + } #endif else { ORT_THROW("Invalid device string: " + info.device_type); @@ -153,6 +153,13 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const ProviderInfo& info, S } } +OpenVINOExecutionProvider::~OpenVINOExecutionProvider() { + for (auto& backend_manager : backend_managers_) { + backend_manager.ShutdownBackendManager(); + } + backend_managers_.clear(); +} + std::vector> OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/) const { diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 294f4d6db54a4..75f4ef9f8ecc8 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -46,7 +46,7 @@ static std::vector split(const std::string& s, char delim) { class OpenVINOExecutionProvider : public IExecutionProvider { public: explicit OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext& shared_context); - ~OpenVINOExecutionProvider() = default; + 
~OpenVINOExecutionProvider(); std::vector> GetCapability(const GraphViewer& graph_viewer, diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index a80c250a75bf7..41f62377a6a3d 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -8,6 +8,7 @@ #include "core/providers/openvino/openvino_execution_provider.h" #include "core/providers/openvino/openvino_provider_factory_creator.h" #include "core/providers/openvino/contexts.h" +#include "core/providers/openvino/backend_utils.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "nlohmann/json.hpp" @@ -289,6 +290,7 @@ struct OpenVINO_Provider : Provider { } void Shutdown() override { + backend_utils::DestroyOVTensors(shared_context_.shared_weights.metadata); OVCore::Teardown(); } From 6c108f2c978b1151aebf43c8abd46c117d6ba1c1 Mon Sep 17 00:00:00 2001 From: "Javier E. 
Martinez" Date: Fri, 24 Jan 2025 21:27:49 -0800 Subject: [PATCH 33/35] Add support for default device type based on project configuration --- .../openvino/openvino_provider_factory.cc | 288 +++++++++++------- 1 file changed, 171 insertions(+), 117 deletions(-) diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 41f62377a6a3d..aec02bbc8be7b 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -14,6 +14,150 @@ namespace onnxruntime { namespace openvino_ep { +void ParseConfigOptions(ProviderInfo& pi, const ConfigOptions& config_options) { + pi.so_disable_cpu_ep_fallback = config_options.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; + pi.so_context_enable = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; + pi.so_context_embed_mode = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; + pi.so_share_ep_contexts = config_options.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; + pi.so_context_file_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); +} + +void* ParseUint64(const ProviderOptions& provider_options, std::string option_name) { + if (provider_options.contains("context")) { + uint64_t number = std::strtoull(provider_options.at("context").data(), nullptr, 16); + return reinterpret_cast(number); + } else { + return nullptr; + } +} + +bool ParseBooleanOption(const ProviderOptions& provider_options, std::string option_name) { + if (provider_options.contains(option_name)) { + const auto& value = provider_options.at(option_name); + if (value == "true" || value == "True") { + return true; + } else if (value == "false" || value == "False") { + return false; + } else { + ORT_THROW("[ERROR] [OpenVINO-EP] ", option_name, " should be a 
boolean.\n"); + } + } + return false; +} + +std::string ParseDeviceType(const ProviderOptions& provider_options, std::string option_name) { + const std::vector ov_available_devices = OVCore::GetAvailableDevices(); + + std::set ov_supported_device_types = {"CPU", "GPU", + "GPU.0", "GPU.1", "NPU"}; + std::set deprecated_device_types = {"CPU_FP32", "GPU_FP32", + "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", + "GPU.0_FP16", "GPU.1_FP16"}; + + // Expand set of supported device with OV devices + ov_supported_device_types.insert(ov_available_devices.begin(), ov_available_devices.end()); + + if (provider_options.contains(option_name)) { + const auto& selected_device = provider_options.at("device_type"); + + if (deprecated_device_types.contains(selected_device)) { + // Deprecated device and precision is handled together at ParsePrecision + return selected_device; + } + + if (!((ov_supported_device_types.contains(selected_device)) || + (selected_device.find("HETERO:") == 0) || + (selected_device.find("MULTI:") == 0) || + (selected_device.find("AUTO:") == 0))) { + ORT_THROW( + "[ERROR] [OpenVINO] You have selected wrong configuration value for the key 'device_type'. " + "Select from 'CPU', 'GPU', 'NPU', 'GPU.x' where x = 0,1,2 and so on or from" + " HETERO/MULTI/AUTO options available. 
\n"); + } + return selected_device; + } else { + std::string default_device; + + // Take default behavior from project configuration +#if defined OPENVINO_CONFIG_CPU + default_device = "CPU"; +#elif defined OPENVINO_CONFIG_GPU + default_device = "GPU"; +#elif defined OPENVINO_CONFIG_NPU + default_device = "NPU"; +#elif defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO + default_device = DEVICE_NAME; + + // Validate that devices passed are valid + int delimit = device_type.find(":"); + const auto& devices = device_type.substr(delimit + 1); + auto device_list = split(devices, ','); + for (const auto& device : devices) { + if (!ov_supported_device_types.contains(device)) { + ORT_THROW("[ERROR] [OpenVINO] Invalid device selected: ", device); + } + } +#endif + + LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Choosing Device: " << default_device; + return default_device; + } +} + +// Depends on ProviderOptions. +std::string ParsePrecision(const ProviderOptions& provider_options, std::string& device_type, const std::string& option_name) { + using DeviceName = std::string; + using DefaultValue = std::string; + using ValidValues = std::list; + using foo = std::pair; + using ParserHelper = std::map; + ParserHelper helper = { + {"GPU", {"FP16", {"FP16", "FP32"}}}, + {"NPU", {"FP16", {"FP16"}}}, + {"CPU", {"FP32", {"FP32"}}}, + }; + + std::set deprecated_device_types = {"CPU_FP32", "GPU_FP32", + "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", + "GPU.0_FP16", "GPU.1_FP16"}; + + if (provider_options.contains(option_name)) { + // Start by checking if the device_type is a normal valid one + if (helper.contains(device_type)) { + auto const& valid_values = helper[device_type].second; + const auto& precision = provider_options.at(option_name); + if (precision == "ACCURACY") { + return valid_values.back(); // Return highest supported precision + } else { + if (std::find(valid_values.begin(), valid_values.end(), precision) != valid_values.end()) { + return 
precision; // Return precision selected if valid + } else { + auto value_iter = valid_values.begin(); + std::string valid_values_joined = *value_iter; + // Append 2nd and up, if only one then ++value_iter is same as end() + for (++value_iter; value_iter != valid_values.end(); ++value_iter) { + valid_values_joined += ", " + *value_iter; + } + + ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. ", device_type, " only supports ", valid_values_joined, ".\n"); + } + } + } else if (deprecated_device_types.contains(device_type)) { + LOGS_DEFAULT(WARNING) << "[OpenVINO] Selected 'device_type' " + device_type + " is deprecated. \n" + << "Update the 'device_type' to specified types 'CPU', 'GPU', 'GPU.0', " + << "'GPU.1', 'NPU' or from" + << " HETERO/MULTI/AUTO options and set 'precision' separately. \n"; + const std::string precision = device_type.substr(device_type.find("_") + 1); + device_type = device_type.substr(0, device_type.find("_")); + return precision; + } + } + // Return default + return helper[device_type].first; +} + +void ParseProviderOptions(ProviderInfo& result, const ProviderOptions& config_options) {} + struct OpenVINOProviderFactory : IExecutionProviderFactory { OpenVINOProviderFactory(ProviderInfo provider_info, SharedContext& shared_context) : provider_info_(provider_info), shared_context_(shared_context) {} @@ -42,49 +186,17 @@ struct OpenVINO_Provider : Provider { // Extract the void_params into ProviderOptions and ConfigOptions using ConfigBuffer = std::pair; const ConfigBuffer* buffer = reinterpret_cast(void_params); - const auto& provider_options_map = *buffer->first; + const auto& provider_options = *buffer->first; const auto& config_options = buffer->second; ProviderInfo pi; std::string bool_flag = ""; - if (provider_options_map.find("device_type") != provider_options_map.end()) { - pi.device_type = provider_options_map.at("device_type").c_str(); - - std::set ov_supported_device_types = {"CPU", "GPU", - "GPU.0", "GPU.1", "NPU"}; - std::set 
deprecated_device_types = {"CPU_FP32", "GPU_FP32", - "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", - "GPU.0_FP16", "GPU.1_FP16"}; - std::vector available_devices = OVCore::GetAvailableDevices(); - - for (auto& device : available_devices) { - if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) { - ov_supported_device_types.emplace(device); - } - } - if (deprecated_device_types.find(pi.device_type) != deprecated_device_types.end()) { - std::string deprecated_device = pi.device_type; - int delimit = pi.device_type.find("_"); - pi.device_type = deprecated_device.substr(0, delimit); - pi.precision = deprecated_device.substr(delimit + 1); - LOGS_DEFAULT(WARNING) << "[OpenVINO] Selected 'device_type' " + deprecated_device + " is deprecated. \n" - << "Update the 'device_type' to specified types 'CPU', 'GPU', 'GPU.0', " - << "'GPU.1', 'NPU' or from" - << " HETERO/MULTI/AUTO options and set 'precision' separately. \n"; - } - if (!((ov_supported_device_types.find(pi.device_type) != ov_supported_device_types.end()) || - (pi.device_type.find("HETERO:") == 0) || - (pi.device_type.find("MULTI:") == 0) || - (pi.device_type.find("AUTO:") == 0))) { - ORT_THROW( - "[ERROR] [OpenVINO] You have selected wrong configuration value for the key 'device_type'. " - "Select from 'CPU', 'GPU', 'NPU', 'GPU.x' where x = 0,1,2 and so on or from" - " HETERO/MULTI/AUTO options available. \n"); - } - } - if (provider_options_map.find("device_id") != provider_options_map.end()) { - std::string dev_id = provider_options_map.at("device_id").c_str(); + + pi.device_type = ParseDeviceType(provider_options, "device_type"); + + if (provider_options.contains("device_id")) { + std::string dev_id = provider_options.at("device_id").data(); LOGS_DEFAULT(WARNING) << "[OpenVINO] The options 'device_id' is deprecated. 
" << "Upgrade to set deice_type and precision session options.\n"; if (dev_id == "CPU" || dev_id == "GPU" || dev_id == "NPU") { @@ -93,34 +205,13 @@ struct OpenVINO_Provider : Provider { ORT_THROW("[ERROR] [OpenVINO] Unsupported device_id is selected. Select from available options."); } } - if (provider_options_map.find("precision") != provider_options_map.end()) { - pi.precision = provider_options_map.at("precision").c_str(); - } - if (pi.device_type.find("GPU") != std::string::npos) { - if (pi.precision == "") { - pi.precision = "FP16"; - } else if (pi.precision != "ACCURACY" && pi.precision != "FP16" && pi.precision != "FP32") { - ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. GPU only supports FP32 / FP16. \n"); - } - } else if (pi.device_type.find("NPU") != std::string::npos) { - if (pi.precision == "" || pi.precision == "ACCURACY" || pi.precision == "FP16") { - pi.precision = "FP16"; - } else { - ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. NPU only supported FP16. \n"); - } - } else if (pi.device_type.find("CPU") != std::string::npos) { - if (pi.precision == "" || pi.precision == "ACCURACY" || pi.precision == "FP32") { - pi.precision = "FP32"; - } else { - ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. CPU only supports FP32 . 
\n"); - } + if (provider_options.contains("cache_dir")) { + pi.cache_dir = provider_options.at("cache_dir"); } - if (provider_options_map.find("cache_dir") != provider_options_map.end()) { - pi.cache_dir = provider_options_map.at("cache_dir"); - } + pi.precision = ParsePrecision(provider_options, pi.device_type, "precision"); - if (provider_options_map.find("load_config") != provider_options_map.end()) { + if (provider_options.contains("load_config")) { auto parse_config = [&](const std::string& config_str) -> std::map { // If the config string is empty, return an empty map and skip processing if (config_str.empty()) { @@ -179,29 +270,25 @@ struct OpenVINO_Provider : Provider { return target_map; }; - pi.load_config = parse_config(provider_options_map.at("load_config")); + pi.load_config = parse_config(provider_options.at("load_config")); } - if (provider_options_map.find("context") != provider_options_map.end()) { - std::string str = provider_options_map.at("context"); - uint64_t number = std::strtoull(str.c_str(), nullptr, 16); - pi.context = reinterpret_cast(number); - } + pi.context = ParseUint64(provider_options, "context"); #if defined(IO_BUFFER_ENABLED) // a valid context must be provided to enable IO Buffer optimizations - if (context == nullptr) { + if (pi.context == nullptr) { #undef IO_BUFFER_ENABLED #define IO_BUFFER_ENABLED = 0 LOGS_DEFAULT(WARNING) << "Context is not set. Disabling IO Buffer optimization"; } #endif - if (provider_options_map.find("num_of_threads") != provider_options_map.end()) { - if (!std::all_of(provider_options_map.at("num_of_threads").begin(), - provider_options_map.at("num_of_threads").end(), ::isdigit)) { + if (provider_options.contains("num_of_threads")) { + if (!std::all_of(provider_options.at("num_of_threads").begin(), + provider_options.at("num_of_threads").end(), ::isdigit)) { ORT_THROW("[ERROR] [OpenVINO-EP] Number of threads should be a number. 
\n"); } - pi.num_of_threads = std::stoi(provider_options_map.at("num_of_threads")); + pi.num_of_threads = std::stoi(provider_options.at("num_of_threads")); if (pi.num_of_threads <= 0) { pi.num_of_threads = 1; LOGS_DEFAULT(WARNING) << "[OpenVINO-EP] The value for the key 'num_threads' should be in the positive range.\n " @@ -209,8 +296,8 @@ struct OpenVINO_Provider : Provider { } } - if (provider_options_map.find("model_priority") != provider_options_map.end()) { - pi.model_priority = provider_options_map.at("model_priority").c_str(); + if (provider_options.contains("model_priority")) { + pi.model_priority = provider_options.at("model_priority").data(); std::vector supported_priorities({"LOW", "MEDIUM", "HIGH", "DEFAULT"}); if (std::find(supported_priorities.begin(), supported_priorities.end(), pi.model_priority) == supported_priorities.end()) { @@ -221,59 +308,26 @@ struct OpenVINO_Provider : Provider { } } - if (provider_options_map.find("num_streams") != provider_options_map.end()) { - pi.num_streams = std::stoi(provider_options_map.at("num_streams")); + if (provider_options.contains("num_streams")) { + pi.num_streams = std::stoi(provider_options.at("num_streams")); if (pi.num_streams <= 0) { pi.num_streams = 1; LOGS_DEFAULT(WARNING) << "[OpenVINO-EP] The value for the key 'num_streams' should be in the range of 1-8.\n " << "Executing with num_streams=1"; } } - if (provider_options_map.find("enable_opencl_throttling") != provider_options_map.end()) { - bool_flag = provider_options_map.at("enable_opencl_throttling"); - if (bool_flag == "true" || bool_flag == "True") - pi.enable_opencl_throttling = true; - else if (bool_flag == "false" || bool_flag == "False") - pi.enable_opencl_throttling = false; - bool_flag = ""; - } + pi.enable_opencl_throttling = ParseBooleanOption(provider_options, "enable_opencl_throttling"); - if (provider_options_map.find("enable_qdq_optimizer") != provider_options_map.end()) { - bool_flag = provider_options_map.at("enable_qdq_optimizer"); 
- if (bool_flag == "true" || bool_flag == "True") - pi.enable_qdq_optimizer = true; - else if (bool_flag == "false" || bool_flag == "False") - pi.enable_qdq_optimizer = false; - else - ORT_THROW("[ERROR] [OpenVINO-EP] enable_qdq_optimiser should be a boolean.\n"); - bool_flag = ""; - } + pi.enable_qdq_optimizer = ParseBooleanOption(provider_options, "enable_qdq_optimizer"); + + pi.disable_dynamic_shapes = ParseBooleanOption(provider_options, "disable_dynamic_shapes"); + + ParseConfigOptions(pi, config_options); // Always true for NPU plugin or when passed . if (pi.device_type.find("NPU") != std::string::npos) { pi.disable_dynamic_shapes = true; } - if (provider_options_map.find("disable_dynamic_shapes") != provider_options_map.end()) { - bool_flag = provider_options_map.at("disable_dynamic_shapes"); - if (bool_flag == "true" || bool_flag == "True") { - pi.disable_dynamic_shapes = true; - } else if (bool_flag == "false" || bool_flag == "False") { - if (pi.device_type.find("NPU") != std::string::npos) { - pi.disable_dynamic_shapes = true; - LOGS_DEFAULT(INFO) << "[OpenVINO-EP] The value for the key 'disable_dynamic_shapes' will be set to " - << "TRUE for NPU backend.\n "; - } else { - pi.disable_dynamic_shapes = false; - } - } - bool_flag = ""; - } - - pi.so_disable_cpu_ep_fallback = config_options.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; - pi.so_context_enable = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; - pi.so_context_embed_mode = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; - pi.so_share_ep_contexts = config_options.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; - pi.so_context_file_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); // Append values to config to support weight-as-inputs conversion for shared contexts if (pi.so_share_ep_contexts) { From 6d1f1cf9bcdee2c3ddefc78755124cb6016dce44 Mon Sep 17 
00:00:00 2001 From: ankitm3k Date: Sat, 25 Jan 2025 16:00:11 +0530 Subject: [PATCH 34/35] fix: Fixed concrete_backend_ pointer double free issue on Linux --- onnxruntime/core/providers/openvino/backend_manager.cc | 1 - .../core/providers/openvino/backends/backend_factory.cc | 5 ----- onnxruntime/core/providers/openvino/backends/basic_backend.h | 1 + onnxruntime/core/providers/openvino/ibackend.h | 2 +- .../core/providers/openvino/openvino_provider_factory.cc | 4 ++-- 5 files changed, 4 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 3ae4677a86375..574b4371fee87 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -545,7 +545,6 @@ void BackendManager::Compute(OrtKernelContext* context) { void BackendManager::ShutdownBackendManager() { backend_map_.clear(); - BackendFactory::DestroyBackend(concrete_backend_.get()); concrete_backend_.reset(); } diff --git a/onnxruntime/core/providers/openvino/backends/backend_factory.cc b/onnxruntime/core/providers/openvino/backends/backend_factory.cc index 78c38ba882512..6c1ed9aa42727 100644 --- a/onnxruntime/core/providers/openvino/backends/backend_factory.cc +++ b/onnxruntime/core/providers/openvino/backends/backend_factory.cc @@ -34,10 +34,5 @@ BackendFactory::MakeBackend(std::unique_ptr& model_p } } -void BackendFactory::DestroyBackend(IBackend* backend) { - BasicBackend* backend_ptr = (BasicBackend*)backend; - delete backend_ptr; - backend_ptr = nullptr; -} } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 22bcc4c1da40e..2690b84cb432f 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -37,6 +37,7 @@ class 
BasicBackend : public IBackend { ptr_stream_t& model_stream); void Infer(OrtKernelContext* context) override; + ~BasicBackend() override = default; ov::CompiledModel& GetOVCompiledModel() override { return exe_network_.Get(); } diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 2e01dc00faa6a..04d1f52cbf834 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -16,6 +16,7 @@ class IBackend { public: virtual void Infer(OrtKernelContext* context) = 0; virtual ov::CompiledModel& GetOVCompiledModel() = 0; + virtual ~IBackend() = default; }; using ptr_stream_t = std::unique_ptr; class BackendFactory { @@ -26,7 +27,6 @@ class BackendFactory { const SubGraphContext& subgraph_context, SharedContext& shared_context, ptr_stream_t& model_stream); - static void DestroyBackend(IBackend* backend); }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index aec02bbc8be7b..40843be978d90 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -22,7 +22,7 @@ void ParseConfigOptions(ProviderInfo& pi, const ConfigOptions& config_options) { pi.so_context_file_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); } -void* ParseUint64(const ProviderOptions& provider_options, std::string option_name) { +void* ParseUint64(const ProviderOptions& provider_options, [[maybe_unused]] std::string option_name) { if (provider_options.contains("context")) { uint64_t number = std::strtoull(provider_options.at("context").data(), nullptr, 16); return reinterpret_cast(number); @@ -156,7 +156,7 @@ std::string ParsePrecision(const ProviderOptions& provider_options, std::string& return helper[device_type].first; } -void 
ParseProviderOptions(ProviderInfo& result, const ProviderOptions& config_options) {} +void ParseProviderOptions([[maybe_unused]] ProviderInfo& result, [[maybe_unused]] const ProviderOptions& config_options) {} struct OpenVINOProviderFactory : IExecutionProviderFactory { OpenVINOProviderFactory(ProviderInfo provider_info, SharedContext& shared_context) From 7179a0ba53d60fd98fe8e7d3e31dfbbdab3362d9 Mon Sep 17 00:00:00 2001 From: Preetha Veeramalai Date: Mon, 27 Jan 2025 06:10:37 -0800 Subject: [PATCH 35/35] Preetha/weight sharing fix (#545) * Move variables from subgraph to session context for model specific properties * Fix for redundant subgraph creation * Remove unused variable --- .../providers/openvino/backend_manager.cc | 8 ----- .../core/providers/openvino/backend_utils.cc | 3 +- .../core/providers/openvino/backend_utils.h | 1 - .../openvino/backends/basic_backend.cc | 6 ++-- .../core/providers/openvino/contexts.h | 5 ++- .../openvino/openvino_execution_provider.cc | 4 +-- .../openvino/openvino_execution_provider.h | 2 +- .../openvino/ov_versions/capability.cc | 36 ++++++------------- 8 files changed, 20 insertions(+), 45 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 574b4371fee87..3740fdc239aea 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -61,14 +61,6 @@ BackendManager::BackendManager(SessionContext& session_context, return ""; }(subgraph); - openvino_ep::GetCapability obj(ep_ctx_handle_, - subgraph, - session_context_.device_type, - session_context_.enable_qdq_optimizer); - std::ignore = obj.Execute(); - subgraph_context_.is_wholly_supported_graph = obj.IsWhollySupportedGraph(); - subgraph_context_.has_external_weights = obj.HasExternalWeights(); - // Save the indexes of graph inputs among fused_node's inputDefs // (which also contains initializers). 
for (uint32_t index = 0; const auto& node : subgraph.GetInputs()) { diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index dfc094267f905..5322008905c0d 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -123,7 +123,6 @@ bool IsCILogEnabled() { std::shared_ptr CreateOVModel(const std::string model, const SessionContext& session_context, - const SubGraphContext& subgraph_context, std::map>& const_outputs_map) { if (IsCILogEnabled()) { std::cout << "CreateNgraphFunc" << std::endl; @@ -132,7 +131,7 @@ CreateOVModel(const std::string model, auto ov_model = OVCore::ReadModel(model, session_context.onnx_model_path_name.string()); // Check for Constant Folding - if ((session_context.device_type != "NPU") && !subgraph_context.is_wholly_supported_graph) { + if ((session_context.device_type != "NPU") && !session_context.is_wholly_supported_graph) { ov::pass::ConstantFolding pass_const_obj; pass_const_obj.run_on_model(ov_model); auto& results = const_cast(ov_model.get()->get_results()); diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 06fdfe9cd5eca..a4e6fc0828f79 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -64,7 +64,6 @@ void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor, std::shared_ptr CreateOVModel(const std::string model, const SessionContext& session_context, - const SubGraphContext& subgraph_context, std::map>& const_outputs_map); void CreateOVTensors(const std::string& device_name, diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index a730c0b59628b..4d294a298fdf5 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ 
b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -75,7 +75,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr if (!subgraph_context.has_dynamic_input_shape) { delete model_proto.release(); } - ov_model = CreateOVModel(model, session_context_, subgraph_context_, const_outputs_map_); + ov_model = CreateOVModel(model, session_context_, const_outputs_map_); } LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled"; exe_network_ = OVCore::CompileModel( @@ -93,7 +93,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr device_config, subgraph_context_.subgraph_name); model_stream.reset(); // Delete stream after it is no longer needed - } else if (!subgraph_context_.has_external_weights && + } else if (!session_context_.has_external_weights && !subgraph_context_.has_dynamic_input_shape && !session_context_.so_context_enable && auto_unified_compile) { @@ -114,7 +114,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr if (!subgraph_context.has_dynamic_input_shape) { delete model_proto.release(); } - ov_model = CreateOVModel(model, session_context_, subgraph_context_, const_outputs_map_); + ov_model = CreateOVModel(model, session_context_, const_outputs_map_); } exe_network_ = OVCore::CompileModel( ov_model, hw_target, device_config, subgraph_context_.subgraph_name); diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index a0462e5be35f3..3b9da726822d5 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -101,10 +101,11 @@ struct ProviderInfo { // Holds context applicable to the entire EP instance. 
struct SessionContext : ProviderInfo { SessionContext(const ProviderInfo& info) : ProviderInfo{info} {} - std::vector deviceAvailableList = {true, true, true, true, true, true, true, true}; std::filesystem::path onnx_model_path_name; uint32_t onnx_opset_version{0}; + mutable bool is_wholly_supported_graph = false; // Mutable so the const GetCapability() can update it + mutable bool has_external_weights = false; // Mutable so the const GetCapability() can update it const std::vector OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; const std::string openvino_sdk_version = std::to_string(OPENVINO_VERSION_MAJOR) + "." + std::to_string(OPENVINO_VERSION_MINOR); }; @@ -120,8 +121,6 @@ struct SubGraphContext { std::string subgraph_name; string_index_map_t input_names; string_index_map_t output_names; - bool is_wholly_supported_graph = false; - bool has_external_weights = false; std::string model_precision; bool is_ep_ctx_graph = false; }; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 68ee37097cc84..7bd50e71935a8 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -169,13 +169,13 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, if (!(GetEnvironmentVar("ORT_OPENVINO_ENABLE_CI_LOG").empty())) { std::cout << "In the OpenVINO EP" << std::endl; } - openvino_ep::GetCapability obj(ep_ctx_handle_, graph_viewer, session_context_.device_type, session_context_.enable_qdq_optimizer); result = obj.Execute(); - + session_context_.is_wholly_supported_graph = obj.IsWhollySupportedGraph(); + session_context_.has_external_weights = obj.HasExternalWeights(); return result; } diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 
75f4ef9f8ecc8..1ce9f83fd78a8 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -50,7 +50,7 @@ class OpenVINOExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/) const override; Status Compile(const std::vector& fused_nodes, std::vector& node_compute_funcs) override; diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index cb538c84441fa..23cd7de6e84ba 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -74,26 +74,6 @@ std::vector> GetCapability::Execute() { // Check for EpContext nodes const auto& nodes = graph_viewer_.GetNodesInTopologicalOrder(); - for (const auto node_index : nodes) { - const auto& node = *graph_viewer_.GetNode(node_index); - if (ep_ctx_handler_.CheckForOVEPCtxNode(node)) { - std::vector inputs; - std::vector outputs; - - Iterable2String(inputs, node.InputDefs()); - Iterable2String(outputs, node.OutputDefs()); - - auto sub_graph = IndexedSubGraph::Create(); - sub_graph->Nodes().push_back(node_index); - auto meta_def = IndexedSubGraph_MetaDef::Create(); - meta_def->name() = node.Name(); - meta_def->domain() = kMSDomain; - meta_def->inputs() = inputs; - meta_def->outputs() = outputs; - sub_graph->SetMetaDef(std::move(meta_def)); - result.push_back(ComputeCapability::Create(std::move(sub_graph))); - } - } // If all the nodes have been accounted for then no more processing is needed if (result.size() == nodes.size()) { @@ -109,8 +89,8 @@ std::vector> GetCapability::Execute() { if (openvino_ep::backend_utils::IsDebugEnabled()) { std::cout << "No of unsupported nodes " << unsupported_nodes.size() << std::endl; for 
(size_t i = 0; i < unsupported_nodes.size(); i++) { - const Node* node = graph_viewer_.GetNode(unsupported_nodes[i]); - std::cout << "Unsupported node op " << node->OpType() << std::endl; + const Node* unode = graph_viewer_.GetNode(unsupported_nodes[i]); + std::cout << "Unsupported node op " << unode->OpType() << std::endl; } } #endif @@ -190,9 +170,16 @@ std::vector> GetCapability::Execute() { int no_of_clusters = 0; for (auto this_cluster : connected_clusters) { - // If subgraph has less then three, graph is considered trivial + + // If the subgraph has fewer than three nodes, it is considered trivial unless it is an EPContext cluster if (this_cluster.size() < 3) { - continue; + bool is_epctx_node = false; + for (auto node_idx : this_cluster) { + if (graph_viewer_.GetNode(node_idx)->OpType() == "EPContext") + is_epctx_node = true; + } + if (!is_epctx_node) + continue; } std::vector cluster_graph_inputs, cluster_inputs, cluster_outputs; @@ -245,7 +232,6 @@ std::vector> GetCapability::Execute() { } LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Supported subgraphs on OpenVINO: " << no_of_clusters; } - return result; }